### additional R functions # see more in http://genomics.lbl.gov/supplemental/DvHtranscripts2011/utils.R by Morgan Price andNoNA <- function(x) { ifelse(is.na(x),FALSE,x) } orNA <- function(x) { ifelse(is.na(x),TRUE,x) } without <- function(list,columns=list$without) { list2 <- list; for (i in columns) { list2[[i]] <- NULL; } return(list2); } withoutNA <- function(x) { x[!is.na(x)]; } # given data that is subgrouped by the same markers, do findNearby on each and merge the results findNearbyGrouped = function(splitx, splity, by.x, by.y=by.x, within=1) { out = NULL; for(i in names(splitx)) { if (!is.null(splity[[i]])) { rows = findNearby(splitx[[i]], splity[[i]], by.x, by.y, within); if(nrow(rows) > 0) { out = if(is.null(out)) rows else rbind(out,rows); } } } return(out); } findNearbyByLoc = function(x, y, ...) { return( findNearbyGrouped(split(x, x[,c("scaffoldId","strand")]), split(without(y,c("scaffoldId","strand")), y[,c("scaffoldId","strand")]), ...)); } # returns a matrix with columns as -span to span and rows as # the centers of the adjacent items embedCircular = function(series, span=5) { n = length(series); if (span < 0) stop("embedCircular requires nonnegative span"); if (span >= n-1) stop("span too high in embedCircular"); d = sapply(-span:span, function(off) series[if(off>0) c((off+1):n, 1:off) else if (off<0) c((n+off+1):n,1:(n+off)) else 1:n]); return(d); } # note rnafp is sorted by scaffoldId, strand, start, b/c of how merge was done above rnafpeak = function(d, span=25, limit=25) { centeri = which((-span:span) == 0); apply(embedCircular(1:nrow(d),span), 1, function(i) { # i is indices near our target index midi = i[centeri]; center = d$start[midi]; i = i[abs(d$start[i] - center) < limit]; return(which(i==midi) == which.max(d$n[i])); }); } # this is the new version with n instead of ntot # span2 is to avoid reporting multiple local maxima peaksByStrandScaffold = function(data, name="normll", thresh=0.6, sz=50, span2=21, circular=T) { if(is.null(data[[name]])) fail("bad name"); data$index = 1:nrow(data); data$up = FALSE; data$down = FALSE; data$r = NA; d = split(data,data[,c("strand","scaffoldId")]); if(is.null(data)) fail("bad data"); for(i in names(d)) { v = d[[i]]; v = v[order(v$end),]; v = v[!is.na(v[[name]]),]; r = fastlocalR(v[[name]],circular=circular,sz=sz); cat(nrow(v), length(r),"\n"); data$r[v$index] = r; sign = if(v$strand[1] == "+") 1 else -1; data$up[v$index] = peaks(sign*r, span=span2) & sign*r >= thresh; data$down[v$index] = peaks(-sign*r, span=span2) & -sign*r >= thresh; } return(data[data$up | data$down,]); } # like a filter() -- a fast estimate to local correlation of data to rep(0, sz), rep(1, sz) # not sure why not exactly the same (numerical problem?), but very close in practice fastlocalR = function(data, sz=50, circular=T) { f = c(rep(0,sz),rep(1,sz)); f = (f-mean(f))/sd(f); xy = filter(data,f,circular=circular); meanx = filter(data,rep(1,2*sz)/(2*sz),circular=circular); meanxx = filter(data**2,rep(1,2*sz)/(2*sz),circular=circular) return( -xy/(sqrt(meanxx - meanx**2)*2*sz) ); } # local peak-finding; use -argument to get minima peaks = function(series,span=3) { if (span %% 2 != 1) stop("span must be odd in peaks() -- ", span); z <- embed(series, span); s <- span%/%2; v<- max.col(z) == 1 + s; return(c(rep(FALSE,s),v,rep(FALSE,s))); } # requires a "sorted" data frame with scaffoldId, strand, end, norm (or other data name), # and index that matches the values in indexes # sorted must be sorted by scaffold, strand, and end SideAverages = function(sorted, indexes, off1, off2, minpoints=10, nt=250, name="norm") { sorted$index2 = 1:nrow(sorted); sorted2 = sorted[sorted$index %in% indexes,]; return( sapply(indexes, function(index) { row = sorted2[sorted2$index == index,]; if (is.null(row$scaffoldId)) stop("Invalid index ",index); i = (row$index2+off1):(row$index2+off2); i = i[i >= 1 & i <= nrow(sorted)]; i = i[sorted$scaffoldId[i] == row$scaffoldId & sorted$strand[i] == row$strand & abs(sorted$end[i] - row$end) <= nt & !is.na(sorted[i,name])]; if (length(i) < minpoints) return(NA); return(mean(sorted[i,name])); }) ); } # given boolean input, aggregate regions with T or F; returns dataframe with indices ifirst, ilast trueRegions = function(b) { len = length(b); ifirst = (2:len)[b[2:len] & !b[1:(len-1)]]; if (b[1]) { ifirst = c(1, ifirst); } ilast = (1:(len-1))[b[1:(len-1)] & !b[2:len]]; if (b[len]) { ilast = c(ilast, len); } if (length(ifirst) < 1 || length(ilast) < 1) return(NULL); return(data.frame(ifirst=ifirst[1:length(ilast)], ilast=ilast)); } # given a frame with begin, end, normll, for a single strand and scaffold, # smooth by the indicated number of adjacent probes and then # identify regions that are above the threshold onRegions = function(d, name, thresh=median(d[[name]]), smooth=40) { d = d[!is.na(d[[name]]),]; d = d[order(d$end),]; d$smooth = filter(d[[name]], rep(1,smooth)/smooth, circular=T); d$mid = (d$begin+d$end)/2; r = trueRegions(d$smooth > thresh); # and compute the midpoint and the average for each region r$begin = d$mid[r$ifirst]; r$end = d$mid[r$ilast]; r$avg = sapply(1:nrow(r), function(x) mean(d[r$ifirst[x]:r$ilast[x],name])); return(r[,c("begin","end","avg")]); } # run onRegions separately for each strand and scaffold and combine the results onRegionsSS = function(d, name="normll", ...) { s = split(d, d[,c("strand","scaffoldId")]); out = NULL; for (i in names(s)) { rows = onRegions(s[[i]], name, ...); rows = cbind(scaffoldId = s[[i]]$scaffoldId[1], strand = s[[i]]$strand[1], rows); out = if(is.null(out)) rows else rbind(out, rows); } return(out); } # assumes that the data table includes scaffoldId, strand, and mid, is sorted by those # assumes that the ranges table includes scaffoldId, strand, begin, end fetchValuesForRanges = function(data, ranges, FUN = mean, name="norm", begin="begin", end="end", debug = FALSE, ...) { if (is.null(data[[name]])) stop("No data column named",norm); if (is.null(data$mid)) stop("No data column named mid"); r = ranges; r$i = 1:nrow(r); r$b = pmin(r[[begin]],r[[end]]); r$e = pmax(r[[begin]],r[[end]]); rSplit = split(r, r[,c("scaffoldId","strand")], drop=T); indexes = split(1:nrow(data), data[,c("scaffoldId","strand")]); ranges$i1 = NA; ranges$i2 = NA; ranges$value = NA; iSplit = list(); for (n in names(rSplit)) { r2 = rSplit[[n]]; dataindex = indexes[[n]]; if(is.null(dataindex)) stop("No data for",n); bi = findInterval(r2$b, data$mid[dataindex], all.inside=T); ei = findInterval(r2$e, data$mid[dataindex], all.inside=T); # note drop above to ensure this does not get invoked for (row in 1:nrow(r2)) { i = r2$i[row]; ranges$i1[i] = dataindex[bi[row]]; ranges$i2[i] = dataindex[ei[row]]; if(debug) cat("Name",n,"Range",r2$b[row],r2$e[row],"becomes",ranges$i1[i],ranges$i2[i],"name",name,"\n"); ranges$value[i] = FUN(data[dataindex[bi[row]:ei[row]],name], ...); } } return(ranges); } BinnedBinaryFit2 <- function(xT, xF, gSize=NULL, gSep=NULL, xout=c(xT,xF), fT=length(xT)/(length(xT)+length(xF)), # to compute abcd pActual=.5, a=NULL,b=NULL,c=NULL,d=NULL, smooth=TRUE,span=NULL, ...) { x <- c(xT,xF); y <- ( 1:length(x) <= length(xT) ); if (!is.null(a)) { fT <- NULL; } # don't override abcd if passed in return(BinnedBinaryFit(x,y, gSize=gSize, gSep=gSep, xout=xout, fT=fT, pActual=pActual, a=a, b=b, c=c, d=d, smooth=smooth, span=NULL, ...)); } BinnedBinaryFit <- function(x,y,gSize=NULL,gSep=NULL,xout=x,fT=NULL,pActual=.5,a=0,b=1,c=1,d=0, alpha=2,beta=2,debug=FALSE, plot=FALSE,log="",ylim=NULL,smooth=TRUE,span=NULL,...) { xsave <- xout; nTrueNA <- countif(is.na(x) & y); nFalseNA <- countif(is.na(x) & !y); # xsave will handle NAs y <- y[!is.na(x)]; x <- x[!is.na(x)]; if (!is.null(fT)) { # notice this is a no-op if fT==pActual ratio <- (pActual/(1-pActual)) * (1-fT)/fT; a = 0; b <- 1; c <- ratio; d <- 1-c; if(debug) cat("a b c d",a,b,c,d,"\n"); } if (is.null(gSize)) { gSize <- 2 * (length(x) %/% 40); gSize <- pmin(200,pmax(100,gSize)); if (length(x) < 100) { warning("BinnedBinaryFit on under 100 items, using a single bin"); nT <- countif(y); nF <- if(length(y)>0) countif(!y) else 0; p <- pseudocountLL(nT,nF,a,b,c,d,alpha=alpha,beta=beta,debug=debug)$p; return(list(xSorted=x,groups=data.frame(min=-Inf,med=0,max=Inf,p=p,nT=nT,nF=nF), gSize=length(x),gSep=0,a=a,b=b,c=c,d=d,smooth=smooth,span=span, pNA=pseudocountLL(nTrueNA, nFalseNA,a,b,c,d, alpha=alpha,beta=beta,debug=debug)$p, p=rep(p,length(xsave)), index=rep(1,length(xsave)))); } if (length(x) <= 2*gSize) { gSize <- length(x) %/% 3; warning("BinnedBinaryFit on only ",length(x)," items, reset gSize to ",gSize); } } if (is.null(gSep)) gSep <- gSize %/% 2; orderX <- order(x); xSorted <- x[orderX]; nG <- ((length(x)-gSize) %/% gSep); gBeg <- 1 + gSep*(0:(nG-1)); gEnd <- gBeg + gSize-1; gEnd[length(gEnd)] <- length(x); # the last group includes up to an extra gSep gMin <- xSorted[gBeg]; gMax <- xSorted[gEnd]; nT <- nF <- p <- gMed <- p <- 0*(1:length(gBeg)); for (i in 1:length(p)) { if(debug) cat(i, gMin[i], gMax[i], gBeg[i], gEnd[i], length(xSorted),"\n"); # use all values between min,max in case of a large group with the same sep gx <- select(x,x >= gMin[i] & x <= gMax[i]); gy <- select(y,x >= gMin[i] & x <= gMax[i]); gMed[i] <- median(gx); nT[i] <- countif(gy); nF[i] <- length(gy) - nT[i]; if(debug) cat("", nT[i], nF[i], gMed[i], "\n"); pLL <- pseudocountLL(nT[i],nF[i],a,b,c,d,alpha=alpha,beta=beta,debug=debug); if (pLL$code > 2) { cat("Warning: pseudocount failed","nT",nT[i],"nF",nF[i], "a",a,"b",b,"c",c,"d",d,"code",pLL$code,"p",pLL$p,"\n"); } p[i] <- pLL$p; if(debug) { cat("group",i,"min",gMin[i],"max",gMax[i], "n",length(gx),"nT",nT[i],"nF",nF[i],"p",p[i],"\n"); } } if (plot) { if (is.null(ylim)) { ylim <- (if(log=="y"||log=="xy") c(.001+min(p),0.5+max(c(nT,nF))) else c(0,1)); } plot(gMed,p,log=log,type="l", ylim=ylim, ...); for(i in 1:length(gMin)) { lines(c(gMin[i], gMax[i]),c(p[i],p[i]),lty=1,col="grey"); } if (log=="y"||log=="xy") { points(gMed,.5+nT,type="l",col=3); points(gMed,.5+nF,type="l",col=2); } } bbf <- list(xSorted=xSorted,groups=data.frame(min=gMin,med=gMed,max=gMax,p=p,nT=nT,nF=nF), gSize=gSize,gSep=gSep,a=a,b=b,c=c,d=d,smooth=smooth,span=span, pNA=pseudocountLL(nTrueNA, nFalseNA,a,b,c,d,alpha=alpha,beta=beta,debug=debug)$p); if (length(unique(x)) < 2) smooth = FALSE; if (smooth == "groups") { bbf$groups$pRough = bbf$groups$p; x = rank(bbf$groups$med); y = logodds(bbf$groups$pRough); odds = loess(y ~ x, bbf$groups)$fitted; bbf$groups$p = logodds2p(odds); } if (smooth == TRUE) { out <- BBFPredict(xsave,bbf); lo <- logodds(out$p); bbf$loess <- loess(lo~rank(xsave),span=if(is.null(span)) 0.5 else span); } out <- BBFPredict(xsave,bbf); bbf$index <- out$index; bbf$p <- out$p; return(bbf); } BBFPredict <- function(xeval, bbf) { xeval2 <- xeval[!is.na(xeval)]; indexScaled <- pmax(1,pmin(length(bbf$xSorted),findIntervalAvg(xeval2,bbf$xSorted))); index <- 1 + ( indexScaled - 1 - bbf$gSize/2 )/bbf$gSep; indexMM <- pmin(length(bbf$groups$p),pmax(1,index)); p <- approx(1:length(bbf$groups$p),bbf$groups$p, xout=indexMM)$y; expand <- function(values,default,x) { v <- rep(default,length(x)); v[!is.na(x)] <- values; return(v); } if(!is.null(bbf$loess)) { pRough <- p; limits <- range(bbf$loess$x); indexScaled <- pmin(pmax(indexScaled, limits[1]),limits[2]); p <- logodds2p(predict(bbf$loess,indexScaled)); return(data.frame(index=expand(index,-1,xeval), pRough=expand(pRough,bbf$pNA,xeval), indexScaled=expand(indexScaled,-1,xeval), p=expand(p,bbf$pNA,xeval))); } else { return(data.frame(index=expand(index,-1,xeval), p=expand(p,bbf$pNA,xeval))); } } pseudocountLL <- function(nT,nF,a,b,c,d,alpha=2,beta=2,plot=FALSE,debug=FALSE) { if(debug) cat("pseudocountLL","nT",nT,"nF",nF,"abcd",a,b,c,d,"alphabeta",alpha,beta,"\n"); f <- (nT+1)/(nF+nT+2); naivep <- (c*f-a)/(b-d*f); pStart <- if(naivep > .001 && naivep < .999) naivep else (a+b*.5)/(c+d*.5); # the function we will minimize = -log likelihood func <- function(q) { if (q < -1e-8 || q >= 1 || a+b*q < 1e-8 || c+d*q < 1e-8 || (c-a + (d-b)*q) < 1e-8) { # nlm is smart enough to backtrack from illegal values v <- 1e10; attr(v,"gradient") <- 10; attr(v,"hessian") <- 10; if(debug) { cat("Illegal",q,"\n"); } return(v); } v <- nT*log(a+b*q) + nF*log((c-a)+(d-b)*q) - (nT+nF)*log(c+d*q) + (alpha-1)*log(q) + (beta-1)*log(1-q); gradient <- nT*b/(a+b*q) + nF*(d-b)/(c-a+(d-b)*q) - (nT+nF)*d/(c+d*q) + (alpha-1)/q - (beta-1)/(1-q); hessian <- -nT*b**2/(a+b*q)**2 - nF*(d-b)**2/(c-a+(d-b)*q)**2 + (nT+nF)*d**2/(c+d*q)**2 - (alpha-1)/q**2 - (beta-1)/(1-q)**2; # we want to maximize; nlm minimizes v <- -v; attr(v,"gradient") <- -gradient; attr(v,"hessian") <- -hessian; return(v); } opt <- nlm(func,pStart,hessian=TRUE,check.analyticals=FALSE); if (plot) { cat("Estimate",opt$estimate,"Start",pStart,"MaxLL",-opt$minimum, "Gradient",-opt$gradient,"Hessian",-opt$hessian, "Iter",opt$iter,"Code",opt$code,"\n"); oldpar <- par(no.readonly=TRUE); on.exit(par(oldpar)); par(mfrow=c(1,2)); p=(1:299)/300 dll <- nT*b/(a+b*p) + nF*(d-b)/(c-a+(d-b)*p) - (nT+nF)*d/(c+d*p) + (alpha-1)/p - (beta-1)/(1-p); plot(p,dll,ylim=c(-20,20),type="b"); vline(naivep,lty=1); vline(opt$estimate,lty=1,col="red"); vline(0); vline(1); hline(0,lty=1); plot(p,(a+b*p)/(c+d*p),type="l",ylim=c(0,1),lwd=2); vline(0);vline(1);hline(0);hline(1);vline(naivep,lty=1);hline(f,lty=1);eqline(); vline(opt$estimate,lty=1,col="red"); } else { return(list(p=opt$estimate,code=opt$code,iter=opt$iter,gradient=-opt$gradient)); } } countif <- function(x) { return(sum(ifelse(x,1,0))); } select <- function(X,Y) { return(split(X,Y)$"TRUE"); } logodds <- function(p) { return(log(p/(1-p))); } logodds2p <- function(x) { return( 1/(1+exp(-x)) ); } findIntervalAvg <- function(x,sorted) { index <- findInterval(x,sorted); return( ifelse(x < sorted[1] | x > sorted[length(sorted)-1], index, rank(sorted)[ifelse(index<1,1,index)] ) ); }