# R scripts for "Evidence-based annotation of genes and transcripts # in the sulfate-reducing bacterium Desulfovibrio vulgaris Hildenborough" # # Morgan Price, Arkin group, Lawrence Berkeley National Lab, April 2011 # Tables: # # Annotation related: # dvg -- the original annotation # remove, change, newlist -- the revisions to the annotation # dvcrit -- CRITICA calls # dvrast -- RAST calls # dvEssential -- likely essential genes based on orthologs in G20 and # essential in E. coli or B. subtilis or conserved across genomes and >= 1.5 kB # and not hit in preliminary mutagenesis screen # # Tiling related: # minl -- tiling data for minimal media, including normalization # richl -- tiling data for rich media (average of 2 replicates), including normalization # rich3 -- raw tiling data for rich media, 1 replicate, including control probes # rbreaksNew, minbreaksNew -- sharp rises or drops in rich or minimal tiling data # (from the local correlation method) # genel -- median intensity of "coding" and "non-coding" strands of each gene in rich and minimal media # (using the original annotation) # # 5' RNASeq related: # rnafp -- counts of 5' RNASeq reads at each position # rpeaks -- classification of potential transcript start sites # nprom3 -- 5' UTRs # # Terminator related: # tt -- predicted intrinsic terminators from TransTermHP # ttConfirm -- intrinsic terminators confirmed by the tiling data # ttMatch2 -- 3' UTRs # # Operon related: # predN -- operon pairs with updated predictions # # Proteomics related: # comb -- whole-proteome experiments against the 6 frame translation (in prot_acc, ORFs match DORFs in pcapE) # pcapN -- fractionation and pull-down proteomics experiments # pcapE -- the open reading frames # pcapR -- the peptides in pcapN mapped to the original annotation # pcapP -- the peptides in pcapN mapped to the 6-frame translation # source("Genomics/util/utils.R"); AssignGlobal = function(x) { sapply(names(x), function(n) assign(n, x[[n]], env=.GlobalEnv)); return(NULL); } smoothMin = function(x, width=3) { if(length(x) < width) return(NA); return(min(apply(embed(x, width), 1, mean, na.rm=T), na.rm=T)); } # normalized values in tiling data were computed as follows: # rmodel3 = lm(rich~genomic+nA+nC+nG,richl[richl$code==1,]); # use coding probes only # richl$fit = predict(rmodel3,richl) # richl$norm = richl$rich - richl$fit; richl$norm = richl$norm - median(richl$norm, na.rm=T); # # mmodel3 = lm(min~genomic+nA+nC+nG,minl[minl$code==1,]) # minl$fit = predict(mmodel3,minl); # minl$norm = minl$min - minl$fit; # minl$norm = ifelse(minl$genomic >= quantile(minl$genomic,0.01), minl$norm, NA); # minl$norm = minl$norm - median(minl$norm, na.rm=T); # rbreaksNew and mbreaksNew were computed as follows: # sorted=cbind(richl,index=1:nrow(richl))[order(richl$scaffoldId,richl$strand,richl$begin+richl$end),]; # rbreaksNew = peaksByStrandScaffold(richl, name="norm", thresh=0.8); # rbreaksNew$avgBefore = SideAverages(sorted,rbreaksNew$index,-5,-20); # rbreaksNew$avgAfter = SideAverages(sorted,rbreaksNew$index,5,20); # rbreaksNew$delta = rbreaksNew$avgAfter-rbreaksNew$avgBefore; # rbreaksNew$mid = floor((rbreaksNew$beg+rbreaksNew$end)/2); # rbreaksNew$at = rbreaksNew$mid + ifelse(rbreaksNew$strand=="+",-15,15); # # sorted=cbind(minl,index=1:nrow(minl))[order(minl$scaffoldId,minl$strand,minl$begin+minl$end),]; # mbreaksNew = peaksByStrandScaffold(minl, name="norm", thresh=0.7, span=25); # 1269 down 1592 up # mbreaksNew$avgBefore = SideAverages(sorted,mbreaksNew$index,-5,-25); # mbreaksNew$avgAfter = SideAverages(sorted,mbreaksNew$index,5,25); # mbreaksNew$delta = mbreaksNew$avgAfter-mbreaksNew$avgBefore; # mbreaksNew$mid = floor((mbreaksNew$beg+mbreaksNew$end)/2); # mbreaksNew$at = mbreaksNew$mid + ifelse(mbreaksNew$strand=="+",-15,15); # relies on dvg, remove, change, newlist ComputeNewGenes = function() { gnew = dvg[!dvg$sysName %in% remove$sysName, c("locusId", "sysName","scaffoldId","strand","start","stop","type","desc")] gnew = merge(gnew, list(sysName=change$sysName,off=change$start), all.x=T); gnew$off[is.na(gnew$off)] = 0; gnew$start = gnew$start + ifelse(gnew$strand=="+",1,-1)*3*gnew$off; gnew$off = NULL; newlist2 = newlist; names(newlist2)[names(newlist2)=="dv"] = "sysName"; gnew = rbind(gnew, cbind(newlist2[,c("sysName","scaffoldId","strand","start","stop","desc")],type=1,locusId=NA)); gnew$begin = pmin(gnew$start,gnew$stop); gnew$end = pmax(gnew$start,gnew$stop); gnew$ntlen = gnew$end-gnew$begin+1; gnew = gnew[order(gnew$scaffoldId,gnew$begin),]; gnew$sysName = ifelse(gnew$sysName=="", paste("VIMSS",gnew$locusId,sep=""), as.character(gnew$sysName)); # If pseudogene (type=7) and protein-coding gene (type=1) have matching start or stop, then remove pseudogene badpseudo = merge(gnew[gnew$type==7,], gnew[gnew$type==1,], by=c("scaffoldId","strand","start")); badpseudo2 = merge(gnew[gnew$type==7,], gnew[gnew$type==1,], by=c("scaffoldId","strand","stop")); gnew = gnew[!gnew$locusId %in% c(badpseudo$locusId.x,badpseudo2$locusId.x),]; AssignGlobal(list(gnew=gnew)); } # minimum log odds for promoters used in downstream analyses -- # chosen to give a reasonable false positive rate of 2.8% minlo = 4; # nprom -- promoters for (new) genes # relies on newg, rpeaks, minl, richl ComputeNProm = function() { nprom = findAfterSS(rpeaks[rpeaks$lo > minlo,], gnew, "start", "start") names(nprom)[names(nprom)=="start"] = "prom"; # promoter location names(nprom)[names(nprom)=="start.1"] = "start"; # gene start nprom$at = nprom$prom + ifelse(nprom$strand=="+",1,-1)*30; nprom$updist = abs(nprom$start-nprom$prom); nprom$min5 = fetchValuesForRanges(minl, nprom, begin="at", end="stop", smoothMin, width=5)$value; nprom$rich5 = fetchValuesForRanges(richl, nprom, begin="at", end="stop", smoothMin, width=5)$value; d = aggregate(nprom$updist, nprom[,c("scaffoldId","strand","prom")], min); names(d)[4] = "updist"; nprom2 = merge(nprom, d); nprom3 = nprom2[nprom2$updist <= 30 | andNoNA(nprom2$rich5 > 0) | andNoNA(nprom2$min5) > 0,]; d = findWithinByLoc(nprom3[,c("scaffoldId","strand","prom")], gnew[,c("scaffoldId","strand","locusId","begin","end")], "prom", "begin", "end"); d$genic = TRUE; names(d)[names(d)=="locusId"] = "locusWithin"; nprom3 = merge(nprom3,without(d, c("begin","end")),all.x=T); nprom3$locusWithin[andNoNA(nprom3$locusWithin==nprom3$locusId)] = NA; nprom3$genic = !is.na(nprom3$locusWithin); AssignGlobal(list(nprom=nprom, nprom2=nprom2, nprom3=nprom3)); } # Computes ttMatch, ttDropped, ttMatch2 using gnew and ttConfirm ComputeTermMatch = function() { # confirmed terminators after genes, filtering out cases with intermediate drop in expression if too far downstream, # and ignoring the handful of ones within genes ttMatch = findAfterSS(gnew, ttConfirm, "stop", "stop"); d = findWithinByLoc(ttConfirm, without(gnew,c("start","stop","type","desc")), "stop", "begin", "end") # 20 rows, 10 distinct terminators ttMatch = ttMatch[!ttMatch$stop.1 %in% d$stop,]; ttMatch$dndist = (ttMatch$stop.1-ttMatch$stop) * ifelse(ttMatch$strand=="+",1,-1); # ignore drops in signal within 30nt of the start of the terminator, as these might be due to 2ndary structure of terminator itself ttMatch$start.2 = ttMatch$start.1 - ifelse(ttMatch$strand=="+", 1, -1) * 30; ttMatch$min5 = fetchValuesForRanges(minl, ttMatch, begin="stop", end="start.2", smoothMin, width=5)$value; ttMatch$rich5 = fetchValuesForRanges(richl, ttMatch, begin="stop", end="start.2", smoothMin, width=5)$value; ttMatch$avg5 = (ttMatch$min5+ttMatch$rich5)/2; ttkeep = ttMatch$dndist < 100 | andNoNA(ttMatch$avg5 >= 0); ttDropped = ttMatch[!ttkeep,]; ttMatch = ttMatch[ttkeep,]; d = merge(aggregate(list(min=ttMatch$dndist), list(sysName=ttMatch$sysName), min), aggregate(list(max=ttMatch$dndist), list(sysName=ttMatch$sysName), max)); # only the 1st terminator downstream of each gene ttMatch2 = merge(ttMatch, d, by.x=c("sysName","dndist"), by.y=c("sysName","min")); ttMatch2 = ttMatch2[!ttMatch2$sysName=="DVU1532",]; # wierd terminator near 3' end of downstream gene, not sure what is going on AssignGlobal(list(ttMatch=ttMatch, ttDropped=ttDropped, ttMatch2=ttMatch2)); } # Used to find local peaks. In case of ties, only the first gets the peak flag set # input must be sorted by location and all on one scaffold/strand # span is how many rows to look within and limit is how many nucleotides to look within rnafpeak = function(d, span=25, limit=25) { centeri = which((-span:span) == 0); apply(embedCircular(1:nrow(d),span), 1, function(i) { # i is indices near our target index midi = i[centeri]; center = d$start[midi]; i = i[abs(d$start[i] - center) < limit]; return(which(i==midi) == which.max(d$ntot[i])); }); } # Given 5' RNASeq data in rnafp, adds on the match to rich and minimal tiling data and to # sequence analysis, and trains the classifier. # Saves rpeaks ComputePeakData = function() { rnafp = rnafp[order(rnafp$scaffoldId, rnafp$strand, ifelse(rnafp$strand=="+",1,-1)*rnafp$start),]; rnafp$peakNew = unsplit(lapply(split(rnafp, rnafp[,c("scaffoldId","strand")]), rnafpeak, span=25, limit=25), rnafp[,c("scaffoldId","strand")]); sum(rnafp$peakNew & rnafp$nf >= 10 & rnafp$nppp >= 10) # 13822 sum(rnafp$peak & rnafp$nf >= 10 & rnafp$nppp >= 10) # 11936 # hmm, number rose, prob. b/c of less masking rpeaks = rnafp[rnafp$peakNew & rnafp$nppp >= 10 & rnafp$nf >= 10,]; rpeaks$peak = NULL; rpeaks$peakNew = NULL; rpeaks = AddTilingAndBits(rpeaks); bbfRich = BinnedBinaryFit2(abs(rpeaks$rrich)[rpeaks$min>0 & rpeaks$bits>0], abs(rpeaks$rrich)[rpeaks$min==0 & rpeaks$bits==0]); rpeaks$logoddsRich = logodds(BBFPredict(abs(rpeaks$rrich), bbfRich)$p); bbfMin = BinnedBinaryFit2(abs(rpeaks$rmin)[rpeaks$rich>0 & rpeaks$bits>0], abs(rpeaks$rmin)[rpeaks$rich==0 & rpeaks$bits==0]); rpeaks$logoddsMin = logodds(BBFPredict(abs(rpeaks$rmin), bbfMin)$p); b = ifelse(rpeaks$bits==0, NA, rpeaks$bits); bbfBits = BinnedBinaryFit2(b[rpeaks$min>0 & rpeaks$rich>0], b[rpeaks$min==0 & rpeaks$rich==0]); rpeaks$logoddsBits = logodds(BBFPredict(b, bbfBits)$p); lo1 = rpeaks$logoddsBits + rpeaks$logoddsMin + rpeaks$logoddsRich; bbfNTot = BinnedBinaryFit2(log2(rpeaks$ntot)[lo1 >= 4], log2(rpeaks$ntot)[lo1 <= -4]); rpeaks$logoddsNTot = logodds(BBFPredict(log2(rpeaks$ntot), bbfNTot)$p); rpeaks$lo = rpeaks$logoddsRich + rpeaks$logoddsMin + rpeaks$logoddsBits + rpeaks$logoddsNTot; AssignGlobal(list(rpeaks=rpeaks, bbfRich=bbfRich, bbfMin=bbfMin, bbfBits=bbfBits, bbfNTot=bbfNTot)); } AddTilingAndBits = function(d, hits=hitsall2) { drich = findNearbyGrouped(split(d,d[,c("scaffoldId","strand")]), split(without(richprom,c("scaffoldId","strand")), richprom[,c("scaffoldId","strand")]), by.x="start",by.y="at",within=30); drichMax = aggregate(list(ntot=drich$ntot), drich[,c("scaffoldId","strand","at")], max); drich = merge(drich, drichMax); # only keep reads at maximum dmin = findNearbyGrouped(split(d,d[,c("scaffoldId","strand")]), split(without(minprom,c("scaffoldId","strand")), minprom[,c("scaffoldId","strand")]), by.x="start",by.y="at",within=30); dminMax = aggregate(list(ntot=dmin$ntot), dmin[,c("scaffoldId","strand","at")], max); dmin = merge(dmin, dminMax); dSeq = findNearbyGrouped(split(d, d[,c("scaffoldId","strand")]), split(without(hits,c("scaffoldId","strand")), hits[,c("scaffoldId","strand")]), "start", "startm", within=1); d = merge(merge(d, drich,all.x=T), dmin, all.x=T, by=c("scaffoldId","start","strand","nf","nppp","ntot"), suffixes=c("rich","min")); d = merge(d, dSeq, all.x=T); d$c=NULL; d$bits = ifelse(is.na(d$bits),0,d$bits); d$motif = ifelse(is.na(d$motif),-1,d$motif); d$min = ifelse(is.na(d$rmin),0,abs(d$rmin)); d$rich = ifelse(is.na(d$rrich),0,abs(d$rrich)); return(d); } # Uses genel, rpeaks, ttConfirm, and the updates to genes to re-build operon predictions; results in predN ComputeOpUpdate = function() { predN = pred[pred$Gene1 %in% dvg$locusId & pred$Gene2 %in% dvg$locusId,]; predN = pred[!in12(predN, remove), ]; predN = merge(predN, dvg[,c("scaffoldId","locusId","strand")], by.x="Gene1", by.y="locusId"); predN$upg = ifelse(predN$strand=="+",predN$Gene1,predN$Gene2); predN$dng = ifelse(predN$strand=="+",predN$Gene2,predN$Gene1); predN = merge(merge(predN, gnew[,c("locusId","start","stop")], by.x="upg", by.y="locusId"), gnew[,c("locusId","start","stop")], by.x="dng", by.y="locusId", suffixes=c(".up",".dn")); predN$Sep = (predN$start.dn - predN$stop.up) * ifelse(predN$strand=="+",1,-1); # to reflect updated annotation predN$min5 = fetchValuesForRanges(minl, predN, begin="stop.up", end="start.dn", smoothMin, width=5)$value; predN$rich5 = fetchValuesForRanges(richl, predN, begin="stop.up", end="start.dn", smoothMin, width=5)$value; # count number of terminators or confirmed terminators within the intergenic regions d = findWithinGrouped(split(tt, tt[,c("scaffoldId","strand")]), split(predN, predN[,c("scaffoldId","strand")]), "stop", "stop.up", "start.dn", minmax=T); ntt = aggregate(d$stop, d[,c("Gene1","Gene2")], length); names(ntt)[3] = "ntt"; predN = merge(predN, ntt, all.x=T); predN$ntt[is.na(predN$ntt)] = 0; d = findWithinGrouped(split(ttConfirm, ttConfirm[,c("scaffoldId","strand")]), split(predN, predN[,c("scaffoldId","strand")]), "stop", "stop.up", "start.dn", minmax=T); ntt = aggregate(d$stop, d[,c("Gene1","Gene2")], length); names(ntt)[3] = "ttConfirm"; predN = merge(predN, ntt, all.x=T); predN$ttConfirm[is.na(predN$ttConfirm)] = 0; # compare to expression-free predictions rebuilt using this subset of genes and corrected Sep pairsN = pairs[pairs$Gene1 %in% gnew$locusId & pairs$Gene2 %in% gnew$locusId,]; pairsN$upg = ifelse(pairsN$Strand1=="+",pairsN$Gene1,pairsN$Gene2); pairsN$dng = ifelse(pairsN$Strand1=="+",pairsN$Gene2,pairsN$Gene1); pairsN = merge(merge(pairsN, gnew[,c("locusId","start","stop")], by.x="upg", by.y="locusId"), gnew[,c("locusId","start","stop")], by.x="dng", by.y="locusId", suffixes=c(".up",".dn")); pairsN$Sep = (pairsN$start.dn - pairsN$stop.up) * ifelse(pairsN$Strand1=="+",1,-1); # to reflect updated annotation; correct only for same-strand pairs but that is OK predNoExpr = OperonsPredict(pairsN, vars=c("MOGScore","GOScore"))$pred; # will use comparative part only names(predNoExpr)[names(predNoExpr)=="bOp"] = "bOpNoExpr"; names(predNoExpr)[names(predNoExpr)=="pOp"] = "pOpNoExpr"; predN = merge(predN, predNoExpr[,c("Gene1","Gene2","bOpNoExpr","pOpNoExpr")]); # add on gene level information and best promoter (highest lo score) predN = merge(merge(predN, genel, by.x="upg", by.y="locusId", all.x=T), genel, by.x="dng", by.y="locusId", all.x=T, suffixes=c(".up",".dn")); predN$start.up2 = predN$start.up + 10 * ifelse(predN$strand=="+", 1, -1); # ignore 1st 10 nucleotides of upstream gene while looking for internal promoters d = findWithinByLoc(rpeaks, predN[!is.na(predN$start.up2) & !is.na(predN$start.dn),], "start", "start.up2", "start.dn", minmax=T); # 8123 rows, 621 with lo > 0 best = aggregate(list(start=1:nrow(d)), d[,c("scaffoldId","strand","upg","dng")], function(x) d$start[x[which.max(d$lo[x])]]); best = merge(best, d[,c("upg","dng","start","scaffoldId","strand","lo")]); predN = merge(predN, best, all.x=T); predN$diff = pmax(predN$min.c.up,predN$rich.c.up) - pmax(predN$min5,predN$rich5); # drop from upstream gene to intergenic predN$badnop = andNoNA(predN$min5 > 0.25) & andNoNA(predN$rich5 > 0.25) & !predN$bOp & predN$Sep >= 20; predN$code = ifelse(predN$bOp, "op", ifelse(!predN$badnop, "nop_ok", "badnop")); predN$code[andNoNA(predN$diff>1) & andNoNA(pmax(predN$min5,predN$rich5) < 0) & (andNoNA(predN$lo > minlo) | predN$ttConfirm>0)] = "nop_confirm"; predN$code[!andNoNA(predN$lo > minlo) & predN$bOpNoExpr & andNoNA(predN$diff < 1)] = "simple"; predN$internal = andNoNA(predN$lo > minlo) & andNoNA(predN$diff < 1) & predN$bOpNoExpr; predN$code[predN$internal] = "internal"; predN$attenuator = predN$code %in% c("op","simple","internal","badnop") & predN$ttConfirm>0; predN$code2 = ifelse(predN$code == "internal", "internal", ifelse(predN$code %in% c("nop_confirm","nop_ok"), "nop", ifelse(predN$attenuator, "attenuator", "op"))); predN$code3 = ifelse(andNoNA(predN$lo > minlo) & predN$code2=="op", "internal", predN$code2); # the final code AssignGlobal(list(predN=predN,predNoExpr=predNoExpr)); } # relies on: # dvg/gnew # comb, from reanalyzed_peptides.comb with dv added on # pcapR/pcapP # dvg # mnon/rnon # gvrich/gvmin (so that removed have a chance to show up in tiling; other genes are recomputed) CoverageFigure = function(...) { #dom = read.delim("data/Dvul/domains"); # genes with homology to InterPro domains (already loaded) #cog = read.delim("data/Dvul/cogrpsblast"); # genes with PSI-BLAST hits to COGs (already loaded) hom = unique(c(dom$locusId,cog$locusId)); homS = dvg$sysName[dvg$locusId %in% hom]; prot = c(as.character(dvg$sysName[dvg$type==1]), as.character(newlist$dv)); # 1=Family, 2=Other, 3=New, 4=Bad pcode = ifelse(prot %in% newlist$dv, 3, ifelse(prot %in% remove$sysName, 4, ifelse(prot %in% homS, 1,2))); cat("pcode counts", table(pcode),"\n"); gnmin = findWithinByLoc(gnew[,c("scaffoldId","strand","start","stop","locusId","sysName")], mnon, "start", "beg2", "end2") gnmin2 = findWithinByLoc(gnew[,c("scaffoldId","strand","start","stop","locusId","sysName")], mnon, "start", "beg2", "end2", all="y") gnmin = unique(rbind(gnmin,gnmin2)); gnmin = gnmin[gnmin$stop >= gnmin$beg2 & gnmin$stop <= gnmin$end2,]; gnrich = findWithinByLoc(gnew[,c("scaffoldId","strand","start","stop","locusId","sysName")], rnon, "start", "beg2", "end2") gnrich2 = findWithinByLoc(gnew[,c("scaffoldId","strand","start","stop","locusId","sysName")], rnon, "start", "beg2", "end2", all="y") gnrich = unique(rbind(gnrich,gnrich2)); gnrich = gnrich[gnrich$stop >= gnrich$beg2 & gnrich$stop <= gnrich$end2,]; seen = unique(c(gnew$sysName[gnew$locusId %in% pcapR$locusId], pcapP$dv, comb$dv)); seenRNA = c(as.character(gnrich$sysName[gnrich$avg>=0]), as.character(gnmin$sysName[gnmin$avg>=0]), as.character(gvrich$sysName[gvrich$avg>=0]), as.character(gvmin$sysName[gvmin$avg>=0])); counts3 = matrix(c(sapply(split(prot,pcode), function(x) mean(x %in% seen)), sapply(split(prot,pcode), function(x) mean(x %in% seenRNA))), ncol=2); barplot(counts3,beside=T,col=2:5,names=c("Proteomics","Tiling"),ylab="Proportion Expressed", ylim=0:1, ...); legend(1.0,1.0,c("Known Family","No Family","Added","Removed"),col=2:5,pch=15); } # uses rich3 and richl ProbeTypeFigure = function(...) { CompareHistogramList(list(Coding=log2(rich3$SIGNAL_MEAN[rich3$coding]), Antisense=log2(rich3$SIGNAL_MEAN[rich3$anti]), Intergenic=log2(rich3$SIGNAL_MEAN[rich3$interg]), Control=log2(rich3$SIGNAL_MEAN[rich3$control])), breaks=seq(6,16,0.5), xlab="Log2(Raw Intensity)", showCounts=F, ylim=c(0,0.6), ...); text(12, 0.25, sprintf("D(Coding,Anti)\n= %.2f", ks.test(rich3$SIGNAL_MEAN[rich3$coding], rich3$SIGNAL_MEAN[rich3$anti])$statistic)); } ProbeTypeFigure2 = function(data, ...) { CompareHistogramList(list(Coding=data$norm[data$code==1], Antisense=data$norm[data$code==-1], Intergenic=data$norm[data$code==0]), breaks=seq(-6,6,0.5), xlab="Normalized (Log2) Level", showCounts=F, legendX=0.5, ylim=c(0,0.4), ...); # pos=4 means to the right of text(0.5, 0.2, sprintf("D(Coding,Anti)\n= %.2f", ks.test(data$norm[data$code==1], data$norm[data$code==-1])$statistic), pos=4); vline(0); } # relies on genel and dvEssential GeneAvgIntensitiesFigure = function(...) { dom = read.delim("data/Dvul/domains"); cog = read.delim("data/Dvul/cogrpsblast"); hom = unique(c(dom$locusId,cog$locusId)); d = genel[genel$locusId %in% dvg$locusId[dvg$type==1],]; d$code = ifelse(d$locusId %in% dvEssential, 1, ifelse(d$locusId %in% remove$locusId, 3, ifelse(d$locusId %in% hom, 2, 4))); color=c("blue","green","red","darkgrey"); pch=c(1,3,4,2); # these are medians of normalized log2 levels plot(d$sense, d$anti, xlab="Expression of Sense Strand", ylab="Expression of Antisense Strand", col=color[d$code], pch=pch[d$code],...); eqline(); legend(0.75, 5, c("Essential","Known Family","Removed","Other"), col=color, pch=pch); } # relies on rpeaks RiseVs5RNASeqFigure = function(main="") { d = rpeaks[rpeaks$lo > minlo,]; hist((d$midmin-d$start)*ifelse(d$strand=="+",1,-1), xlab="Offset of Rise Relative to 5' RNASeq Peak", main=main); } # relies on rbreaksNew, tt DropVsTerminatorsFigure = function(main="") { dn = rbreaksNew[rbreaksNew$down & abs(rbreaksNew$r) >= 0.8 & andNoNA(abs(rbreaksNew$delta) >= 1),]; dn$checkat = ifelse(dn$strand=="+",dn$end,dn$begin); richtermNew = findNearbyGrouped(split(tt, tt[,c("scaffoldId","strand")]), split(without(dn,c("up","down","strand","scaffoldId","mean","genomic")), dn[,c("scaffoldId","strand")]), "stop", "checkat", within=60); # 676 richtermNew$mid = (richtermNew$begin+richtermNew$end)/2; hist(ifelse(richtermNew$strand=="+",1,-1)* (richtermNew$mid - richtermNew$stop), main=main, xlab="Offset of Drop Relative to Terminator"); } SaveCoverageFigure = function() { postscript("data/Dvul/coverage.eps", width=3.5, height=4, pointsize=10, horiz=F); CoverageFigure(); dev.off(); } BigQualityFigure = function() { par(mfrow=c(2,3)); ProbeTypeFigure(main="A. Raw Tiling Intensities\n(Rich Media, One Array)"); ProbeTypeFigure2(richl, main="B. Normalized Tiling Intensities\n(Rich Media, Average of Two Arrays)"); #ProbeTypeFigure2(richl, main="Normalized Tiling Intensities\n(Minimal Media, One Array)"); GeneAvgIntensitiesFigure(main="C. Normalized Expression Levels\n(Average of Rich & Minimal Tiling Data)"); DropVsTerminatorsFigure(main="D. Tiling Drops vs. Intrinsic Terminators\n(Rich media)"); RiseVs5RNASeqFigure(main="E. Tiling Rises vs. 5' RNASeq Peaks\n(Minimal media)"); CoverageFigure(main="F. Coverage of Tiling and Proteomics"); par(mfrow=c(1,1)); } SaveBigQualityFigure = function() { postscript("data/Dvul/BigQuality.eps", width=7, height=5.5, pointsize=10, horiz=F); BigQualityFigure(); dev.off(); } # uses predN, but does not use code2 (should this be changed?) InternalPromoterFigure = function() { d = predN[andNoNA(predN$lo > minlo) & predN$bOpNoExpr & andNoNA(predN$diff < 1),]; d$intergenic = ifelse(d$strand == "+", d$start > d$stop.up & d$start <= d$start.dn, d$start < d$stop.up & d$start >= d$start.dn); CompareHistogramList(list("Simple operon" = predN$ExprSim[!andNoNA(predN$lo > minlo) & predN$bOpNoExpr & andNoNA(predN$diff < 1)], "Promoter in upstream gene"=d$ExprSim[!d$intergenic], "Internal intergenic promoter"=d$ExprSim[d$intergenic], "Non-operon"=predN$ExprSim[!predN$bOpNoExpr & andNoNA(predN$diff > 1)]), breaks=seq(-1,1,0.2), xlab="Correlation of Expression of Adjacent Genes", ylim=c(0,0.7), showCounts=T, legendX = -1, col=c(1,4,3,2)); } SaveInternalPromoterFigure = function() { postscript("data/Dvul/InternalPromoter.eps", width=3.5, height=4.5, pointsize=9, horiz=F); InternalPromoterFigure(); dev.off(); } SaveProtLenFigure = function(file="data/Dvul/ProtLenFigure.eps") { if (!is.null(file)) postscript(file=file, width=6, height=3.5, pointsize=9, horiz=F); par(mfrow=c(1,2)); added = gnew[gnew$sysName %in% newlist$dv | gnew$sysName %in% change$sysName[change$start==0],]; CompareHistogramList(list(Unchanged=(dvg$ntlen/3 - 1)[!dvg$locusId %in% c(remove$locusId,change$locusId)], Removed=(dvg$ntlen/3 - 1)[dvg$locusId %in% remove$locusId], Added=(added$ntlen/3 - 1)), breaks=seq(0,400,25), xlab="Length in Amino Acids", main="A. Protein Lengths", ylim=c(0,0.5), showCounts=T, legendX=75, lwd=c(1,2,2)); hist(-change$start[change$start!=0], xlab="Change in Length, in Amino Acids", main="B. Changes to Protein Lengths", breaks=20); # plot within plot; rem mar is bottom, left, top, right oldpar = par(fig=c(0.75,0.99,0.55,0.85),new=T,oma=c(0,0,0,0), mar=c(3,2,1,1), mgp=c(2,1,0), cex=0.9); hist(-change$start[abs(change$start) <= 50 & change$start!=0], xlab="Changes up to 50 a.a.", main="", ylab="", breaks=20); box(col="black",lty=1, which="figure"); if (is.null(file)) { par(oldpar); par(mfrow=c(1,1)); } else { dev.off(); } } # relies on nprom3 and ttMatch2 SaveUTRFigure = function() { postscript("data/Dvul/UTRFigure.eps", width=6, height=3.5, pointsize=9, horiz=F); par(mfrow=c(1,2)); CompareHistogramList(split(nprom3$updist, nprom3$genic), breaks=seq(0,700,50), ylim=c(0,0.5), labels=c("Intergenic","Within-gene"), xlab="Length of 5' UTR", showCounts=T, legendX=75); hist(ttMatch2$dndist, xlab=sprintf("Length of 3' UTR (n=%d)", nrow(ttMatch2)), main=""); dev.off(); } RegionPlots = function(at, newnames=NULL, tilingLegendFrac = 0.85, geneTextShift = 0.3, geneLegendX=at[1]+80, geneLegendY = 2, # was 0.25, seqLegendX = at[1]+(at[2]-at[1])*0.8, labelSize=1, showLegend=T, featureLines=T, yaxis=T, offNew=0, gene=NULL, geneAltCodon=NULL, ylimTiling=c(-3,5), geneLabelOff=0.1) { yaxt = if(yaxis) "s" else "t"; at2 = at + c(-1,1)*1000; dm = minl[minl$scaffoldId==1944 & minl$mid >= at2[1] & minl$mid <= at2[2],]; dr = richl[richl$scaffoldId==1944 & richl$mid >= at2[1] & richl$mid <= at2[2],]; d5 = rnafp[rnafp$scaffoldId==1944 & rnafp$start >= at2[1] & rnafp$start <= at2[2],]; d5$lo = d5$start %in% rpeaks$start[rpeaks$scaffoldId==1944 & rpeaks$lo > minlo]; dg = dvg[dvg$scaffoldId==1944 & dvg$end >= at2[1] & dvg$begin <= at2[2],]; dg$sysName = sub("DVU_","",dg$sysName); dn = gnew[gnew$scaffoldId==1944 & gnew$end >= at2[1] & gnew$begin <= at2[2] & !gnew$locusId %in% dvg$locusId,]; if(!is.null(newnames)) { newnames = data.frame(sysName=names(newnames), label=c(newnames,recursive=T)); dn = merge(dn, newnames, all.x=T); dn$label = as.character(dn$label); dn$label[is.na(dn$label)] = dn$sysName; } else { dn$label = dn$sysName; } if (is.null(gene)) { genelines = NULL; ischanged = FALSE; } else { genelines = dvg[dvg$sysName %in% gene,]; geneends = c(genelines$start, genelines$stop); ischanged = genelines$locusId %in% change$locusId; } # dc not handled yet dc = gnew[gnew$scaffoldId==1944 & gnew$end >= at2[1] & gnew$begin <= at2[2] & gnew$locusId %in% change$locusId,]; dtt = tt[tt$scaffoldId==1944 & tt$start >= at2[1] & tt$start <= at2[2],]; plot(dr$mid, dr$norm, ylab=if(yaxis) "Rich Tiling\nNormalized Log2 Intensity" else "", xlab="", col=ifelse(dr$strand=="+",2,3), xlim=at, xaxt="n", pch=as.character(dr$strand), yaxt=yaxt, ylim=ylimTiling); hline(0, lty=1, col=1); if (featureLines) { sapply(d5$start[d5$lo], vline, lty=1); sapply(intersect(dtt$stop, ttConfirm$stop[ttConfirm$scaffoldId==1944]), vline, lty=1); } if (!is.null(genelines)) sapply(geneends, vline, col="darkgrey", lty=2); plot(dm$mid, dm$norm, xaxt="n",xlab="", ylab=if(yaxis) "Minimal Tiling\nNormalized Log2 Intensity" else "", col=ifelse(dm$strand=="+",2,3), pch=as.character(dm$strand), xlim=at, yaxt=yaxt, ylim=ylimTiling); hline(0, lty=1, col=1); if(showLegend) legend(at[1] + tilingLegendFrac*(at[2]-at[1]), max(dm$norm,na.rm=T), c("Plus strand","Minus strand"), col=c(2,3), pch=c("+","-")); if(featureLines) { sapply(d5$start[d5$lo], vline, lty=1); sapply(intersect(dtt$stop, ttConfirm$stop[ttConfirm$scaffoldId==1944]), vline, lty=1); } if (!is.null(genelines)) sapply(geneends, vline, col="darkgrey", lty=2); plot(d5$start, d5$ntot, log="y", xaxt="n", pch=ifelse(d5$lo,1,NA), col=ifelse(d5$strand=="+",2,3), xlim=at, ylab=if(yaxis) "5' RNASeq\nNumber of Reads" else "", yaxt=yaxt, ylim=c(5, 10000)); segments(d5$start, 1, d5$start, d5$ntot, col=ifelse(d5$strand=="+",2,3), lty=1); if (!is.null(genelines)) sapply(geneends, vline, col="darkgrey", lty=2); if(showLegend) legend(seqLegendX, max(d5$ntot), c("Starts:", "High-confidence", "Other"), pch=c(NA,1,NA), lty=c(NA,1,1), col=c(NA,1,1)); plot(at, c(-1,1), pch="", xlim=at, ylim=c(-1.75,1.75), xlab="", ylab=if(yaxis) "Genes & Terminators" else "", yaxt="n"); # xlab does not work, not sure why if (nrow(dg) > 0) { dashlist = if(ischanged) gene else NULL; arrows(dg$start, ifelse(dg$strand=="+",1,-1), dg$stop, ifelse(dg$strand=="+",1,-1), angle=30, code=2, col=ifelse(dg$locusId %in% remove$locusId, "darkgrey", "blue"), length=0.1, lty=ifelse(dg$sysName %in% dashlist, 3, 1), lwd=2); # lwd=ifelse(dg$locusId %in% hom, 3, 1) dg$above = ifelse(1:nrow(dg) %% 2 == 0, 1, -1) * geneTextShift; dg$labelx = (dg$begin+dg$end)/2; # where the label goes # avoid partly-off-screen labels, especially for target gene gene2 = c(as.character(newnames$sysName),gene); if (any(dg$sysName %in% gene2)) { cat(intersect(dg$sysName, gene2), geneLabelOff,"\n"); dg$labelx[dg$sysName %in% gene2 & dg$labelx < at[1]] = at[1]+geneLabelOff*diff(at); dg$labelx[dg$sysName %in% gene2 & dg$labelx > at[2]] = at[1]+(1-geneLabelOff)*diff(at); cat(dg$labelx[dg$sysName %in% gene2],"\n"); } dg2 = dg[dg$labelx >= at[1] & dg$labelx <= at[2],]; text(dg2$labelx, ifelse(dg2$strand=="+",1,-1) + dg2$above, dg2$sysName, col=ifelse(dg2$locusId %in% remove$locusId,"darkgrey","blue"), cex=labelSize); } if (nrow(dn) > 0) { arrows(dn$start, ifelse(dn$strand=="+",1,-1)*(1+offNew), dn$stop, ifelse(dn$strand=="+",1,-1)*(1+offNew), angle=30, code=2, col="orange", length=0.1, lwd=2); dn$above = ifelse(1:nrow(dn) %% 2 == 0, 1, -1) * geneTextShift; text((dn$begin+dn$end)/2, ifelse(dn$strand=="+",1,-1) + dn$above, dn$label, col="orange", cex=labelSize); } if (nrow(dtt) > 0) { text(dtt$stop, ifelse(dtt$strand=="+",1,-1), "T", col=ifelse(dtt$stop %in% ttConfirm$stop[ttConfirm$scaffoldId==1944], "black", "darkgrey"), cex=1.5); } if (!is.null(genelines) && !is.null(geneAltCodon)) { altstart = genelines$start + ifelse(genelines$strand=="+",1,-1)*3*(geneAltCodon); arrows(altstart, ifelse(genelines$strand=="+",1,-1)*(1-offNew), genelines$stop, ifelse(genelines$strand=="+",1,-1)*(1-offNew), angle=30, code=2, length=0.1, col=if(ischanged) "blue" else "darkgrey", lty=if(ischanged) 1 else 1, lwd=2); } if(showLegend) legend(geneLegendX, geneLegendY, c("Genes:", "Original","Removed","New", "Terminators:", "Confirmed", "Other"), col=c(NA, "blue","darkgrey","orange",NA,"black","darkgrey"), pch=c(rep(NA,5),"T","T"), lty=c(rep(1,4),rep(NA,3)), lwd=2, pt.cex=c(rep(1,5),rep(1.5,2))); axis(2, at=c(-1,1), labels=c("-","+"), tick=F); # 2 = at left } SaveExampleFigure = function(at = c(1719,1725.0)*1000, # was 1719.5 to 1723 or 1715:1725 newnames = list(DORF12455="copG", DORF12457="DUF497", DORF12461="higB", DORF12465="?", DORF12467="DUF497 (C)", DORF12469="DUF497 (N)", DVU1645="DVU1645"), geneLabelOff=0.005, file="data/Dvul/ExampleFigure.eps", ...) { if (!is.null(file)) postscript(file=file, width=7, height=5.5, pointsize=10, horiz=F); # outer margin at bottom only for y axis labels; inner margin at left for x axis labels # mgp moves y axis labels closer oldpar = par(mfrow=c(4,1), mar=c(0,4,0,0), oma=c(3,0,0,0), mgp=c(2,1,0)); RegionPlots(at, newnames=newnames, geneLabelOff=geneLabelOff, ...); if (is.null(file)) { par(oldpar); } else { dev.off(); } } SaveGidBFigure = function(at = c(1335,1339)*1000, newnames = NULL, geneLabelOff=0.005, file="data/Dvul/GidBFigure.eps", ...) { if (!is.null(file)) postscript(file=file, width=5, height=5.5, pointsize=10, horiz=F); # outer margin at bottom only for y axis labels; inner margin at left for x axis labels # mgp moves y axis labels closer oldpar = par(mfrow=c(4,1), mar=c(0,4,0,0), oma=c(3,0,0,0), mgp=c(2,1,0)); RegionPlots(at, newnames=newnames, geneLabelOff=geneLabelOff, ...); if (is.null(file)) { par(oldpar); } else { dev.off(); } } GeneUpdateFigure = function() { oldpar = par(mfcol=c(4,3), mar=c(0,4,0,0), oma=c(3,0,0,0.5), mgp=c(2,1,0)); # oma/mar are bottom,left,top,right RegionPlots(c(1554600,1555200), showLegend=F, featureLines=F, offNew=0.1, newnames=list(DORF11409="?"), gene="DVU1473"); RegionPlots(c(2042100,2042700), showLegend=F, featureLines=F, yaxis=F, gene="DVU1966", geneAltCodon=31-1, offNew=0.25); RegionPlots(c(1421050,1421650), showLegend=F, featureLines=F, yaxis=F, gene="DVU1344", geneAltCodon=23, offNew=0.25); par(oldpar); } SaveGeneUpdateFigure = function() { postscript(file="data/Dvul/GeneUpdateFigure.eps", width=7, height=5.5, pointsize=10, horiz=F); GeneUpdateFigure(); dev.off(); } SaveSupFigClassifier = function(file="data/Dvul/SupFigClassifier.eps") { if (!is.null(file)) postscript(file, width=7, height=5, pointsize=10, horiz=F); oldpar = par(mfrow=c(2,3), mar=c(4,4,1,1)); # 4 lines at left and bottom hist(rpeaks$lo, xlab="Log odds", main=""); vline(minlo); CompareHistogramList(split(ifelse(is.na(rpeaks$rrich), 0, abs(rpeaks$rrich)), rpeaks$lo > minlo), breaks=seq(0,1,0.05), xlab="Local Correlation of Matching Rise\nin Rich Media Tiling", labels=c("High confidence", "Other"), col=2:3, legendX=0.2); CompareHistogramList(split(ifelse(is.na(rpeaks$rmin), 0, abs(rpeaks$rmin)), rpeaks$lo > minlo), breaks=seq(0,1,0.05), xlab="Local Correlation of Matching Rise\nin Minimal Media Tiling", labels=c("High confidence", "Other"), col=2:3, legendX=0.2); CompareHistogramList(split(ifelse(is.na(rpeaks$bits),0,rpeaks$bits), rpeaks$lo > minlo), breaks=0.5+(0:20), xlab="Bit Score of Motif Hit", labels=c("High confidence", "Other"), col=2:3, legendX=5); CompareHistogramList(split(rpeaks$ntot, rpeaks$lo > minlo), breaks=10**(seq(1,6,0.25)), log="x", xlab="Total Number of Reads", labels=c("High confidence", "Other"), col=2:3, legendX=20); if (!is.null(file)) { dev.off(); } else { par(oldpar); } } SaveFigures = function() { SaveExampleFigure(); SaveCoverageFigure(); SaveGeneUpdateFigure(); SaveBigQualityFigure(); SaveProtLenFigure(); SaveUTRFigure(); SaveInternalPromoterFigure(); SaveSupFigClassifier(); SaveGidBFigure(); }