LoadCost = function(...) { LoadEcFit(); # creates ecf # ribosome profiling data from Li et al 2014, table S1 # Unfortunately it has gene names but not the b numbers, # and the gene names do not always match ecocyc or the annotation used for the fitness data. # # Some genes from the Li et al data will not appear below in ecRibo below. # The only expressed ones (fmin > 5e-6) are cases where the mapping between # genes and proteins breaks down: # tufA+tufB, dnaXgamma vs. dnaXtau, gadA+gadB, pinR+pinQ, ynaE+ydfK ecRiboAll = read.delim("data/Ec/LiEtAl2014_TableS1.tab",as.is=T); colnames(ecRiboAll) = c("Gene","rich","min","nomet"); # all MOPS based; nomet is rich minus methionine ecRiboAll$fmin = ecRiboAll$min/sum(ecRiboAll$min); ecRiboAll$frich = ecRiboAll$rich/sum(ecRiboAll$rich); # derived from ecocyc 14.6 ecIds <<- read.delim("data/mutagenesis/cmp/gene_ids.tab",header=F,as.is=T, col.names=c("ecocyc","name","BNum","eck","begin","end","strand","synonyms")); # additional names mapped by searching the E. coli wiki map = read.delim("workbook/fitevo/Li_names.map",as.is=T); d = rbind(merge(ecRiboAll[!ecRiboAll$Gene %in% map$Gene,], ecIds[,c("name","BNum")], by.x="Gene", by.y="name"), merge(ecRiboAll[!ecRiboAll$Gene %in% c(ecIds$name,map$Gene),], ecIds[,c("synonyms","BNum")], by.x="Gene", by.y="synonyms")); d = rbind(d, merge(ecRiboAll[!ecRiboAll$Gene %in% d$Gene,], map)); ecRibo = d; stopifnot( all(is.unique(ecRibo$Gene)) ); stopifnot( all(is.unique(ecRibo$BNum)) ); stopifnot( length(intersect(setdiff(ecRiboAll$Gene,ecRibo$Gene), map$Gene)) == 0 ); # use gene length to get codon fraction (fweight or its log10, logW) genes = read.delim("data/FEBA/g/Keio/genes.tab", as.is=T); genes$ntlen = genes$end-genes$begin+1; # always set up so begin < end and includes stop codon ecRibo = merge(ecRibo, genes[,c("sysName","ntlen","desc")], by.x="BNum", by.y="sysName"); # still 4062 ecRibo$aa = ecRibo$ntlen/3 - 1; ecRibo$fweight = (ecRibo$fmin * ecRibo$aa)/sum(ecRibo$fmin*ecRibo$aa); # minor correction for missing genes ecRibo$fweight = ecRibo$fweight * sum(ecRibo$fmin)/sum(ecRiboAll$fmin); ecRibo$logW = log10(pmax(1e-7,ecRibo$fweight)); # and add fitness information ecRibo = merge(ecRibo, ecf, by.x="BNum", by.y="sysName", all.x=T); stopifnot( all(is.unique(ecRibo$BNum)) ); # Essential genes are from PEC (v4), Keyamura et al. 2005 and Kato & Hashimoto MSB 2007 ecEss = read.delim("data/Ec/PECv4.tab",as.is=T); ecEss$bnum = sapply(ecEss$Alternative.Name, function(x) { w = words(x,by=","); w = w[grep("^b\\d+",w)]; w[1]; }); ecRibo$ess = ecRibo$BNum %in% ecEss$bnum | ecRibo$Gene=="polA"; # Growth of Keio collection deletions in minimal or rich media, from Baba et al. MSB 2006 keioG <<- read.delim("data/mutagenesis/keio_growth.csv2",na.strings=c("NA","NT")); # If a non-essential gene has no fitness data, it might be that it is important for fitness # in LB (and hence is absent from the pool) as well as in minimal media. # To identify these genes, require that it be in the bottom 5% of knockout OD values # in both rich and minimal media. ecRibo$sick = with(ecRibo, !ess & is.na(imp) & BNum %in% keioG$bnum[keioG$LB_22hr < quantile(keioG$LB_22hr,0.05,na.rm=T) & keioG$MOPS_24hr < quantile(keioG$MOPS_24hr,0.05,na.rm=T)]); # class: 1 essential, 2 important for fitness, 3 detrimental, 4 no phenotype, 5 ambiguous or unknown ecRibo$class = with(ecRibo, ifelse(ess, 1, ifelse(sick, 2, ifelse(is.na(imp), 5, imp+1)))); # save the key tables ecRiboAll <<- ecRiboAll; ecRibo <<- ecRibo; genes <<- genes; ecEss <<- ecEss; # TF information from RegulonDB, in tfRibo tfop = read.delim("motifs/RegulonDB/network_tf_operon_Sep2015.txt",comment="#",header=F,col.names=c("tfname","operon","sign","evidence","confidence","blank")); tf = unique(tfop[,"tfname",drop=F]); # 198 tf$lower = tolower(tf$tfname); tf = merge(tf, cbind(ecRibo[,c("Gene","BNum")],lower=tolower(ecRibo$Gene)), all.x=T); # still 198 tfbnum = read.delim("workbook/fitevo/tf_bnum.tab",as.is=T); tf = merge(tf, tfbnum, by="tfname", all.x=T); tf$BNum = ifelse(is.na(tf$BNum.x), as.character(tf$BNum.y), as.character(tf$BNum.x)); tfN = as.data.frame.table(table(unique(tfop[,c("tfname","operon")])$tfname)); names(tfN) = c("tfname","nTarget"); tfRibo = merge(tf[,c("tfname","BNum")], ecRibo); # 186 rows tfRibo = merge(tfRibo,tfN); tfRibo <<- tfRibo; # Also load the t values from Time0 comparisons for E. coli K-12 from # Wetmore et al. 2015 old_t = read.delim("http://genomics.lbl.gov/supplemental/rbarseq/html/Keio/fit_t.tab", as.is=T, check.names=F); t0cmp <<- cbind(old_t[,1:3], old_t[,grepl("Time0",names(old_t))]); # genes with class == 4 (no detectable phenotype) and fweight > 2e-4, # sorted into groups by hand and with comments (including if redundant # for genes that might be expected to have a phenotype, i.e. not catabolic, anaerobic, transporters) nophe <<- merge(read.delim("workbook/fitevo/highexpr_useless.tab6",as.is=T), ecRibo); # Number of phenotypes for each gene in a large fitness compendium ecsig <<- read.delim("workbook/fitevo/sig.Keio"); } # build ecf, a data frame with E. coli fitness data including # locusId (MicrobesOnline or VIMSS id), sysName (or b number), # various fitness values or t values, and # imp -- 1 if important, 2 if detrimental, 3 if no phenotype, 4 if ambiguous # Genes with inadequate representation in the Time0 samples are ignored # and imp is based on the fitness data only. LoadEcFit = function(bigdir1="data/FEBA/g/Keio/glucose_big", bigdir2="workbook/fitevo/Keio_big2") { loadfrom = function(dir,name) read.delim(paste(dir,"/",name,sep=""), as.is=T, check.names=F); # Two independent sequencing runs of the same PCR products # set 1 was run on MiSeq and is lower coverage # set 2 was run on HiSeq and is higher coverage # Each includes a Time0 and two 12-generation samples (independently from that same Time0) # set 1 also includes two sample collected at ~6 generations, before the transfer big_fit1 = loadfrom(bigdir1, "fit_logratios.tab"); big_t1 = loadfrom(bigdir1, "fit_t.tab"); big_fit2 = loadfrom(bigdir2, "fit_logratios.tab"); big_t2 = loadfrom(bigdir2, "fit_t.tab"); d = merge(cbind(big_t1, fit6A=big_fit1$"setAS2 A", fit6B=big_fit1$"setAS3 B"), cbind(big_t2, fit12A=big_fit2$setAIT095, fit12B=big_fit2$setAIT096), by=c("locusId","sysName","desc")); d$fit12 = rowMeans(d[,c("fit12A","fit12B")]); # because of the shared Time0, divide by sqrt(3) # var(Y-X + Z-X) = var(Y) + var(Z) + 4 * var(X) = 6 * var(Component) # var(Y-X) = 2 * var(Component) # So tA + tB is sqrt(3) times bigger than from a standard experiment # Also the name t6 is misleading -- that is t at 4.9 generations not 6 generations. d$t6 = (d$"setAS2 A" + d$"setAS3 B")/sqrt(3); d$t12_1 = (d$"setAS4 A2" + d$"setAS5 B2")/sqrt(3); d$t12_2 = (d$setAIT095 + d$setAIT096)/sqrt(3); # To identify significant fitness effects, # use the better-sampled results at 12 generations, and require # |combined t| > 4, corresponding to ~0.2 expected false positives d$imp = ifelse(d$t12_2 < -4 & d$setAIT095 < 0 & d$setAIT096 < 0, 1, ifelse(d$t12_2 > 4 & d$setAIT095 > 0 & d$setAIT096 > 0, 2, 3)); # If the sign was different at 6 generations, then call it ambiguous d$imp[d$imp == 1 & d$t6 > 0] = 4; # 5 genes d$imp[d$imp == 2 & d$t6 < 0] = 4; # 2 genes # If |average fitness| after 12 generations was above 0.5, yet the # gene was not called significant, then it is ambiguous d$imp[abs(d$fit12) > 0.5 & d$imp==3] = 4; # 41 genes ecf <<- d; } andNoNA = function(x) ifelse(is.na(x), FALSE, x); is.unique = function(x) { counts = as.data.frame.table(table(x), dnn="x"); return(x %in% counts$x[counts$Freq==1]); } CompareDensities <- function(list,labels=names(list),xlim=range(unlist(list)),ylim=c(0,3), col=1:length(labels), lty=1:length(labels), lwd=rep(1,length(labels)), legendX=mean(xlim),legendY=ylim[2],main="",xlab="",ylab="",showCounts=FALSE, showLegend=TRUE, bty="o", ...) { for (i in 1:length(labels)) { x = na.omit(list[[ names(list)[i] ]]); d <- density(x,from=xlim[1],to=xlim[2]); if(i==1) { plot(d$x,d$y,type="l",col=col[i],lty=lty[i],lwd=lwd[i], main=main,xlab=xlab,ylab=ylab,xlim=xlim,ylim=ylim,bty="l"); } else { lines(d$x,d$y,col=col[i],lty=lty[i],lwd=lwd[i]); } if (showCounts) labels[i] <- paste(labels[i]," (n=", sum(!is.na(x)), ")", sep=""); } if(showLegend) { if(is.numeric(legendX)) { legend(legendX,legendY,labels,col=col,lty=lty,lwd=lwd, bty=bty, ...); } else { # e.g. "topleft" legend(legendX, labels,col=col,lty=lty,lwd=lwd, bty=bty, ...); } } } # split a string separated by spaces into a list of word words = function(s, by=" ", ...) { strsplit(s[1], by, ...)[[1]]; } writeDelim = function(table,file=stdout(),report=FALSE,...) { write.table(table,file,sep="\t",quote=FALSE,row.names=FALSE,...); if(report) cat("Wrote",nrow(table),"rows","to",file,"\n") } NoPheTable = function() { d = nophe; d = d[order(d$fweight,decreasing=T),]; d = d[1:20,]; d$fweight = paste(formatC(100*d$fweight,digits=1,format="f"),"%",sep=""); writeDelim(d[,words("Gene fweight key desc comment")]); } CostVsBenefit = function(file = "workbook/fitevo/CostVsBenefit.eps") { if(!is.null(file)) { setEPS(); postscript(file, width=6.75, height=3.5, pointsize=10, horizontal=F, paper="special", onefile=F); } oldpar = par(mfrow=c(1,2), mgp=c(2,1,0),mar=c(3,1,4,0)+0.5, bty="l", xaxs="i", yaxs="i", yaxt="n", xaxt="n"); labels=c("Essential","Important","Detrimental (1%)","No phenotype","Ambiguous"); cols=words("orange red darkgreen blue darkgrey"); d = subset(ecRibo, class != 5); # remove ambiguous cases CompareDensities(split(d$logW,d$class), labels=labels[1:4], showCounts=TRUE, col=cols[1:4], lwd=rep(2,4), lty=c("1131","51","121121","11"), xlab="Protein expression by weight (parts per million)", ylab="", main="A. The Distribution of Expression", legendX="topleft", ylim=c(0,0.8), bty="n"); par(xaxt="s"); axis(side=1, at=c(-7:-2), labels=c(0.1,1,10,100,1000,"10,000")); mtext("Density", 2, line=0.5); d = tapply(ecRibo$fweight, ecRibo$class, sum); d = d/sum(d); # sums to 98% labels[4] = "No\nphenotype"; pie(d, clockwise=TRUE, init.angle=270, labels=labels, main="B. Aggregate Expression", col=cols, radius=0.8); # relative to standard of 1 r = 0.5; anglesEnd = 270 - cumsum(d) * 360; anglesBeg = 270 - c(0,cumsum(d[-5])) * 360; angles = (anglesBeg+anglesEnd)/2; x = r * cos(angles*2*pi/360); y = r * sin(angles*2*pi/360); frac = paste(round(d*100), "%", sep=""); frac[3] = ""; # too small, do not show text(x,y,frac); par(oldpar); if(!is.null(file)) dev.off(); } ExprVsSig = function(file="workbook/fitevo/ExprVsSig.eps") { if(!is.null(file)) { setEPS(); postscript(file, width=3.5, height=3.5, pointsize=10, horizontal=F, paper="special", onefile=F); } oldpar = par(mgp=c(2,1,0),mar=c(3,1,2,0)+0.5, xaxs="i", yaxs="i", xaxt="n", yaxt="n", bty="l"); d = merge(subset(ecRibo,class==4), ecsig, by="locusId"); CompareDensities(split(d$logW,ifelse(d$nMinus>0,1,2)), labels=c("Sick in other condition(s)","Not"), ylim=c(0,0.7), legendX=-7, xlab="Protein expression by weight (parts per million)", showCounts=T, bty="n",lwd=c(2,2)); par(xaxt="s"); axis(side=1, at=c(-7:-2), labels=c(0.1,1,10,100,1000,"10,000")); par(oldpar); if(!is.null(file)) dev.off(); } FitReps = function(file="workbook/fitevo/FitReps.eps") { if(!is.null(file)) { setEPS(); postscript(file, width=7.0, height=3.25, pointsize=10, horizontal=F, paper="special", onefile=F); } oldpar = par(mfrow=c(1,2), mgp=c(2,1,0),mar=c(3,3,1,3)+0.5, mgp=c(2,1,0), xaxs="i", yaxs="i", bty="l"); cols = words("red darkgreen darkgrey blue") plot(pmax(-3,ecf$fit12A), pmax(-3,ecf$fit12B), col=cols[ecf$imp], pch=ecf$imp, xlab="Fitness at 12 generations, Replicate A", ylab="Fitness at 12 generations, Replicate B", cex=0.7, main="A. Consistency of Replicates", xpd=T); abline(0,1); abline(h=0); abline(v=0); legend("topleft",paste(c("Important","Detrimental","No Phenotype","Ambiguous"), table(ecf$imp), sep=", "), col=cols, pch=1:4, cex=0.85, pt.cex=0.7, bty="n"); d = subset(ecRibo, class %in% 2:3 & fmin < 2e-6); ecfcode = ifelse(abs(ecf$t12_2) > 4 & sign(ecf$t6) != sign(ecf$t12_2), 1, ifelse(ecf$sysName %in% d$BNum, 2, ifelse(ecf$sysName %in% ecRibo$BNum[ecRibo$class %in% 2:3], 3, 4))); x = pmax(-3,rowMeans(ecf[,words("fit6A fit6B")])); y = pmax(-3,rowMeans(ecf[,words("fit12A fit12B")])); cols = words("blue red black darkgrey"); pchs = c(3,2,4,1); pt.large = 0.8; plot(x, y, xlab="Average fitness at 5 generations", ylab="Average fitness at 12 generations", main="B. Consistency Across Time", pch=pchs[ecfcode], cex=pt.large, xpd=T, col=cols[ecfcode]); # make sure highlighted points are on top points(x, y, pch=ifelse(ecfcode %in% 1:3, pchs[ecfcode], NA), col=cols[ecfcode], cex=pt.large); abline(h=0); abline(v=0); abline(0,1); legend("topleft", c("Outlier","Weak expression", "Other significant", "Not significant"), col=cols, pch=pchs, pt.cex=pt.large, cex=0.85, bty="n"); par(oldpar); if(!is.null(file)) dev.off(); }