Microarray analysis
Here is the R code for a squamous cell carcinoma microarray dataset project that included data visualization, outlier identification, missing data imputation, normalization, statistical testing for differential expression, functional annotation, dimensionality reduction, cluster analysis and classification:
#DataSet Record GDS2520, Platform GPL8300, Series GSE6631 source("http://bioconductor.org/biocLite.R") biocLite("Biobase") biocLite("GEOquery") library(Biobase) library(GEOquery) gds2520 <- getGEO(GEO='GDS2520') Meta(gds2520)$feature_count #[1] "12625" Meta(gds2520)$sample_count #[1] "44" #turn GDS object into an expression set object (using base 2 logarithms) eset <- GDS2eSet(gds2520, do.log2=TRUE) eset #ExpressionSet (storageMode: lockedEnvironment) #assayData: 12625 features, 44 samples #element names: exprs #protocolData: none #phenoData #sampleNames: GSM153813 GSM153814 ... GSM153856 (44 total) #varLabels: sample individual disease.state description #varMetadata: labelDescription #featureData #featureNames: 1000_at 1001_at ... AFFX-YEL024w/RIP1_at (12625 total) #fvarLabels: ID Gene title ... GO:Component ID (21 total) #fvarMetadata: Column labelDescription #experimentData: use 'experimentData(object)' #pubMedIds: 15170515 #Annotation: sampleNames(eset) #[1] "GSM153813" "GSM153814" "GSM153815" "GSM153816" "GSM153817" "GSM153818" #[7] "GSM153819" "GSM153820" "GSM153821" "GSM153822" "GSM153823" "GSM153824" #[13] "GSM153825" "GSM153826" "GSM153827" "GSM153828" "GSM153829" "GSM153830" #[19] "GSM153831" "GSM153832" "GSM153833" "GSM153834" "GSM153835" "GSM153836" #[25] "GSM153837" "GSM153838" "GSM153839" "GSM153840" "GSM153841" "GSM153842" #[31] "GSM153843" "GSM153844" "GSM153845" "GSM153846" "GSM153847" "GSM153848" #[37] "GSM153849" "GSM153850" "GSM153851" "GSM153852" "GSM153853" "GSM153854" #[43] "GSM153855" "GSM153856" Table(gds2520)[1:10,] #raw dat <- exprs(eset) ann <- pData(eset) ##testing for outliers## #correlation matrix dat.cor <- cor(dat, use="pairwise.complete.obs") layout(matrix(c(1,1,1,1,1,1,1,1,2,2),5,2,byrow=TRUE)) par(oma=c(5,7,1,1)) par(mgp=c(0, 1, 0)) cx <- rev(colorpanel(25, 'yellow', 'black', 'blue')) leg <- seq(min(dat.cor, na.rm=T), max(dat.cor, na.rm=T), length=10) image(dat.cor, main='Correlation plot for Head \nand neck squamous cell carcinoma dataset', axes=F, col=cx, xlab='timepoints', ylab='timepoints') axis(1, at=seq(0,1,length=ncol(dat.cor)), label=dimnames(dat.cor)[[2]], cex.axis=0.9, las=2) axis(2, at=seq(0,1,length=ncol(dat.cor)), label=dimnames(dat.cor)[[2]], cex.axis=0.9, las=2) image(as.matrix(leg), col=cx, axes=F) tmp <- round(leg, 2) axis(1, at=seq(0, 1, length=length(leg)), labels=tmp, cex.axis=1) #CV vs Mean plot dat.mean <- apply(dat,2,mean) dat.sd <- sqrt(apply(dat,2,var)) dat.cv <- dat.sd/dat.mean plot(dat.mean, dat.cv, main="Head and Neck Squamous Cell Carcinoma dataset \nCV vs. Mean", xlab="Mean", ylab="CV", col='blue', cex=1.5, type="n") points(dat.mean, dat.cv, bg="lightblue", col=1, pch=21) text(dat.mean, dat.cv, label=dimnames(dat)[[2]], pos=1, cex=0.5) #Avg correlation plot dat.avg <- apply(dat.cor,1,mean) par(oma=c(3,0.1,0.1,0.1)) plot(c(1,length(dat.avg)),range(dat.avg),type="n",xlab="",ylab="Avg r",main="Avg correlation of Tumor/Normal tissue samples",axes=F) points(dat.avg,bg="red",col=1,pch=21,cex=1.25) axis(1,at=c(1:length(dat.avg)),labels=dimnames(dat)[[2]],las=2,cex.lab=0.4,cex.axis=0.6) axis(2) abline(v=seq(0.5,62.5,1),col="grey") #cluster dendrogram dat.t <- t(dat)#transpose dat dat.dist <- dist(dat.t,method="euclidean")# calculate distance dat.clust <- hclust(dat.dist,method="single")# calculate clusters plot(dat.clust,labels=names(dat),cex=0.75)# plot cluster tree #PCA plot dat.pca <- prcomp(t(dat)) dat.loads <- dat.pca$x[,1:2] plot(dat.loads[,1], dat.loads[,2], main='PCA plot of \nHead and Neck Squamous Cell Carcinoma dataset', xlab="p1", ylab="p2", col="red", cex=1, pch=15) text(dat.loads, label=dimnames(dat)[[2]], pos=1, cex=0.5) ##Filtering out low expression genes## #mean of genes dat.mean.genes <- apply(dat, 1, mean) mean.gt.4 <- dat.mean.genes>4 length(dat.mean.genes[mean.gt.4]) #[1] 11804 keep <- names(dat.mean.genes[mean.gt.4]) #filter out genes with a mean <= 4 dat2 <- dat[keep,] ##t-test## #disease state classes# ann <- ann[,-2] ann <- ann[,-3] dimnames(ann)[[2]] #[1] "sample""disease.state" ann <- as.data.frame(ann) ann2 <- as.data.frame(ann[,2]) dimnames(ann2)[[1]] <- dimnames(ann)[[1]] normal <- ann[,2]=='normal' normal <- dimnames(ann[normal,])[[1]] carcinoma <- ann[,2]=='head and neck squamous carcinoma' carcinoma <- dimnames(ann[carcinoma,])[[1]] #run test t.test.all.genes <- function(x,s1,s2) { x1 <- x[s1] x2 <- x[s2] x1 <- as.numeric(x1) x2 <- as.numeric(x2) t.out <- t.test(x1,x2, alternative="two.sided",var.equal=T) out <- as.numeric(t.out$p.value) return(out) } pv <- apply(dat2, 1, t.test.all.genes, s1=normal, s2=carcinoma) #plot hist(pv, col="lightblue", xlab="p-values", main="P-value distribution", cex.main=0.9) abline(v=0.05, col=2, lwd=2) ##adjust p-values## pv.holm <- p.adjust(pv, method="holm") pv.bonf <- p.adjust(pv, method="bonferroni") pv.fdr <- p.adjust(pv, method="fdr") #graphs of p-value distrbutions after adjustment hist(pv.holm, col="lightblue", xlab="p-values", main="P-value distribution - holm adjusted", cex.main=0.9) abline(v=0.05, col=2, lwd=2) hist(pv.bonf, col="lightblue", xlab="p-values", main="P-value distribution - bonferroni adjusted", cex.main=0.9) abline(v=0.05, col=2, lwd=2) hist(pv.fdr, col="lightblue", xlab="p-values", main="P-value distribution - fdr adjusted", cex.main=0.9) abline(v=0.05, col=2, lwd=2) #number of genes with p-value less than 0.05 pv.lt.05 <- pv.fdr[(pv.fdr<0.05)] length(pv.lt.05) #[1] 1365 ##Plot of p-values for genes retained## hist(pv.lt.05, col="lightblue", xlab="p-values", main="P-value distribution of Retained Genes", cex.main=0.9) ##Dimensionality Reduction or Clustering## #subset data by genes retained dat3 <- dat2[names(pv.lt.05),] #PCA dat.pca <- prcomp(t(dat3), cor=F) #scree plot dat.pca.var <- round(dat.pca$sdev^2 / sum(dat.pca$sdev^2)*100,2) plot(c(1:length(dat.pca.var)), dat.pca.var, type="b", xlab="# components", ylab="% variance", pch=21, col=1, bg=3, cex=1.5) title("Scree plot showing % variability explained by each eigenvalue") #plot of three components dat.loadings <- dat.pca$x[,1:3] par(mfrow=c(2,2)) plot(range(dat.loadings[,1]),range(dat.loadings[,2]),type="n",xlab='p1',ylab='p2',main='PCA plot of Normal/Carcinoma Data\np2 vs. p1') points(dat.loadings[,1][normal], dat.loadings[,2][normal],col=1,bg='red',pch=21,cex=1.5) points(dat.loadings[,1][carcinoma], dat.loadings[,2][carcinoma],col=1,bg='blue',pch=21,cex=1.5) legend('topleft',c("Normal","Carcinoma"),col=c("red","blue"),pch=15,cex=.9,horiz=F) plot(range(dat.loadings[,1]),range(dat.loadings[,3]),type="n",xlab='p1',ylab='p3',main='PCA plot of Normal/Carcinoma Data\np3 vs. p1') points(dat.loadings[,1][normal],dat.loadings[,3][normal],col=1,bg='red',pch=21,cex=1.5) points(dat.loadings[,1][carcinoma], dat.loadings[,3][carcinoma],col=1,bg='blue',pch=21,cex=1.5) legend('topleft',c("Normal","Carcinoma"),col=c("red","blue"),pch=15,cex=.9,horiz=F) plot(range(dat.loadings[,2]),range(dat.loadings[,3]),xlab='p2',type="n",ylab='p3',main='PCA plot of Normal/Carcinoma Data\np3 vs. p2') points(dat.loadings[,2][normal],dat.loadings[,3][normal],col=1,bg='red',pch=21,cex=1.5) points(dat.loadings[,2][carcinoma], dat.loadings[,3][carcinoma],col=1,bg='blue',pch=21,cex=1.5) legend('topleft',c("Normal","Carcinoma"),col=c("red","blue"),pch=15,cex=.9,horiz=F) ##discriminant analysis## library(MASS) dat.loads <- dat.pca$x[,1:3] dat4 <- as.data.frame(dat.loads) dat4 <- data.frame(ann2, dat4) #train model dat.lda <- lda(dat4[,1]~., dat4[,2:4]) #predict dat.pred <- predict(dat.lda, dat4[,2:4]) #confusion table table(dat.pred$class, dat4[,1]) # head and neck squamous carcinoma normal #head and neck squamous carcinoma 210 #normal1 22 #plot of 3 components and with predicted versus actual memberships par(mfrow=c(2,2)) plot(range(dat.loadings[,1]),range(dat.loadings[,2]),type="n",xlab='p1',ylab='p2',main='PCA plot of Normal/Carcinoma Data\np1 vs. p2') points(dat.loadings[,1][dat.pred$class=='normal'], dat.loadings[,2][dat.pred$class=='normal'],col='red',bg='red',pch="o",cex=1.5) points(dat.loadings[,1][dat.pred$class=='head and neck squamous carcinoma'], dat.loadings[,2][dat.pred$class=='head and neck squamous carcinoma'],col='blue',bg='blue',pch="o",cex=1.5) points(dat.loadings[,1][normal], dat.loadings[,2][normal],pch="*",cex=1) legend('topleft',c("predicted Normal","predicted Carcinoma"), pch="o", col=c('red','blue'), cex=0.9, horiz=F) legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "o"), col='black', cex=0.9, horiz=F) legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "*"), col='black', cex=0.9, horiz=F) plot(range(dat.loadings[,1]),range(dat.loadings[,3]),type="n",xlab='p1',ylab='p3',main='PCA plot of Normal/Carcinoma Data\np1 vs. p3') points(dat.loadings[,1][dat.pred$class=='normal'], dat.loadings[,3][dat.pred$class=='normal'],col='red',bg='red',pch="o",cex=1.5) points(dat.loadings[,1][dat.pred$class=='head and neck squamous carcinoma'], dat.loadings[,3][dat.pred$class=='head and neck squamous carcinoma'],col='blue',bg='blue',pch="o",cex=1.5) points(dat.loadings[,1][normal], dat.loadings[,3][normal],pch="*",cex=1) legend('topleft',c("predicted Normal","predicted Carcinoma"), pch="o", col=c('red','blue'), cex=0.9, horiz=F) legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "o"), col='black', cex=0.9, horiz=F) legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "*"), col='black', cex=0.9, horiz=F) plot(range(dat.loadings[,2]),range(dat.loadings[,3]),type="n",xlab='p2',ylab='p3',main='PCA plot of Normal/Carcinoma Data\np2 vs. p3') points(dat.loadings[,2][dat.pred$class=='normal'], dat.loadings[,3][dat.pred$class=='normal'],col='red',bg='red',pch="o",cex=1.5) points(dat.loadings[,2][dat.pred$class=='head and neck squamous carcinoma'], dat.loadings[,3][dat.pred$class=='head and neck squamous carcinoma'],col='blue',bg='blue',pch="o",cex=1.5) points(dat.loadings[,2][normal], dat.loadings[,3][normal],pch="*",cex=1) legend('topleft',c("predicted Normal","predicted Carcinoma"), pch="o", col=c('red','blue'), cex=0.9, horiz=F) legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "o"), col='black', cex=0.9, horiz=F) legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "*"), col='black', cex=0.9, horiz=F) ##top 10 discriminant genes## #p-value instead of fold-change pv.sorted <- sort(pv, decreasing=FALSE) names(pv.sorted[1:10]) # [1] "39657_at" "38051_at" "41644_at" "37093_at" "39801_at" "40315_at" # [7] "32868_at" "988_at" "38394_at" "33839_at"