Weighted correlation network analysis (WGCNA, Weighted correlation network analysis) is a systems biology method used to describe gene association patterns between different samples. It can be used to identify highly coordinated gene sets. Test data download link: https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/FemaleLiver-Data.zip
source("https://bioconductor.org/biocLite.R") biocLite(c("AnnotationDbi", "impute","GO.db", "preprocessCore")) install.packages(c("WGCNA", "stringr", "reshape2"), repos="https://mirrors.tuna.tsinghua.edu.cn/CRAN") library(WGCNA) options(stringsAsFactors = FALSE); #Open multi-threading, but it seems that this step will report an error on Mac, so don’t do it enableWGCNAThreads() library(WGCNA); options(stringsAsFactors = FALSE);
#Make a gene expression matrix #Read in the female liver data set femData = read.csv("Documents/project/test/FemaleLiver-Data/LiverFemale3600.csv"); datExpr0 = as.data.frame(t(femData[, -c(1:8)])); names(datExpr0) = femData$substanceBXH;
###If there are many genes, you can screen the genes that have changed a lot, reduce the amount of calculation, or do not do this step m.mad <- apply(dataExpr0,1,mad) dataExpr0 <- dataExpr0[which(m.mad> max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]
gsg = goodSamplesGenes(datExpr0, verbose = 3); gsg$allOK
If there are samples and genes that need to be removed, perform this step, skip this step if you are ok
if (!gsg$allOK) { # Optionally, print the gene and sample names that were removed: if (sum(!gsg$goodGenes)>0) printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", "))); if (sum(!gsg$goodSamples)>0) printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", "))); # Remove the offending genes and samples from the data: datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes] }
sampleTree = hclust(dist(datExpr0), method = "average"); plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
Such samples have outliers and need to be cut out
Leave only the samples you need
clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10) table(clust) # clust 1 contains the samples we want to keep. keepSamples = (clust==1) datExpr = datExpr0[keepSamples,] nGenes = ncol(datExpr) nSamples = nrow(datExpr) sampleTree = hclust(dist(datExpr), method = "average"); plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)
image.png
The final gene expression matrix
image.png
traitData = read.csv("Documents/project/test/FemaleLiver-Data/ClinicalTraits.csv"); dim(traitData) names(traitData) # remove columns that hold information we do not need. allTraits = traitData[, -c(31, 16)]; allTraits = allTraits[, c(2, 11:36) ]; # Form a data frame analogous to expression data that will hold the clinical traits. femaleSamples = rownames(datExpr); traitRows = match(femaleSamples, allTraits$Mice); datTraits = allTraits[traitRows, -1]; rownames(datTraits) = allTraits[traitRows, 1];
Completed feature matrix
image.png
############################################## ##########
#Soft threshold screening## # Choose a set of soft-thresholding powers powers = c(c(1:10), seq(from = 12, to=20, by=2)) # Call the network topology analysis function sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5) # Plot the results: sizeGrWindow(9, 5) par(mfrow = c(1,2)); cex1 = 0.9; # Scale-free topology fit index as a function of the soft-thresholding power plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence")); text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red"); # this line corresponds to using an R^2 cut-off of h abline(h=0.90,col="red") # Mean connectivity as a function of the soft-thresholding power plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity")) text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
image.png
Reference: http://blog.genesino.com/2018/04/wgcna/#wgcna%E5%9F%BA%E6%9C%AC%E6%A6%82%E5%BF%B5