diff options
Diffstat (limited to 'day2/ex3-R/script.R')
-rw-r--r-- | day2/ex3-R/script.R | 116 |
1 files changed, 104 insertions, 12 deletions
diff --git a/day2/ex3-R/script.R b/day2/ex3-R/script.R index fcbf6e1..d25a42e 100644 --- a/day2/ex3-R/script.R +++ b/day2/ex3-R/script.R @@ -1,16 +1,108 @@ +# Adapted from https://www.bioconductor.org/packages/devel/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow.html +# with lots of help from Hannah Coughlan + +# Loading packages +library(limma) library(edgeR) +library(Mus.musculus) +library(RColorBrewer) + +# Data packaging and annotation +# The original example downloads directly from the internet, something that +# is forbidden in a Nix build to ensure reproducibility. Instead, we assume +# the tarball has already been fetched and unpacked, and the contents are +# available in the working directory +files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt", "GSM1545538_purep53.txt", + "GSM1545539_JMS8-2.txt", "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt", + "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt", "GSM1545545_JMS9-P8c.txt") +for(i in paste(files, ".gz", sep="")) + R.utils::gunzip(i, overwrite=TRUE) -# It's easier to read environment variables than parse command line arguments -counts <- read.delim(Sys.getenv("counts"), row.names=1) +# Get the directory to output results in out <- Sys.getenv("out") -group <- rep(c("A", "B"), each = 2) -design <- model.matrix(~group) -dge <- DGEList(counts=counts) -keep <- filterByExpr(dge, design) -dge <- dge[keep,,keep.lib.sizes=FALSE] -dge <- calcNormFactors(dge) -logCPM <- cpm(dge, log=TRUE, prior.count=3) -fit <- lmFit(logCPM, design) -fit <- eBayes(fit, trend=TRUE) -write.table(topTable(fit, coef=ncol(design)), file = out) +## Create the DGEList-object +x <- readDGE(files, columns=c(1,3)) + +## Organising sample information +samplenames <- substring(colnames(x), 12, nchar(colnames(x))) + +colnames(x) <- samplenames +group <- as.factor(c("LP", "ML", "Basal", + "Basal", "ML", "LP", + "Basal", "ML", "LP")) +x$samples$group <- group +lane <- as.factor(rep(c("L004","L006","L008"), c(3,4,2))) +x$samples$lane <- lane + +## Gene annotation +geneid <- rownames(x) +genes <- select(Mus.musculus, + keys=geneid, + columns=c("SYMBOL", "TXCHROM"), + keytype="ENTREZID") +head(genes) + +# Filtering and normalisation + +### Filtering for duplicated gene names +genes <- genes[!duplicated(genes$ENTREZID),] +x$genes <- genes + +## Filtering for lowly expressed genes +keep.exprs <- filterByExpr(x, group=group) +x <- x[keep.exprs,, keep.lib.sizes=FALSE] +dim(x) + +## Normalising gene expression distributions +x <- calcNormFactors(x, method = "TMM") + +## Unsupervisd clustering if sampples +lcpm <- cpm(x, log=TRUE) + +png(file=paste0(out, "/mds.png"),width=700, height=350) +par(mfrow=c(1,2)) +col.group <- group +levels(col.group) <- brewer.pal(nlevels(col.group), "Set1") +col.group <- as.character(col.group) +col.lane <- lane +levels(col.lane) <- brewer.pal(nlevels(col.lane), "Set2") +col.lane <- as.character(col.lane) +plotMDS(lcpm, labels=group, col=col.group) +title(main="A. Sample groups") +plotMDS(lcpm, labels=lane, col=col.lane, dim=c(3,4)) +title(main="B. Sequencing lanes") +dev.off() + +# Differential expression analysis + +## Creating a design matrix and contrasts +design <- model.matrix(~0+group+lane) +colnames(design) <- gsub("group", "", colnames(design)) + +contr.matrix <- makeContrasts( + BasalvsLP = Basal-LP, + BasalvsML = Basal - ML, + LPvsML = LP - ML, + levels = colnames(design)) + +## Removing heteroscedascity from count data +v <- voom(x, design, plot=FALSE) + +vfit <- lmFit(v, design) +vfit <- contrasts.fit(vfit, contrasts=contr.matrix) +efit <- eBayes(vfit) + +## Examining the number of DE genes +summary(decideTests(efit)) + +### Using treat +tfit <- treat(vfit, lfc=1) +dt <- decideTests(tfit) +summary(dt) + +### List of DE genes +basal.vs.lp <- topTreat(tfit, coef=1, n=Inf) +head(basal.vs.lp) +write.csv(x = basal.vs.lp, file = paste0(out, "/DE_gene_list.csv"), quote = FALSE) + |