1 files changed, 104 insertions, 12 deletions
diff --git a/day2/ex3-R/script.R b/day2/ex3-R/script.R
index fcbf6e1..d25a42e 100644
--- a/day2/ex3-R/script.R
+++ b/day2/ex3-R/script.R
@@ -1,16 +1,108 @@
+# Adapted from https://www.bioconductor.org/packages/devel/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow.html
+# with lots of help from Hannah Coughlan
+
+# Loading packages
+library(limma)
 library(edgeR)
+library(Mus.musculus)
+library(RColorBrewer)
+
+# Data packaging and annotation
+# The original example downloads directly from the internet, something that
+# is forbidden in a Nix build to ensure reproducibility. Instead, we assume
+# the tarball has already been fetched and unpacked, and the contents are
+# available in the working directory
+files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt", "GSM1545538_purep53.txt",
+           "GSM1545539_JMS8-2.txt", "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt",
+           "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt", "GSM1545545_JMS9-P8c.txt")
+for(i in paste(files, ".gz", sep=""))
+    R.utils::gunzip(i, overwrite=TRUE)
 
-# It's easier to read environment variables than parse command line arguments
-counts <- read.delim(Sys.getenv("counts"), row.names=1)
+# Get the directory to output results in
 out <- Sys.getenv("out")
 
-group <- rep(c("A", "B"), each = 2)
-design <- model.matrix(~group)
-dge <- DGEList(counts=counts)
-keep <- filterByExpr(dge, design)
-dge <- dge[keep,,keep.lib.sizes=FALSE]
-dge <- calcNormFactors(dge)
-logCPM <- cpm(dge, log=TRUE, prior.count=3)
-fit <- lmFit(logCPM, design)
-fit <- eBayes(fit, trend=TRUE)
-write.table(topTable(fit, coef=ncol(design)), file = out)
+## Create the DGEList-object
+x <- readDGE(files, columns=c(1,3))
+
+## Organising sample information
+samplenames <- substring(colnames(x), 12, nchar(colnames(x)))
+
+colnames(x) <- samplenames
+group <- as.factor(c("LP", "ML", "Basal",
+                     "Basal", "ML", "LP",
+                     "Basal", "ML", "LP"))
+x$samples$group <- group
+lane <- as.factor(rep(c("L004","L006","L008"), c(3,4,2)))
+x$samples$lane <- lane
+
+## Gene annotation
+geneid <- rownames(x)
+genes <- select(Mus.musculus,
+                keys=geneid,
+                columns=c("SYMBOL", "TXCHROM"),
+                keytype="ENTREZID")
+head(genes)
+
+# Filtering and normalisation
+
+### Filtering for duplicated gene names
+genes <- genes[!duplicated(genes$ENTREZID),]
+x$genes <- genes
+
+## Filtering for lowly expressed genes
+keep.exprs <- filterByExpr(x, group=group)
+x <- x[keep.exprs,, keep.lib.sizes=FALSE]
+dim(x)
+
+## Normalising gene expression distributions
+x <- calcNormFactors(x, method = "TMM")
+
+## Unsupervisd clustering if sampples
+lcpm <- cpm(x, log=TRUE)
+
+png(file=paste0(out, "/mds.png"),width=700, height=350)
+par(mfrow=c(1,2))
+col.group <- group
+levels(col.group) <-  brewer.pal(nlevels(col.group), "Set1")
+col.group <- as.character(col.group)
+col.lane <- lane
+levels(col.lane) <-  brewer.pal(nlevels(col.lane), "Set2")
+col.lane <- as.character(col.lane)
+plotMDS(lcpm, labels=group, col=col.group)
+title(main="A. Sample groups")
+plotMDS(lcpm, labels=lane, col=col.lane, dim=c(3,4))
+title(main="B. Sequencing lanes")
+dev.off()
+
+# Differential expression analysis
+
+## Creating a design matrix and contrasts
+design <- model.matrix(~0+group+lane)
+colnames(design) <- gsub("group", "", colnames(design))
+
+contr.matrix <- makeContrasts(
+    BasalvsLP = Basal-LP,
+    BasalvsML = Basal - ML,
+    LPvsML = LP - ML,
+    levels = colnames(design))
+
+## Removing heteroscedascity from count data
+v <- voom(x, design, plot=FALSE)
+
+vfit <- lmFit(v, design)
+vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
+efit <- eBayes(vfit)
+
+## Examining the number of DE genes
+summary(decideTests(efit))
+
+### Using treat
+tfit <- treat(vfit, lfc=1)
+dt <- decideTests(tfit)
+summary(dt)
+
+### List of DE genes
+basal.vs.lp <- topTreat(tfit, coef=1, n=Inf)
+head(basal.vs.lp)
+write.csv(x = basal.vs.lp, file = paste0(out, "/DE_gene_list.csv"), quote = FALSE)
+