update ex2 to DE example

author: Justin Bedo <cu@cua0.org> 2022-10-10 10:48:02 +1100
committer: Justin Bedo <cu@cua0.org> 2022-10-10 13:22:03 +1100
commit: 059034ab3d5c40c8ef0350c440d4ae343802ec5c (patch)
tree: b510a982ff27bbabe6642ad02de362d311768da0
parent: 8c29f7d34fd713d7630ef081fd8cefbb83ef201b (diff)
3 files changed, 135 insertions, 30 deletions
diff --git a/day2/ex3-R/default.nix b/day2/ex3-R/default.nix
index 614c24f..7a5d39a 100644
--- a/day2/ex3-R/default.nix
+++ b/day2/ex3-R/default.nix
@@ -1,9 +1,9 @@
 /*
 This example tries to demonstrate that even very simple workflows (this
-is only really one computational stage) can benefit from some formalisation
-to provide reproducibility. We borrow part of the edgeR example available from
-https://ucdavis-bioinformatics-training.github.io/2018-September-Bioinformatics-Prerequisites/friday/limma_biomart_vignettes.html
-and specify it in BioNix so that it's easily reproducible.
+is only really one computational stage) can benefit from some
+formalisation to provide reproducibility. With help from Hannah Coughlan
+the example from Law et al. [1] was adapted and in this exercise we will
+fully specify it in BioNix so that it's easily reproducible.
 
 One key learning goal in this exercise is to understand that Nix only
 allows _inputs_ to be referenced during the execution of a build to
@@ -12,31 +12,43 @@ forbidden when the build sandbox is enabled (default, but not on Milton
 for technical reasons), meaning data cannot be fetched as part of a
 build as the original example does.
 
-We therefore fetch the count input as a separate stage and Nix will take
+We therefore fetch the count input as a seperate stage and Nix will take
 care of downloading it for us. The caveate is that the content of things
 fetched from the internet must be verified to give reproducibility. Nix
 does this through hashing.
 
+One important thing to note is this analysis does change depending on
+annotation, so achieving reproducibility requires some tooling to fix
+the software/database stack. Nix does this for us ensuring
+reproducibility.
+
 Goal: fill out the below to specify required R packages, execute the
 build and observe the hash collision. Update the hash and see if the
 build now completes successfully.
+
+Hint: All of CRAN and BioC are available in nixpkgs, however for
+convenience periods "." are converted to "_" as "." in the Nix language
+is attrset element access.
+
+1: https://www.bioconductor.org/packages/devel/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow.html
 */
 {bionix}:
 with bionix; let
   R = pkgs.rWrapper.override {packages = with pkgs.rPackages; [];};
-
-  counts = pkgs.fetchurl {
-    url = "https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2018-September-Bioinformatics-Prerequisites/master/friday/counts.tsv";
-    sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
-  };
 in
   stage {
-    inherit counts;
     name = "r-ex";
 
+    src = pkgs.fetchurl {
+      url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE63310&format=file";
+      sha256 = "sha256-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=";
+    };
+
     buildInputs = [R];
 
     buildCommand = ''
+      tar xf $src
+      mkdir $out
       Rscript ${./script.R}
     '';
   }
diff --git a/day2/ex3-R/script.R b/day2/ex3-R/script.R
index fcbf6e1..d25a42e 100644
--- a/day2/ex3-R/script.R
+++ b/day2/ex3-R/script.R
@@ -1,16 +1,108 @@
+# Adapted from https://www.bioconductor.org/packages/devel/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow.html
+# with lots of help from Hannah Coughlan
+
+# Loading packages
+library(limma)
 library(edgeR)
+library(Mus.musculus)
+library(RColorBrewer)
+
+# Data packaging and annotation
+# The original example downloads directly from the internet, something that
+# is forbidden in a Nix build to ensure reproducibility. Instead, we assume
+# the tarball has already been fetched and unpacked, and the contents are
+# available in the working directory
+files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt", "GSM1545538_purep53.txt",
+           "GSM1545539_JMS8-2.txt", "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt",
+           "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt", "GSM1545545_JMS9-P8c.txt")
+for(i in paste(files, ".gz", sep=""))
+    R.utils::gunzip(i, overwrite=TRUE)
 
-# It's easier to read environment variables than parse command line arguments
-counts <- read.delim(Sys.getenv("counts"), row.names=1)
+# Get the directory to output results in
 out <- Sys.getenv("out")
 
-group <- rep(c("A", "B"), each = 2)
-design <- model.matrix(~group)
-dge <- DGEList(counts=counts)
-keep <- filterByExpr(dge, design)
-dge <- dge[keep,,keep.lib.sizes=FALSE]
-dge <- calcNormFactors(dge)
-logCPM <- cpm(dge, log=TRUE, prior.count=3)
-fit <- lmFit(logCPM, design)
-fit <- eBayes(fit, trend=TRUE)
-write.table(topTable(fit, coef=ncol(design)), file = out)
+## Create the DGEList-object
+x <- readDGE(files, columns=c(1,3))
+
+## Organising sample information
+samplenames <- substring(colnames(x), 12, nchar(colnames(x)))
+
+colnames(x) <- samplenames
+group <- as.factor(c("LP", "ML", "Basal",
+                     "Basal", "ML", "LP",
+                     "Basal", "ML", "LP"))
+x$samples$group <- group
+lane <- as.factor(rep(c("L004","L006","L008"), c(3,4,2)))
+x$samples$lane <- lane
+
+## Gene annotation
+geneid <- rownames(x)
+genes <- select(Mus.musculus,
+                keys=geneid,
+                columns=c("SYMBOL", "TXCHROM"),
+                keytype="ENTREZID")
+head(genes)
+
+# Filtering and normalisation
+
+### Filtering for duplicated gene names
+genes <- genes[!duplicated(genes$ENTREZID),]
+x$genes <- genes
+
+## Filtering for lowly expressed genes
+keep.exprs <- filterByExpr(x, group=group)
+x <- x[keep.exprs,, keep.lib.sizes=FALSE]
+dim(x)
+
+## Normalising gene expression distributions
+x <- calcNormFactors(x, method = "TMM")
+
+## Unsupervisd clustering if sampples
+lcpm <- cpm(x, log=TRUE)
+
+png(file=paste0(out, "/mds.png"),width=700, height=350)
+par(mfrow=c(1,2))
+col.group <- group
+levels(col.group) <-  brewer.pal(nlevels(col.group), "Set1")
+col.group <- as.character(col.group)
+col.lane <- lane
+levels(col.lane) <-  brewer.pal(nlevels(col.lane), "Set2")
+col.lane <- as.character(col.lane)
+plotMDS(lcpm, labels=group, col=col.group)
+title(main="A. Sample groups")
+plotMDS(lcpm, labels=lane, col=col.lane, dim=c(3,4))
+title(main="B. Sequencing lanes")
+dev.off()
+
+# Differential expression analysis
+
+## Creating a design matrix and contrasts
+design <- model.matrix(~0+group+lane)
+colnames(design) <- gsub("group", "", colnames(design))
+
+contr.matrix <- makeContrasts(
+    BasalvsLP = Basal-LP,
+    BasalvsML = Basal - ML,
+    LPvsML = LP - ML,
+    levels = colnames(design))
+
+## Removing heteroscedascity from count data
+v <- voom(x, design, plot=FALSE)
+
+vfit <- lmFit(v, design)
+vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
+efit <- eBayes(vfit)
+
+## Examining the number of DE genes
+summary(decideTests(efit))
+
+### Using treat
+tfit <- treat(vfit, lfc=1)
+dt <- decideTests(tfit)
+summary(dt)
+
+### List of DE genes
+basal.vs.lp <- topTreat(tfit, coef=1, n=Inf)
+head(basal.vs.lp)
+write.csv(x = basal.vs.lp, file = paste0(out, "/DE_gene_list.csv"), quote = FALSE)
+
diff --git a/day2/ex3-R/solution.nix b/day2/ex3-R/solution.nix
index 7d40f88..406e1f3 100644
--- a/day2/ex3-R/solution.nix
+++ b/day2/ex3-R/solution.nix
@@ -1,19 +1,20 @@
 {bionix}:
 with bionix; let
-  R = pkgs.rWrapper.override {packages = with pkgs.rPackages; [edgeR];};
-
-  counts = pkgs.fetchurl {
-    url = "https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2018-September-Bioinformatics-Prerequisites/master/friday/counts.tsv";
-    sha256 = "sha256-ZmZ+vC4mKnmZKVJqbnEujDngwnSTZAxvQaZaNClUUWE=";
-  };
+  R = pkgs.rWrapper.override {packages = with pkgs.rPackages; [limma edgeR Mus_musculus RColorBrewer R_utils];};
 in
   stage {
-    inherit counts;
     name = "r-ex";
 
+    src = pkgs.fetchurl {
+      url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE63310&format=file";
+      sha256 = "sha256-jLII/e+SVNx4Fivb65WOF566fwFXFtdNLUs04S82++g=";
+    };
+
     buildInputs = [R];
 
     buildCommand = ''
+      tar xf $src
+      mkdir $out
       Rscript ${./script.R}
     '';
   }
author	Justin Bedo <cu@cua0.org>	2022-10-10 10:48:02 +1100
committer	Justin Bedo <cu@cua0.org>	2022-10-10 13:22:03 +1100
commit	059034ab3d5c40c8ef0350c440d4ae343802ec5c (patch)
tree	b510a982ff27bbabe6642ad02de362d311768da0
parent	8c29f7d34fd713d7630ef081fd8cefbb83ef201b (diff)