aboutsummaryrefslogtreecommitdiff
path: root/day2/ex3-R/script.R
blob: d25a42e916ddb6a256875de54a8135f97ddffe60 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Adapted from https://www.bioconductor.org/packages/devel/workflows/vignettes/RNAseq123/inst/doc/limmaWorkflow.html
# with lots of help from Hannah Coughlan

# Loading packages
library(limma)
library(edgeR)
library(Mus.musculus)
library(RColorBrewer)

# Data packaging and annotation
# The original example downloads directly from the internet, something that
# is forbidden in a Nix build to ensure reproducibility. Instead, we assume
# the tarball has already been fetched and unpacked, and the contents are
# available in the working directory
files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt", "GSM1545538_purep53.txt",
           "GSM1545539_JMS8-2.txt", "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt",
           "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt", "GSM1545545_JMS9-P8c.txt")
for(i in paste(files, ".gz", sep=""))
    R.utils::gunzip(i, overwrite=TRUE)

# Get the directory to output results in
out <- Sys.getenv("out")

## Create the DGEList-object
x <- readDGE(files, columns=c(1,3))

## Organising sample information
samplenames <- substring(colnames(x), 12, nchar(colnames(x)))

colnames(x) <- samplenames
group <- as.factor(c("LP", "ML", "Basal",
                     "Basal", "ML", "LP",
                     "Basal", "ML", "LP"))
x$samples$group <- group
lane <- as.factor(rep(c("L004","L006","L008"), c(3,4,2)))
x$samples$lane <- lane

## Gene annotation
geneid <- rownames(x)
genes <- select(Mus.musculus,
                keys=geneid,
                columns=c("SYMBOL", "TXCHROM"),
                keytype="ENTREZID")
head(genes)

# Filtering and normalisation

### Filtering for duplicated gene names
genes <- genes[!duplicated(genes$ENTREZID),]
x$genes <- genes

## Filtering for lowly expressed genes
keep.exprs <- filterByExpr(x, group=group)
x <- x[keep.exprs,, keep.lib.sizes=FALSE]
dim(x)

## Normalising gene expression distributions
x <- calcNormFactors(x, method = "TMM")

## Unsupervisd clustering if sampples
lcpm <- cpm(x, log=TRUE)

png(file=paste0(out, "/mds.png"),width=700, height=350)
par(mfrow=c(1,2))
col.group <- group
levels(col.group) <-  brewer.pal(nlevels(col.group), "Set1")
col.group <- as.character(col.group)
col.lane <- lane
levels(col.lane) <-  brewer.pal(nlevels(col.lane), "Set2")
col.lane <- as.character(col.lane)
plotMDS(lcpm, labels=group, col=col.group)
title(main="A. Sample groups")
plotMDS(lcpm, labels=lane, col=col.lane, dim=c(3,4))
title(main="B. Sequencing lanes")
dev.off()

# Differential expression analysis

## Creating a design matrix and contrasts
design <- model.matrix(~0+group+lane)
colnames(design) <- gsub("group", "", colnames(design))

contr.matrix <- makeContrasts(
    BasalvsLP = Basal-LP,
    BasalvsML = Basal - ML,
    LPvsML = LP - ML,
    levels = colnames(design))

## Removing heteroscedascity from count data
v <- voom(x, design, plot=FALSE)

vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)

## Examining the number of DE genes
summary(decideTests(efit))

### Using treat
tfit <- treat(vfit, lfc=1)
dt <- decideTests(tfit)
summary(dt)

### List of DE genes
basal.vs.lp <- topTreat(tfit, coef=1, n=Inf)
head(basal.vs.lp)
write.csv(x = basal.vs.lp, file = paste0(out, "/DE_gene_list.csv"), quote = FALSE)