This case study analyses Agilent-026652 Whole Human Genome Microarray 4x44Kv2 data downloaded from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118337 using the limma package.
This vignette was made with limma version 3.67.3.
> library(limma)
> Targets <- read.delim("targets.txt")
> Targets$FileName <- paste0(Targets$Accession,"_",Targets$ID,".txt.gz")
> Targets
Accession ID Condition FileName
1 GSM3325550 465 HK2_ctrl GSM3325550_465.txt.gz
2 GSM3325551 466 HK2_TGFB GSM3325551_466.txt.gz
3 GSM3325552 467 HK2_Empa GSM3325552_467.txt.gz
4 GSM3325553 468 HK2_Cana GSM3325553_468.txt.gz
5 GSM3325554 469 RPTEC_ctrl GSM3325554_469.txt.gz
6 GSM3325555 470 RPTEC_TGFB GSM3325555_470.txt.gz
7 GSM3325556 471 RPTEC_Emp GSM3325556_471.txt.gz
8 GSM3325557 472 RPTEC_Cana GSM3325557_472.txt.gz
9 GSM3325558 473 HK2_ctrl2 GSM3325558_473.txt.gz
10 GSM3325559 474 HK2_TGFB2 GSM3325559_474.txt.gz
11 GSM3325560 475 HK2_Empa2 GSM3325560_475.txt.gz
12 GSM3325561 476 HK2_Cana2 GSM3325561_476.txt.gz
13 GSM3325562 477 RPTEC_ctrl2 GSM3325562_477.txt.gz
14 GSM3325563 478 RPTEC_TGFB2 GSM3325563_478.txt.gz
15 GSM3325564 479 RPTEC_Empa2 GSM3325564_479.txt.gz
16 GSM3325565 480 RPTEC_Cana2 GSM3325565_480.txt.gz
> x <- read.maimages(Targets, source="agilent",
+ green.only=TRUE, other.columns="gIsWellAboveBG")
Read GSM3325550_465.txt.gz
Read GSM3325551_466.txt.gz
Read GSM3325552_467.txt.gz
Read GSM3325553_468.txt.gz
Read GSM3325554_469.txt.gz
Read GSM3325555_470.txt.gz
Read GSM3325556_471.txt.gz
Read GSM3325557_472.txt.gz
Read GSM3325558_473.txt.gz
Read GSM3325559_474.txt.gz
Read GSM3325560_475.txt.gz
Read GSM3325561_476.txt.gz
Read GSM3325562_477.txt.gz
Read GSM3325563_478.txt.gz
Read GSM3325564_479.txt.gz
Read GSM3325565_480.txt.gz
> colnames(x) <- Targets$Condition
> x$targets <- Targets
> dim(x)
[1] 44495 16
> y <- backgroundCorrect(x, method="normexp")
Array 1 corrected
Array 2 corrected
Array 3 corrected
Array 4 corrected
Array 5 corrected
Array 6 corrected
Array 7 corrected
Array 8 corrected
Array 9 corrected
Array 10 corrected
Array 11 corrected
Array 12 corrected
Array 13 corrected
Array 14 corrected
Array 15 corrected
Array 16 corrected
> y <- normalizeBetweenArrays(y, method="quantile")
We will filter out control probes as indicated by the ControlType column:
> NotControl <- y$genes$ControlType==0L
We also filter probes that don’t appear to be expressed. We keep probes that are above background on at least two arrays (because there are two replicates of each condition):
> IsExpr <- rowSums(y$other$gIsWellAboveBG > 0) >= 2
Now we select the probes to keep in a new data object yfilt:
> yfilt <- y[NotControl & IsExpr, ]
> dim(yfilt)
[1] 28680 16
To be tidy, we remove annotation columns we no longer need:
> yfilt$genes <- yfilt$genes[,c("ProbeName","GeneName","SystematicName")]
> head(yfilt$genes)
ProbeName GeneName SystematicName
13 A_23_P42935 BRAF NM_004333
14 A_23_P117082 HEBP1 NM_015987
15 A_23_P2683 RPAP3 NM_024604
16 A_24_P358131 ENST00000404956 ENST00000404956
19 A_32_P14850 NPIPB15 ENST00000429990
20 A_23_P158596 AGTRAP NM_001040196
> plotMDS(yfilt)
Sample 466 is a spectacular outlier, so we will remove it.
> yfilt <- yfilt[,-2]
> plotMDS(yfilt)
The two cell lines are so different that I am inclined to analyse them separately:
> s <- strsplit2(yfilt$targets$Condition, split="_")
> yfilt$targets$CellLine <- s[,1]
> yfilt$targets$Treat <- s[,2]
> y.HK2 <- yfilt[,yfilt$targets$CellLine=="HK2"]
> plotMDS(y.HK2, main="HK2", label=y.HK2$targets$Treat)
> y.RPTEC <- yfilt[,yfilt$targets$CellLine=="RPTEC"]
> plotMDS(y.RPTEC, main="RPTEC", label=y.RPTEC$targets$Treat)
There is a strong replicate effect for both cell lines, but especially for HK2.
> Replicate <- factor(c(1,1,1,2,2,2,2))
> Treat <- factor(c(1,3,4,1,2,3,4))
> levels(Treat) <- c("ctrl","TBFB","Empa","Cana")
> design <- model.matrix(~Treat+Replicate)
> fit.HK2 <- lmFit(y.HK2,design)
> fit.HK2 <- eBayes(fit.HK2,trend=TRUE,robust=TRUE)
> plotSA(fit.HK2)
> summary(decideTests(fit.HK2))
(Intercept) TreatTBFB TreatEmpa TreatCana Replicate2
Down 0 963 602 634 5627
NotSig 4 26729 27552 27522 14768
Up 28676 988 526 524 8285
> topTable(fit.HK2,coef="TreatTBFB",n=30)
ProbeName GeneName SystematicName logFC AveExpr t P.Value adj.P.Val B
9351 A_33_P3252286 CRLF1 NM_004750 5.829 7.505 33.89 1.433e-09 2.492e-05 11.527
1875 A_24_P413126 PMEPA1 NM_020182 4.567 8.848 32.05 2.182e-09 2.492e-05 11.284
16125 A_23_P122924 INHBA NM_002192 4.727 7.994 30.20 3.421e-09 2.492e-05 11.009
20973 A_23_P122924 INHBA NM_002192 4.757 8.025 29.79 3.795e-09 2.492e-05 10.944
27759 A_23_P122924 INHBA NM_002192 4.729 7.955 28.74 4.978e-09 2.492e-05 10.769
43023 A_23_P122924 INHBA NM_002192 4.709 7.649 28.03 6.011e-09 2.492e-05 10.644
10056 A_23_P122924 INHBA NM_002192 4.605 7.926 27.98 6.081e-09 2.492e-05 10.636
25227 A_23_P122924 INHBA NM_002192 4.844 8.172 26.91 8.169e-09 2.650e-05 10.436
39212 A_23_P122924 INHBA NM_002192 4.787 7.614 26.81 8.398e-09 2.650e-05 10.417
21019 A_33_P3243887 IL11 NM_000641 5.986 7.217 26.47 9.241e-09 2.650e-05 10.350
26437 A_23_P122924 INHBA NM_002192 4.681 7.928 25.94 1.075e-08 2.802e-05 10.244
30991 A_32_P351968 HLA-DMB NM_002118 -3.582 11.732 -25.64 1.174e-08 2.805e-05 10.181
3696 A_33_P3403867 PMEPA1 NM_020182 4.568 8.301 25.27 1.311e-08 2.892e-05 10.102
29224 A_23_P122924 INHBA NM_002192 4.860 8.105 24.91 1.459e-08 2.934e-05 10.024
779 A_23_P122924 INHBA NM_002192 4.848 7.642 24.74 1.535e-08 2.934e-05 9.987
1920 A_23_P156327 TGFBI NM_000358 2.892 13.297 23.62 2.174e-08 3.897e-05 9.728
22871 A_33_P3283833 FOXS1 NM_004118 4.535 6.987 21.72 4.075e-08 6.874e-05 9.240
17330 A_24_P330303 FRMD6 NM_001042481 4.102 8.471 21.39 4.570e-08 7.281e-05 9.148
12211 A_33_P3400578 HLF NM_002126 -3.698 8.356 -20.79 5.673e-08 8.564e-05 8.973
39537 A_23_P210763 JAG1 NM_000214 3.485 7.481 19.41 9.457e-08 1.305e-04 8.549
32250 A_23_P94501 ANXA1 NM_000700 -2.575 11.125 -19.38 9.558e-08 1.305e-04 8.540
5572 A_23_P53193 SYTL2 NM_032943 -2.866 8.893 -18.99 1.113e-07 1.451e-04 8.410
24690 A_23_P374082 ADAM19 NM_033274 3.015 7.735 18.33 1.451e-07 1.735e-04 8.182
28793 A_23_P80008 MYLK2 NM_033118 3.055 7.774 18.32 1.455e-07 1.735e-04 8.179
20905 A_23_P147711 NPR1 NM_000906 -2.808 8.076 -18.23 1.512e-07 1.735e-04 8.146
21557 A_24_P239606 GADD45B NM_015675 2.354 11.306 17.87 1.750e-07 1.930e-04 8.018
40318 A_23_P431388 SPOCD1 NM_144569 3.012 8.667 17.66 1.911e-07 2.030e-04 7.941
28648 A_24_P759477 ITGB8 NM_002214 -3.110 8.285 -17.55 2.003e-07 2.051e-04 7.899
18031 A_24_P120251 TM4SF18 NM_138786 -2.441 10.213 -17.38 2.159e-07 2.135e-04 7.833
5212 A_23_P206280 GPR56 NM_201525 2.898 10.868 17.24 2.287e-07 2.186e-04 7.781
Each coefficient compares that treatment to the control.
> Replicate <- factor(c(1,1,1,1,2,2,2,2))
> Treat <- factor(c(1,2,3,4,1,2,3,4))
> levels(Treat) <- c("ctrl","TBFB","Empa","Cana")
> design <- model.matrix(~Treat+Replicate)
> fit.RPTEC <- lmFit(y.RPTEC,design)
> fit.RPTEC <- eBayes(fit.RPTEC,trend=TRUE,robust=TRUE)
> plotSA(fit.RPTEC)
> summary(decideTests(fit.RPTEC))
(Intercept) TreatTBFB TreatEmpa TreatCana Replicate2
Down 0 58 662 562 6450
NotSig 2 28285 27580 27712 17083
Up 28678 337 438 406 5147
> topTable(fit.RPTEC,coef="TreatTBFB",n=30)
ProbeName GeneName SystematicName logFC AveExpr t P.Value adj.P.Val B
26092 A_23_P215634 IGFBP3 NM_001013398 1.6180 10.939 15.245 1.043e-07 0.001744 7.661
8691 A_23_P379649 BMF NM_001003940 1.6676 7.315 14.608 1.509e-07 0.001744 7.401
26223 A_33_P3293009 NKAIN4 NM_152864 1.8552 11.117 14.511 1.825e-07 0.001744 7.241
21483 A_23_P215634 IGFBP3 NM_001013398 1.5849 10.941 12.899 4.386e-07 0.001945 6.607
19438 A_23_P40174 MMP9 NM_004994 2.3559 6.193 12.955 4.437e-07 0.001945 6.592
34760 A_23_P144071 COL7A1 NM_000094 1.6681 8.646 12.564 5.490e-07 0.001945 6.432
27207 A_23_P215634 IGFBP3 NM_001013398 1.5553 11.184 12.489 6.135e-07 0.001945 6.339
14484 A_23_P215634 IGFBP3 NM_001013398 1.5556 10.996 12.438 6.309e-07 0.001945 6.317
25177 A_23_P87011 TAGLN NM_001001522 1.6685 8.307 12.172 7.184e-07 0.001945 6.220
27759 A_23_P122924 INHBA NM_002192 2.7901 5.466 12.232 7.736e-07 0.001945 6.150
9741 A_23_P215634 IGFBP3 NM_001013398 1.5742 10.896 12.205 7.882e-07 0.001945 6.135
37537 A_23_P215634 IGFBP3 NM_001013398 1.5864 10.637 12.158 8.136e-07 0.001945 6.110
30457 A_23_P40174 MMP9 NM_004994 2.3105 6.118 11.755 1.186e-06 0.002125 5.799
41798 A_23_P17316 NKAIN4 NM_152864 1.9709 6.893 11.561 1.239e-06 0.002125 5.771
11805 A_23_P215634 IGFBP3 NM_001013398 1.6029 10.831 11.827 1.244e-06 0.002125 5.753
41624 A_23_P40174 MMP9 NM_004994 1.8649 5.978 11.337 1.310e-06 0.002125 5.733
42847 A_23_P40174 MMP9 NM_004994 2.0354 6.053 11.317 1.330e-06 0.002125 5.720
39212 A_23_P122924 INHBA NM_002192 2.6029 5.292 11.292 1.354e-06 0.002125 5.705
7566 A_23_P215634 IGFBP3 NM_001013398 1.6006 10.750 11.696 1.408e-06 0.002125 5.650
26501 A_23_P215634 IGFBP3 NM_001013398 1.5501 10.981 11.477 1.622e-06 0.002327 5.536
1680 A_23_P52761 MMP7 NM_002423 0.9399 12.963 10.853 1.890e-06 0.002404 5.427
4574 A_33_P3370787 EPHB2 NM_004442 1.0955 8.672 10.794 1.978e-06 0.002404 5.389
2401 A_23_P40174 MMP9 NM_004994 1.9412 6.086 10.779 2.001e-06 0.002404 5.379
29224 A_23_P122924 INHBA NM_002192 2.8105 5.695 11.701 2.012e-06 0.002404 5.333
28299 A_33_P3880302 EPHB2 NM_004442 1.3146 8.803 10.295 2.935e-06 0.003367 5.053
8967 A_23_P215634 IGFBP3 NM_001013398 1.6133 10.696 10.992 3.164e-06 0.003463 4.966
33217 A_33_P3349597 A_33_P3349597 A_33_P3349597 1.0771 9.906 10.141 3.326e-06 0.003463 4.945
19481 A_33_P3330991 LOC100134237 XR_248772 1.6406 7.624 10.263 3.381e-06 0.003463 4.929
28774 A_23_P71037 IL6 NM_000600 0.8926 10.266 9.978 3.805e-06 0.003763 4.828
12418 A_33_P3409854 LHX1 NM_005568 1.2507 11.262 9.948 4.187e-06 0.004002 4.744
The HK2 and RPTEC results are correlated:
> cor(fit.HK2$coef[,2:4], fit.RPTEC$coef[,2:4])
TreatTBFB TreatEmpa TreatCana
TreatTBFB 0.34294 -0.03873 -0.06799
TreatEmpa -0.08158 0.17789 0.16463
TreatCana -0.06694 0.17505 0.15339
More limma documentation and example analyses are available from https://github.com/SmythLab/limma/.