Introduction

This case study analyses Agilent-026652 Whole Human Genome Microarray 4x44Kv2 data downloaded from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118337 using the limma package.

This vignette was made with limma version 3.67.3.

Read sample information

> library(limma)
> Targets <- read.delim("targets.txt")
> Targets$FileName <- paste0(Targets$Accession,"_",Targets$ID,".txt.gz")
> Targets
    Accession  ID   Condition              FileName
1  GSM3325550 465    HK2_ctrl GSM3325550_465.txt.gz
2  GSM3325551 466    HK2_TGFB GSM3325551_466.txt.gz
3  GSM3325552 467    HK2_Empa GSM3325552_467.txt.gz
4  GSM3325553 468    HK2_Cana GSM3325553_468.txt.gz
5  GSM3325554 469  RPTEC_ctrl GSM3325554_469.txt.gz
6  GSM3325555 470  RPTEC_TGFB GSM3325555_470.txt.gz
7  GSM3325556 471   RPTEC_Emp GSM3325556_471.txt.gz
8  GSM3325557 472  RPTEC_Cana GSM3325557_472.txt.gz
9  GSM3325558 473   HK2_ctrl2 GSM3325558_473.txt.gz
10 GSM3325559 474   HK2_TGFB2 GSM3325559_474.txt.gz
11 GSM3325560 475   HK2_Empa2 GSM3325560_475.txt.gz
12 GSM3325561 476   HK2_Cana2 GSM3325561_476.txt.gz
13 GSM3325562 477 RPTEC_ctrl2 GSM3325562_477.txt.gz
14 GSM3325563 478 RPTEC_TGFB2 GSM3325563_478.txt.gz
15 GSM3325564 479 RPTEC_Empa2 GSM3325564_479.txt.gz
16 GSM3325565 480 RPTEC_Cana2 GSM3325565_480.txt.gz

Read probe intensities

> x <- read.maimages(Targets, source="agilent",
+      green.only=TRUE, other.columns="gIsWellAboveBG")
Read GSM3325550_465.txt.gz 
Read GSM3325551_466.txt.gz 
Read GSM3325552_467.txt.gz 
Read GSM3325553_468.txt.gz 
Read GSM3325554_469.txt.gz 
Read GSM3325555_470.txt.gz 
Read GSM3325556_471.txt.gz 
Read GSM3325557_472.txt.gz 
Read GSM3325558_473.txt.gz 
Read GSM3325559_474.txt.gz 
Read GSM3325560_475.txt.gz 
Read GSM3325561_476.txt.gz 
Read GSM3325562_477.txt.gz 
Read GSM3325563_478.txt.gz 
Read GSM3325564_479.txt.gz 
Read GSM3325565_480.txt.gz 
> colnames(x) <- Targets$Condition
> x$targets <- Targets
> dim(x)
[1] 44495    16

Background correct and normalize

> y <- backgroundCorrect(x, method="normexp")
Array 1 corrected
Array 2 corrected
Array 3 corrected
Array 4 corrected
Array 5 corrected
Array 6 corrected
Array 7 corrected
Array 8 corrected
Array 9 corrected
Array 10 corrected
Array 11 corrected
Array 12 corrected
Array 13 corrected
Array 14 corrected
Array 15 corrected
Array 16 corrected
> y <- normalizeBetweenArrays(y, method="quantile")

Gene filtering

We will filter out control probes as indicated by the ControlType column:

> NotControl <- y$genes$ControlType==0L

We also filter probes that don’t appear to be expressed. We keep probes that are above background on at least two arrays (because there are two replicates of each condition):

> IsExpr <- rowSums(y$other$gIsWellAboveBG > 0) >= 2

Now we select the probes to keep in a new data object yfilt:

> yfilt <- y[NotControl & IsExpr, ]
> dim(yfilt)
[1] 28680    16

To be tidy, we remove annotation columns we no longer need:

> yfilt$genes <- yfilt$genes[,c("ProbeName","GeneName","SystematicName")]
> head(yfilt$genes)
      ProbeName        GeneName  SystematicName
13  A_23_P42935            BRAF       NM_004333
14 A_23_P117082           HEBP1       NM_015987
15   A_23_P2683           RPAP3       NM_024604
16 A_24_P358131 ENST00000404956 ENST00000404956
19  A_32_P14850         NPIPB15 ENST00000429990
20 A_23_P158596          AGTRAP    NM_001040196

Data exploration

> plotMDS(yfilt)

Sample 466 is a spectacular outlier, so we will remove it.

> yfilt <- yfilt[,-2]
> plotMDS(yfilt)

The two cell lines are so different that I am inclined to analyse them separately:

> s <- strsplit2(yfilt$targets$Condition, split="_")
> yfilt$targets$CellLine <- s[,1]
> yfilt$targets$Treat <- s[,2]
> y.HK2 <- yfilt[,yfilt$targets$CellLine=="HK2"] 
> plotMDS(y.HK2, main="HK2", label=y.HK2$targets$Treat)

> y.RPTEC <- yfilt[,yfilt$targets$CellLine=="RPTEC"] 
> plotMDS(y.RPTEC, main="RPTEC", label=y.RPTEC$targets$Treat)

There is a strong replicate effect for both cell lines, but especially for HK2.

Differential expression for HK2

> Replicate <- factor(c(1,1,1,2,2,2,2))
> Treat <- factor(c(1,3,4,1,2,3,4))
> levels(Treat) <- c("ctrl","TBFB","Empa","Cana")
> design <- model.matrix(~Treat+Replicate)
> fit.HK2 <- lmFit(y.HK2,design)
> fit.HK2 <- eBayes(fit.HK2,trend=TRUE,robust=TRUE)
> plotSA(fit.HK2)

> summary(decideTests(fit.HK2))
       (Intercept) TreatTBFB TreatEmpa TreatCana Replicate2
Down             0       963       602       634       5627
NotSig           4     26729     27552     27522      14768
Up           28676       988       526       524       8285
> topTable(fit.HK2,coef="TreatTBFB",n=30)
          ProbeName GeneName SystematicName  logFC AveExpr      t   P.Value adj.P.Val      B
9351  A_33_P3252286    CRLF1      NM_004750  5.829   7.505  33.89 1.433e-09 2.492e-05 11.527
1875   A_24_P413126   PMEPA1      NM_020182  4.567   8.848  32.05 2.182e-09 2.492e-05 11.284
16125  A_23_P122924    INHBA      NM_002192  4.727   7.994  30.20 3.421e-09 2.492e-05 11.009
20973  A_23_P122924    INHBA      NM_002192  4.757   8.025  29.79 3.795e-09 2.492e-05 10.944
27759  A_23_P122924    INHBA      NM_002192  4.729   7.955  28.74 4.978e-09 2.492e-05 10.769
43023  A_23_P122924    INHBA      NM_002192  4.709   7.649  28.03 6.011e-09 2.492e-05 10.644
10056  A_23_P122924    INHBA      NM_002192  4.605   7.926  27.98 6.081e-09 2.492e-05 10.636
25227  A_23_P122924    INHBA      NM_002192  4.844   8.172  26.91 8.169e-09 2.650e-05 10.436
39212  A_23_P122924    INHBA      NM_002192  4.787   7.614  26.81 8.398e-09 2.650e-05 10.417
21019 A_33_P3243887     IL11      NM_000641  5.986   7.217  26.47 9.241e-09 2.650e-05 10.350
26437  A_23_P122924    INHBA      NM_002192  4.681   7.928  25.94 1.075e-08 2.802e-05 10.244
30991  A_32_P351968  HLA-DMB      NM_002118 -3.582  11.732 -25.64 1.174e-08 2.805e-05 10.181
3696  A_33_P3403867   PMEPA1      NM_020182  4.568   8.301  25.27 1.311e-08 2.892e-05 10.102
29224  A_23_P122924    INHBA      NM_002192  4.860   8.105  24.91 1.459e-08 2.934e-05 10.024
779    A_23_P122924    INHBA      NM_002192  4.848   7.642  24.74 1.535e-08 2.934e-05  9.987
1920   A_23_P156327    TGFBI      NM_000358  2.892  13.297  23.62 2.174e-08 3.897e-05  9.728
22871 A_33_P3283833    FOXS1      NM_004118  4.535   6.987  21.72 4.075e-08 6.874e-05  9.240
17330  A_24_P330303    FRMD6   NM_001042481  4.102   8.471  21.39 4.570e-08 7.281e-05  9.148
12211 A_33_P3400578      HLF      NM_002126 -3.698   8.356 -20.79 5.673e-08 8.564e-05  8.973
39537  A_23_P210763     JAG1      NM_000214  3.485   7.481  19.41 9.457e-08 1.305e-04  8.549
32250   A_23_P94501    ANXA1      NM_000700 -2.575  11.125 -19.38 9.558e-08 1.305e-04  8.540
5572    A_23_P53193    SYTL2      NM_032943 -2.866   8.893 -18.99 1.113e-07 1.451e-04  8.410
24690  A_23_P374082   ADAM19      NM_033274  3.015   7.735  18.33 1.451e-07 1.735e-04  8.182
28793   A_23_P80008    MYLK2      NM_033118  3.055   7.774  18.32 1.455e-07 1.735e-04  8.179
20905  A_23_P147711     NPR1      NM_000906 -2.808   8.076 -18.23 1.512e-07 1.735e-04  8.146
21557  A_24_P239606  GADD45B      NM_015675  2.354  11.306  17.87 1.750e-07 1.930e-04  8.018
40318  A_23_P431388   SPOCD1      NM_144569  3.012   8.667  17.66 1.911e-07 2.030e-04  7.941
28648  A_24_P759477    ITGB8      NM_002214 -3.110   8.285 -17.55 2.003e-07 2.051e-04  7.899
18031  A_24_P120251  TM4SF18      NM_138786 -2.441  10.213 -17.38 2.159e-07 2.135e-04  7.833
5212   A_23_P206280    GPR56      NM_201525  2.898  10.868  17.24 2.287e-07 2.186e-04  7.781

Each coefficient compares that treatment to the control.

Differential expression for RPTEC

> Replicate <- factor(c(1,1,1,1,2,2,2,2))
> Treat <- factor(c(1,2,3,4,1,2,3,4))
> levels(Treat) <- c("ctrl","TBFB","Empa","Cana")
> design <- model.matrix(~Treat+Replicate)
> fit.RPTEC <- lmFit(y.RPTEC,design)
> fit.RPTEC <- eBayes(fit.RPTEC,trend=TRUE,robust=TRUE)
> plotSA(fit.RPTEC)

> summary(decideTests(fit.RPTEC))
       (Intercept) TreatTBFB TreatEmpa TreatCana Replicate2
Down             0        58       662       562       6450
NotSig           2     28285     27580     27712      17083
Up           28678       337       438       406       5147
> topTable(fit.RPTEC,coef="TreatTBFB",n=30)
          ProbeName      GeneName SystematicName  logFC AveExpr      t   P.Value adj.P.Val     B
26092  A_23_P215634        IGFBP3   NM_001013398 1.6180  10.939 15.245 1.043e-07  0.001744 7.661
8691   A_23_P379649           BMF   NM_001003940 1.6676   7.315 14.608 1.509e-07  0.001744 7.401
26223 A_33_P3293009        NKAIN4      NM_152864 1.8552  11.117 14.511 1.825e-07  0.001744 7.241
21483  A_23_P215634        IGFBP3   NM_001013398 1.5849  10.941 12.899 4.386e-07  0.001945 6.607
19438   A_23_P40174          MMP9      NM_004994 2.3559   6.193 12.955 4.437e-07  0.001945 6.592
34760  A_23_P144071        COL7A1      NM_000094 1.6681   8.646 12.564 5.490e-07  0.001945 6.432
27207  A_23_P215634        IGFBP3   NM_001013398 1.5553  11.184 12.489 6.135e-07  0.001945 6.339
14484  A_23_P215634        IGFBP3   NM_001013398 1.5556  10.996 12.438 6.309e-07  0.001945 6.317
25177   A_23_P87011         TAGLN   NM_001001522 1.6685   8.307 12.172 7.184e-07  0.001945 6.220
27759  A_23_P122924         INHBA      NM_002192 2.7901   5.466 12.232 7.736e-07  0.001945 6.150
9741   A_23_P215634        IGFBP3   NM_001013398 1.5742  10.896 12.205 7.882e-07  0.001945 6.135
37537  A_23_P215634        IGFBP3   NM_001013398 1.5864  10.637 12.158 8.136e-07  0.001945 6.110
30457   A_23_P40174          MMP9      NM_004994 2.3105   6.118 11.755 1.186e-06  0.002125 5.799
41798   A_23_P17316        NKAIN4      NM_152864 1.9709   6.893 11.561 1.239e-06  0.002125 5.771
11805  A_23_P215634        IGFBP3   NM_001013398 1.6029  10.831 11.827 1.244e-06  0.002125 5.753
41624   A_23_P40174          MMP9      NM_004994 1.8649   5.978 11.337 1.310e-06  0.002125 5.733
42847   A_23_P40174          MMP9      NM_004994 2.0354   6.053 11.317 1.330e-06  0.002125 5.720
39212  A_23_P122924         INHBA      NM_002192 2.6029   5.292 11.292 1.354e-06  0.002125 5.705
7566   A_23_P215634        IGFBP3   NM_001013398 1.6006  10.750 11.696 1.408e-06  0.002125 5.650
26501  A_23_P215634        IGFBP3   NM_001013398 1.5501  10.981 11.477 1.622e-06  0.002327 5.536
1680    A_23_P52761          MMP7      NM_002423 0.9399  12.963 10.853 1.890e-06  0.002404 5.427
4574  A_33_P3370787         EPHB2      NM_004442 1.0955   8.672 10.794 1.978e-06  0.002404 5.389
2401    A_23_P40174          MMP9      NM_004994 1.9412   6.086 10.779 2.001e-06  0.002404 5.379
29224  A_23_P122924         INHBA      NM_002192 2.8105   5.695 11.701 2.012e-06  0.002404 5.333
28299 A_33_P3880302         EPHB2      NM_004442 1.3146   8.803 10.295 2.935e-06  0.003367 5.053
8967   A_23_P215634        IGFBP3   NM_001013398 1.6133  10.696 10.992 3.164e-06  0.003463 4.966
33217 A_33_P3349597 A_33_P3349597  A_33_P3349597 1.0771   9.906 10.141 3.326e-06  0.003463 4.945
19481 A_33_P3330991  LOC100134237      XR_248772 1.6406   7.624 10.263 3.381e-06  0.003463 4.929
28774   A_23_P71037           IL6      NM_000600 0.8926  10.266  9.978 3.805e-06  0.003763 4.828
12418 A_33_P3409854          LHX1      NM_005568 1.2507  11.262  9.948 4.187e-06  0.004002 4.744

The HK2 and RPTEC results are correlated:

> cor(fit.HK2$coef[,2:4], fit.RPTEC$coef[,2:4])
          TreatTBFB TreatEmpa TreatCana
TreatTBFB   0.34294  -0.03873  -0.06799
TreatEmpa  -0.08158   0.17789   0.16463
TreatCana  -0.06694   0.17505   0.15339

Further reading

More limma documentation and example analyses are available from https://github.com/SmythLab/limma/.