ICGC-CLL cohort with RNAseq data (EGAS00001000374)
Pre-processing data
Download ICGC dataset from EMBL server
(The raw sequencing, genomic and outcome data can be queried from ICGC portatl: https://icgc.org/icgc/cgp/64/530/826)
icgcPath <- file.path(tempdir(),"rnaICGC.RData")
if(!file.exists(icgcPath)) {
download.file("https://www.huber.embl.de/users/jlu/data/rnaICGC.RData",
icgcPath)
load(icgcPath)
} else {
load(icgcPath)
}
Load and process ICGC expression dataset
rnaExt.vst <- varianceStabilizingTransformation(rnaExternal)
#Adjusted for potential batch effect
exprMat <- assay(rnaExt.vst)
batch <- as.factor(rnaExt.vst$raw_data_accession)
exprMat.combat <- sva::ComBat(exprMat, batch = batch)
## Found 426 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
assay(rnaExt.vst) <- exprMat.combat
Load and process internal RNAseq dataset
rna$factor <- facTab[match(colnames(rna), facTab$sample),]$value
rna <- rna[,!is.na(rna$factor)] #use samples with LF4 values
rna.vst <- varianceStabilizingTransformation(rna)
Filter out low variant genes in both datasets
exprMat <- assay(rna.vst)
exprMat.ext <- assay(rnaExt.vst)
#icgc dataset
sds <- genefilter::rowSds(exprMat.ext)
exprMat.ext <- exprMat.ext[order(sds,decreasing = TRUE)[1:10000],]
#internal dataset
sds <- genefilter::rowSds(exprMat)
exprMat <- exprMat[order(sds,decreasing = TRUE)[1:10000],]
#subset to keep common genes
overGene <- intersect(rownames(exprMat), rownames(exprMat.ext))
exprMat <- exprMat[overGene,]
exprMat.ext <- exprMat.ext[overGene,]
Remove highly correlated gene in training data
X <- mscale(exprMat)
#remove highly correlated genes
reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced
Test assocation with clincial outcome
Prepare outcome table based on patient annotations
survT <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column("sampleID") %>%
mutate(predLF = y.pred[sampleID]) %>%
dplyr::select(sampleID, treatedAfter, TTT, predLF, OS, died) %>%
dplyr::rename(patientID = sampleID) %>%
filter(!(is.na(TTT) & is.na(OS))) #remove samples with none outcome avaialble
#how many samples
nrow(survT)
## [1] 249
Prepare risk factor table
riskTab <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column("sampleID") %>%
dplyr::select(sampleID, IGHV, TP53, del17p, age, sex, SF3B1, NOTCH1) %>%
mutate(`TP53.del17p` = TP53 | del17p, age = age/10) %>%
mutate_if(is.logical, as.numeric) %>%
select(-TP53, -del17p) %>%
dplyr::rename(patientID = sampleID) %>%
mutate(LF4 = y.pred[patientID], IGHV =factor(IGHV, levels = c("U","M")))
Univariate test
Overall survival
#calculate p value for cox regression
pOS <- comSurv(survT$predLF, survT$OS, survT$died)
kmOS <- km(survT$predLF, survT$OS, survT$died, "Overall survival (ICGC-CLL)",
stat = "maxstat",pval = pOS$p, showTable = TRUE)
kmOS

Time to treatment
pTTT <- comSurv(survT$predLF, survT$TTT, survT$treatedAfter)
kmTTT <- km(survT$predLF, survT$TTT, survT$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "maxstat", pval = pTTT$p, showTable = TRUE)
kmTTT

Prepare a summary table
sumOutcome <- bind_rows(mutate(pTTT, outcome = "TTT"),mutate(pOS,outcome = "OS")) %>% mutate(cohort = "ICGC-CLL", n=nrow(survT))
Combine IGHV and CLL-PD
KM plot for subgroup defined by IGHV status and median latent factor values
groupTab <- survT %>% mutate(IGHV = rnaExternal[,patientID]$IGHV) %>%
filter(!is.na(IGHV)) %>%
mutate(group = ifelse(predLF > median(predLF),"highPD","lowPD")) %>%
mutate(subgroup = paste0(IGHV,"_",group))
groupList <- list()
# TTT
groupList[["TTT"]] <- km(groupTab$subgroup, groupTab$TTT, groupTab$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5)
# OS
groupList[["OS"]] <- km(groupTab$subgroup, groupTab$OS, groupTab$died, "Overall survival (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5)
grid.arrange(grobs = groupList, ncol = 2)

Multivariate test
OS
surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD= LF4, IGHV_mutated = IGHV, Sex_male=sex, Age=age), "OS", "died")
summary(surv1)
## Call:
## coxph(formula = Surv(time, endpoint) ~ ., data = testTab)
##
## n= 247, number of events= 81
## (6 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## IGHV_mutatedM -0.988694 0.372062 0.238190 -4.151 3.31e-05 ***
## Age 0.716112 2.046461 0.125042 5.727 1.02e-08 ***
## Sex_malem 0.335878 1.399168 0.263964 1.272 0.2032
## SF3B1 0.215390 1.240346 0.375213 0.574 0.5659
## NOTCH1 -0.003541 0.996465 0.310940 -0.011 0.9909
## TP53.del17p 0.196096 1.216643 0.444733 0.441 0.6593
## CLLPD 0.644055 1.904186 0.258668 2.490 0.0128 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## IGHV_mutatedM 0.3721 2.6877 0.2333 0.5934
## Age 2.0465 0.4886 1.6016 2.6148
## Sex_malem 1.3992 0.7147 0.8340 2.3472
## SF3B1 1.2403 0.8062 0.5945 2.5878
## NOTCH1 0.9965 1.0035 0.5417 1.8329
## TP53.del17p 1.2166 0.8219 0.5089 2.9088
## CLLPD 1.9042 0.5252 1.1469 3.1615
##
## Concordance= 0.781 (se = 0.027 )
## Likelihood ratio test= 71.14 on 7 df, p=9e-13
## Wald test = 67.1 on 7 df, p=6e-12
## Score (logrank) test = 70.23 on 7 df, p=1e-12
haOS <- plotHazard(surv1, title = "OS (ICGC-CLL)") +
scale_y_log10(limits = c(0.1,5))
haOS

Time to treatment
surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD= LF4, IGHV_mutated = IGHV, Sex_male=sex, Age=age), "TTT", "treatedAfter")
summary(surv1)
## Call:
## coxph(formula = Surv(time, endpoint) ~ ., data = testTab)
##
## n= 233, number of events= 127
## (20 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## IGHV_mutatedM -1.41100 0.24390 0.19162 -7.364 1.79e-13 ***
## Age -0.33373 0.71625 0.07811 -4.273 1.93e-05 ***
## Sex_malem 0.15982 1.17331 0.19901 0.803 0.42191
## SF3B1 0.77086 2.16162 0.27490 2.804 0.00504 **
## NOTCH1 0.32928 1.38997 0.25965 1.268 0.20474
## TP53.del17p 0.05686 1.05850 0.35661 0.159 0.87332
## CLLPD 0.56004 1.75074 0.19324 2.898 0.00375 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## IGHV_mutatedM 0.2439 4.1001 0.1675 0.3551
## Age 0.7162 1.3962 0.6146 0.8347
## Sex_malem 1.1733 0.8523 0.7944 1.7330
## SF3B1 2.1616 0.4626 1.2612 3.7049
## NOTCH1 1.3900 0.7194 0.8356 2.3122
## TP53.del17p 1.0585 0.9447 0.5262 2.1293
## CLLPD 1.7507 0.5712 1.1988 2.5568
##
## Concordance= 0.738 (se = 0.02 )
## Likelihood ratio test= 98.86 on 7 df, p=<2e-16
## Wald test = 99.28 on 7 df, p=<2e-16
## Score (logrank) test = 115.1 on 7 df, p=<2e-16
haTTT <- plotHazard(surv1, title = "TTT (ICGC-CLL)") +
scale_y_log10(limits = c(0.1,5))
haTTT

Correlation between predicted CLL-PD and demographics
Age
plotTab <- riskTab %>%
mutate(C1C2 = paste0("C",rnaExternal[,match(patientID, colnames(rnaExternal))]$`C1C2`),
sex = ifelse(is.na(sex),NA, ifelse(sex =="f" , "Female","Male")))
corRes <- cor.test(plotTab$age, plotTab$LF4)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoN <- sprintf("N = %s", nrow(filter(plotTab,!is.na(age))))
annoP <- bquote(italic("P")~"="~.(pval))
annoCoef <- sprintf("coefficient = %1.2f",corRes$estimate)
plotAge <- ggplot(plotTab, aes(x = age, y = LF4)) +
geom_point(color =colList[3]) +
geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) +
annotate("text", x = max(plotTab$age), y = Inf, label = annoN,
hjust=1, vjust =1.5, size = 5, parse = FALSE, col= colList[1]) +
annotate("text", x = max(plotTab$age), y = Inf, label = annoP,
hjust=1, vjust =3.5, size = 5, parse = FALSE, col= colList[1]) +
annotate("text", x = max(plotTab$age), y = Inf, label = annoCoef,
hjust=1, vjust =5.5, size = 5, parse = FALSE, col= colList[1]) +
ylab("predicted CLL-PD") + xlab("Age (years)") +
theme_full
plotAge

Sex
corRes <- t.test(LF4 ~ sex, plotTab)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoP <- bquote(italic("P")~"="~.(pval))
plotTab <- group_by(plotTab, sex) %>% mutate(n=n()) %>% ungroup() %>%
mutate(sex = sprintf("%s\n(N=%s)",sex,n))
plotSex <- ggplot(plotTab, aes(x = sex, y = LF4)) +
geom_violin(aes(fill = sex)) +
geom_point() +
annotate("text", x = Inf, y = Inf, label = annoP,
hjust=1.2, vjust =2, size = 5, parse = FALSE, col= colList[1]) +
scale_fill_manual(values = colList) +
ylab("predicted CLL-PD") + xlab("Sex") +
theme_full + theme(legend.position = "none")
plotSex

Plot C1/C2
plotTab <- filter(plotTab, C1C2 != "CNA")
corRes <- t.test(LF4 ~ C1C2, plotTab)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoP <- bquote(italic("P")~"="~.(pval))
plotTab <- filter(plotTab, !is.na(C1C2)) %>%
group_by(C1C2) %>% mutate(n=n()) %>% ungroup() %>%
mutate(C1C2 = sprintf("%s\n(N=%s)",C1C2,n))
plotC1C2 <- ggplot(filter(plotTab,!is.na(C1C2)), aes(x = C1C2, y = LF4)) +
geom_violin(aes(fill = C1C2)) +
geom_point() +
annotate("text", x = Inf, y = Inf, label = annoP,
hjust=1.1, vjust =2, size = 5, parse = FALSE, col= colList[1]) +
scale_fill_manual(values = colList[3:length(colList)]) +
ylab("predicted CLL-PD") + xlab("C1/C2 group") +
theme_full + theme(legend.position = "none")
plotC1C2

Association with genomics
#t-test
y <- y.pred
geneICGC <- geneICGC[names(y),]
dim(geneICGC)
## [1] 253 99
tRes <- apply(geneICGC, 2, function(x) {
res <- t.test(y ~ as.factor(x), var.equal = TRUE)
data.frame(p = res$p.value,
df = res$estimate[[2]] - res$estimate[[1]])
}) %>% bind_rows() %>% mutate(gene = colnames(geneICGC),
p.adj = p.adjust(p, method = "BH")) %>%
arrange(p)
filter(tRes, p.adj <0.05) %>% mutate_if(is.numeric, formatNum, digits =3, format = "e") %>% DT::datatable()
plotGeneVolcano <- plotVolcano(tRes, posCol = colList[1], negCol = colList[2],
x_lab = "Difference of mean", ifLabel = TRUE) +
theme(legend.position = "none",
plot.margin = margin(8,8,8,8))
plotGeneVolcano

Assocation with mutation load
Function to plot and annotate assocations
plotCor <- function(plotTab, x, y, x_label, y_label, title, color = "black") {
corRes <- cor.test(plotTab[[x]], plotTab[[y]], use = "pairwise.complete.obs")
annoCoef <- paste("'coefficient ='~",format(corRes$estimate,digits = 2))
annoP <- paste("italic(P)~'='~",formatNum(corRes$p.value, digits = 1, format = "e"))
ggplot(plotTab, aes_string(x = x, y = y)) +
geom_point(shape = 21, fill =color, size=3) +
geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) +
annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoCoef,
hjust=1, vjust =1.5, size = 5, parse = TRUE, col= colList[1]) +
annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoP,
hjust=1, vjust =3, size = 5, parse = TRUE, col= colList[1]) +
ylab(y_label) + xlab(x_label) + ggtitle(title) +
theme_full + theme(plot.margin = margin(8,8,8,8))
}
plotTab <- tibble(value = y.pred,load=rnaExternal[,names(y.pred)]$mutationLoad) %>%
filter(!is.na(load))
plotMut <- plotCor(plotTab, "value", "load", "CLL-PD", "Total number of mutations",
sprintf("WGS dataset (ICGC, n=%s)",nrow(plotTab)), colList[[6]])
plotMut
