1 Introduction
2 Prepare MOFA model
3 Overview of MOFA output
- 3.1 Variance explained by MOFA for each omic data
- 3.2 The first two latent factors represents IGHV and trisomy12
4 Associations of latent factors to clinical behaviors
5 Annotation for other four factor that only explain RNA expression variations
6 Validate CLL-PD in external, independent cohorts
7 Association between F4 (CLL-PD) and genomics
8 Assocation between F4 (CLL-PD) and DNA methylation
9 Gene expression signatures for CLL-PD (F4)
10 Compare CLL-PD expression signature with other stimulation signatures
11 Proteomic signatures for CLL-PD
12 Association between CLL-PD and energy metabolism
13 Associations between CLL-PD(F4) and ex-vivo drug response phenotyps
14 Single cell analysis (CyTOF)
15 End of session

1 Introduction

This document shows a walktrough of the analyses done for the paper “Multi-omics reveals clinically relevant proliferative drive associated with mTOR-MYC-OXPHOS activity in chronic lymphocytic leukemia” by Lu J and Cannizzaro E et al.

knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(mofaCLL)
library(reticulate)
library(DESeq2)
library(sva)
library(MultiAssayExperiment)
library(MOFA)
library(survival)
library(survminer)
library(maxstat)
library(gridExtra)
library(car)
library(cowplot)
library(egg)
library(ggrepel)
library(ggbeeswarm)
library(grImport)
library(fgsea)
library(limma)
library(GEOquery)
library(pheatmap)
library(UpSetR)
library(glmnet)
library(tidygraph)
library(igraph)
library(ggraph)
library(ComplexHeatmap)
library(scattermore)
library(CATALYST)
library(RColorBrewer)
library(circlize)
library(diffcyt)
library(genefilter)
library(lubridate)
library(tidyverse)

2 Prepare MOFA model

2.1 Prepare each single omic dataset

Load datasets

data("drug","gene","rna")

Download methylation dataset from server to a temporary folder

methPath <- file.path(tempdir(),"meth.RData")
if(!file.exists(methPath)) {
  download.file("https://www.huber.embl.de/users/jlu/data/meth.RData",
                methPath)
  load(methPath)
} else {
  load(methPath)
}

2.1.1 Drug screening

viabMat <- mutate(drug, id = paste0(Drug,"_",concIndex)) %>%
  select(patientID, id, normVal) %>%
  spread(key = patientID, value = normVal) %>%
  data.frame() %>% column_to_rownames("id") %>% 
  as.matrix()

2.1.2 RNA sequencing

rna.vst<-DESeq2::varianceStabilizingTransformation(rna)
exprMat <- assay(rna.vst)
nTop = 5000
sds <- genefilter::rowSds(exprMat)
exprMat <- exprMat[order(sds, decreasing = T)[1:nTop],]

2.1.3 DNA methylation array

methData <- assays(meth)[["beta"]]
nTop = 5000
sds <- genefilter::rowSds(methData)
methData <- methData[order(sds, decreasing = T)[1:nTop],]

2.1.4 Genetics

gene <- t(gene)

2.1.5 Assemble multiAssayExperiment object

Create object

mofaData <- list(Drugs = viabMat, 
                mRNA = exprMat,
                Mutations = gene, 
                Methylation = methData)

# Create MultiAssayExperiment object 
mofaData <- MultiAssayExperiment::MultiAssayExperiment(
  experiments = mofaData
)

Only keep samples that have at least three assays

useSamples <- MultiAssayExperiment::sampleMap(mofaData) %>%
  as_tibble() %>% group_by(primary) %>% summarise(n= length(assay)) %>%
  filter(n >= 3) %>% pull(primary)
mofaData <- mofaData[,useSamples]

Dimensions for each dataset

experiments(mofaData)

## ExperimentList class object of length 4:
##  [1] Drugs: matrix with 315 rows and 190 columns
##  [2] mRNA: matrix with 5000 rows and 202 columns
##  [3] Mutations: matrix with 39 rows and 217 columns
##  [4] Methylation: matrix with 5000 rows and 158 columns

How many samples have the complete datasets

table(table(sampleMap(mofaData)$primary))

## 
##   3   4 
## 101 116

2.2 Build MOFA object

Build MOFA object from multiAssayExperiment object

# Build the MOFA object
MOFAobject <- createMOFAobject(mofaData)
MOFAobject

## Untrained MOFA model with the following characteristics: 
##   Number of views: 4 
##   View names: Drugs mRNA Mutations Methylation 
##   Number of features per view: 315 5000 39 5000
##   Number of samples: 217

Overview of data

p <- plotDataOverview.m(MOFAobject)
p$plot

## <environment: 0x7fcf9197c5a0>

2.3 Setup MOFA training parameters

Define data options

DataOptions <- getDefaultDataOptions()

Define model options

ModelOptions <- getDefaultModelOptions(MOFAobject)
ModelOptions$numFactors <- 30
ModelOptions$sparsity <- TRUE
ModelOptions

## $likelihood
##       Drugs        mRNA   Mutations Methylation 
##  "gaussian"  "gaussian" "bernoulli"  "gaussian" 
## 
## $numFactors
## [1] 30
## 
## $sparsity
## [1] TRUE

Define training options

TrainOptions <- getDefaultTrainOptions()

# Automatically drop factors that explain less than 2% of variance in all omics
TrainOptions$DropFactorThreshold <- 0.02
TrainOptions$tolerance <- 0.01
TrainOptions

## $maxiter
## [1] 5000
## 
## $tolerance
## [1] 0.01
## 
## $DropFactorThreshold
## [1] 0.02
## 
## $verbose
## [1] FALSE
## 
## $seed
## NULL

2.4 Run MOFA model

MOFAobject <- prepareMOFA(
  MOFAobject, 
  DataOptions = DataOptions,
  ModelOptions = ModelOptions,
  TrainOptions = TrainOptions
)

MOFAobject <- runMOFA(MOFAobject)

3 Overview of MOFA output

Load datasets

data("mofaOut", "survival", "gene", "demographic", "doublingTime", "reactomeGS", "rna")

3.1 Variance explained by MOFA for each omic data

Plot variance explained

# Calculate the variance explained (R2) per factor in each view 
r2 <- calculateVarianceExplained(MOFAobject)
r2$R2Total

##       Drugs Methylation   Mutations        mRNA 
##   0.1383193   0.2346552   0.5919219   0.3590070

# Plot it
varExpPlot <- plotVarianceExplained.m(MOFAobject, censor = 0.25) 
varExpPlot

3.2 The first two latent factors represents IGHV and trisomy12

Get weights and factors

allWeights <- getWeights(MOFAobject,
                         views = "all",
                         factors = "all",
                         as.data.frame = TRUE) %>% as_tibble() %>%
  mutate(feature = ifelse(feature == "IGHV.status","IGHV",feature),
         factor = gsub("LF","F",factor))

allFactors <- getFactors(
  MOFAobject, 
  factors = "all",
  as.data.frame = TRUE
) %>% as.tibble() %>%
  mutate(factor = gsub("LF","F", factor))

patAnno <- gene[,c("IGHV", "trisomy12")] %>% data.frame() %>%
  rownames_to_column("sample") %>%
  filter(!is.na(IGHV),!is.na(trisomy12)) %>%
  mutate(IGHV = ifelse(IGHV==1, "M","U"),
         trisomy12 = ifelse(trisomy12 ==1, "yes","no"))

allFactors <- left_join(allFactors, 
                        patAnno,
                        by = "sample")

3.2.1 Plot the separation of the samples by the first two factors

plotTab <- filter(allFactors, factor %in% c("F1","F2")) %>%
  spread(key =factor, value = value) %>% mutate(trisomy12 = factor(trisomy12)) %>%
  filter(!is.na(IGHV), !is.na(trisomy12))

pcaLF1 <- ggplot(plotTab, aes(x=F1, y=F2, color = trisomy12, 
                         shape = IGHV, label = sample)) + 
  geom_point(size=3) +
  scale_shape_manual(values = c(M = 16, U =1)) +
  scale_color_manual(values = c(no = colList[1], yes = colList[2])) +
  theme_full
pcaLF1

3.2.2 Loadings of genomic variations on the first two factors

loadLF1 <- plotTopWeights.m(allWeights, "Mutations", "F1", nfeatures = 5) + 
  theme(axis.text = element_text(size=10),
        axis.title = element_text(size=15))
loadLF2 <- plotTopWeights.m(allWeights, "Mutations", "F2", nfeatures = 5) +
  theme(axis.text = element_text(size=10),
        axis.title = element_text(size=15))

Factor 1

loadLF1

Factor 2

loadLF2

3.2.3 Associations between Factor 1 and epigenetic subtypes

plotTab <- filter(allFactors, factor %in% c("F1","F2")) %>%
  mutate(Epitype = methCluster[sample]) %>%
  spread(key = factor ,value = value)

violinLF1_Meth <- ggplot(filter(plotTab, !is.na(Epitype)), aes(x=Epitype, y=F1, fill = Epitype)) +
  geom_violin() + geom_point() + scale_fill_manual(values = colList[4:length(colList)]) +
  theme_full + theme(legend.position = "none", axis.title.y = element_text(vjust=-2)) +
  xlab("") + ggtitle("Epigenetic subtypes")
violinLF1_Meth

4 Associations of latent factors to clinical behaviors

4.1 Univariate Cox regression

testTab <- left_join(allFactors, survival, by = c(sample = "patientID"))

#for OS
resOS <- filter(testTab, !is.na(OS)) %>%
  group_by(factor) %>%
  do(comSurv(.$value, .$OS, .$died, TRUE)) %>% ungroup() %>%
  arrange(p) %>% mutate(p.adj = p.adjust(p, method = "bonferroni")) %>%
  mutate(Endpoint = "OS")


#for TTT
resTTT <- filter(testTab, !is.na(TTT)) %>%
  group_by(factor) %>%
  do(comSurv(.$value, .$TTT, .$treatedAfter, TRUE)) %>% ungroup() %>%
  arrange(p) %>% mutate(p.adj = p.adjust(p, method = "bonferroni")) %>%
  mutate(Endpoint = "TTT")

Overall survival (OS)

resOS

## # A tibble: 7 × 7
##   factor        p    HR lower higher   p.adj Endpoint
##   <chr>     <dbl> <dbl> <dbl>  <dbl>   <dbl> <chr>   
## 1 F1     0.000878 1.79  1.27    2.53 0.00614 OS      
## 2 F4     0.00408  1.55  1.15    2.09 0.0286  OS      
## 3 F5     0.157    0.693 0.417   1.15 1       OS      
## 4 F7     0.226    0.837 0.628   1.12 1       OS      
## 5 F3     0.446    1.13  0.821   1.56 1       OS      
## 6 F6     0.470    1.13  0.816   1.55 1       OS      
## 7 F2     0.617    1.10  0.767   1.57 1       OS

Time to treatment (TTT)

resTTT

## # A tibble: 7 × 7
##   factor        p    HR lower higher    p.adj Endpoint
##   <chr>     <dbl> <dbl> <dbl>  <dbl>    <dbl> <chr>   
## 1 F4     3.56e-14 2.02  1.69   2.43  2.49e-13 TTT     
## 2 F1     4.10e- 8 1.74  1.43   2.12  2.87e- 7 TTT     
## 3 F7     1.42e- 2 0.817 0.695  0.960 9.96e- 2 TTT     
## 4 F6     1.49e- 1 1.15  0.951  1.39  1   e+ 0 TTT     
## 5 F2     5.58e- 1 1.06  0.868  1.30  1   e+ 0 TTT     
## 6 F3     6.53e- 1 0.957 0.790  1.16  1   e+ 0 TTT     
## 7 F5     7.04e- 1 1.03  0.879  1.21  1   e+ 0 TTT

Plot p values and hazard ratios

plotTab <- bind_rows(resOS, resTTT) %>%
 filter(factor %in% c("F1","F2","F4"))
  
haPlot <- ggplot(plotTab, aes(x=factor, y = HR, col = Endpoint, dodge = Endpoint)) + 
  geom_hline(yintercept = 1, linetype = "dotted") +
  geom_point(position = position_dodge(width=0.8)) +
  geom_errorbar(position = position_dodge(width =0.8), 
                aes(ymin = lower, ymax = higher), width = 0.3, size=1) + 
  geom_text(position = position_dodge2(width = 0.8),
            aes(x=as.numeric(as.factor(factor))+0.15,
                label = sprintf("italic(P)~'='~'%s'",
                                formatNum(p))),
            color = "black",size =5, parse = TRUE) +
  xlab("Factor") + ylab("Hazard ratio") +
  scale_y_log10(limits = c(0.5,4)) +
  coord_flip() + theme_full + theme(legend.title = element_blank(),
                                    legend.position = c(0.2,0.1),
                                    legend.background = element_blank(),
                                    legend.key.size = unit(0.5,"cm"),
                                    legend.key.width = unit(0.6,"cm"),
                                    legend.text = element_text(size=rel(1.2))) +
  scale_color_manual(values = c(OS = colList[3], TTT = colList[5])) 

haPlot

4.2 Kaplan-Meiler plots

4.2.1 KM plot for overall survival (OS)

facList <- sort(filter(resOS, p.adj <=0.05)$factor)
osList <- lapply(facList, function(x) {
  eachTab <- filter(testTab, factor == x) %>%
    select(value, OS, died) %>% filter(!is.na(OS))
  pval <- filter(resOS, factor == x)$p
  km(eachTab$value, eachTab$OS, eachTab$died, sprintf("%s VS Overall survival time", x),
     stat = "maxstat", pval = pval, showTable = TRUE)
})

grid.arrange(grobs = osList, ncol = 2)

4.2.2 KM plot for time to treatment (TTT)

facList <- sort(filter(resTTT, p.adj <=0.01)$factor)
tttList <- lapply(facList, function(x) {
  eachTab <- filter(testTab, factor == x) %>%
    select(value, TTT, treatedAfter) %>% filter(!is.na(TTT))
    pval <- filter(resTTT, factor == x)$p
  km(eachTab$value, eachTab$TTT, eachTab$treatedAfter, sprintf("%s VS Time to treatment", x), stat = "maxstat",
     maxTime = 7, pval = pval, showTable = TRUE)
})

grid.arrange(grobs = tttList, ncol = 2)

4.2.3 KM plot for subgroup defined by IGHV status and median latent factor values

groupTab <- filter(testTab, factor == "F4", !is.na(value), !is.na(IGHV)) %>%
  mutate(subgroup = ifelse(value > median(value), paste0(IGHV,"_highF4"), paste0(IGHV,"_lowF4"))) 

plotList <- list()
# TTT
plotList[["TTT"]] <- km(groupTab$subgroup, groupTab$TTT, groupTab$treatedAfter, "Time to treatment", stat = "binary", maxTime = 7, showP = TRUE, showTable = TRUE, yLabelAdjust = -10)

# OS
plotList[["OS"]] <- km(groupTab$subgroup, groupTab$OS, groupTab$died, "Overall survival", stat = "binary", maxTime = 7, showP = TRUE, showTable = TRUE, yLabelAdjust = -10)

grid.arrange(grobs = plotList, ncol = 2)

4.2.4 KM plot for subgroup defined by median latent factor values of F1 and F4 (Supplementary Figure)

groupTab <- filter(testTab, factor %in% c("F1","F4"), !is.na(value)) %>%
  spread(key = factor, value = value) %>%
  mutate(LF1group = ifelse(F1 > median(F1), "highF1", "lowF1"),
         LF4group = ifelse(F4 > median(F4), "highF4","lowF4")) %>%
  mutate(subgroup = paste0(LF1group, "_",LF4group)) 

plotList1 <- list()
# TTT
plotList1[["TTT"]] <- km(groupTab$subgroup, groupTab$TTT, groupTab$treatedAfter, "Time to treatment", stat = "binary", maxTime = 7, showP = TRUE, showTable = TRUE, yLabelAdjust = -15)

# OS
plotList1[["OS"]] <- km(groupTab$subgroup, groupTab$OS, groupTab$died, "Overall survival", stat = "binary", maxTime = 7, showP = TRUE, showTable = TRUE, yLabelAdjust = -15)

grid.arrange(grobs = plotList1, ncol = 2)

4.3 Multi-variate Cox regression for Factor 4 (F4)

Prepare data for multi-variate Cox regression

survTab <- survival
facTab <- filter(allFactors, factor == "F4")
riskTab <- gene[,c("IGHV","TP53","NOTCH1","del17p","SF3B1")] %>% 
  data.frame() %>% rownames_to_column("patientID") %>%
  mutate(`TP53.del17p` = as.numeric(TP53 | del17p)) %>%
  select(-TP53, -del17p) %>%
  mutate_if(is.numeric, as.factor) %>%
  left_join(select(demographic, patientID, age, sex), by = "patientID") %>%
  mutate(age = age/10) %>%
  mutate(F4 = facTab[match(patientID, facTab$sample),]$value,
         IGHV = factor(IGHV, levels = c(0,1))) %>%
  dplyr::rename(IGHV_mutated = IGHV, Sex_male = sex, Age = age) %>%
  filter(!is.na(F4))

Time to treatment

resTTT <- runCox(survTab, riskTab, "TTT", "treatedAfter")

#summary(surv1)
haTTT <- plotHazard(resTTT, title = "Time to treatment") +
   scale_y_log10(limits=c(0.2,5))
haTTT

Overall survival

resOS <- runCox(survTab, riskTab, "OS", "died")

#summary(surv1)
haOS <- plotHazard(resOS,"Overall survival") + scale_y_log10(limits=c(0.05,5))
haOS

4.4 Correlation between F4 and demographics

Age

plotTab <- left_join(demographic, facTab, by = c(patientID = "sample")) %>%
  filter(!is.na(value)) %>%
  mutate(pretreat = ifelse(is.na(pretreat),NA,
                           ifelse(pretreat ==1 , "yes","no")),
         sex = ifelse(is.na(sex),NA,
                           ifelse(sex =="f" , "Female","Male")))

corRes <- cor.test(plotTab$age, plotTab$value)
pval <- formatNum(corRes$p.value, digits = 1)
annoN <- sprintf("n = %s", nrow(filter(plotTab,!is.na(age))))
annoP <- bquote(italic("P")~"="~.(pval))
annoCoef <- sprintf("coefficient = %1.2f",corRes$estimate)

plotAge <- ggplot(plotTab, aes(x = age, y = value)) + 
  geom_point(fill =colList[3], shape =21, size=3) + 
  geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + 
  annotate("text", x = max(plotTab$age), y = Inf, label = annoN,
           hjust=1, vjust =1.5, size = 5, parse = FALSE, col= colList[1]) +
  annotate("text", x = max(plotTab$age), y = Inf, label = annoP,
           hjust=1, vjust =3.5, size = 5, parse = FALSE, col= colList[1]) +
  annotate("text", x = max(plotTab$age), y = Inf, label = annoCoef,
           hjust=1, vjust =5.5, size = 5, parse = FALSE, col= colList[1]) +
  ylab("F4") + xlab("Age (years)") +
  theme_full
plotAge

Sex

corRes <- t.test(value ~ sex, plotTab)
pval <- formatNum(corRes$p.value, digits = 1)
annoP <- bquote(italic("P")~"="~.(pval))

plotTab <- group_by(plotTab, sex) %>% mutate(n=n()) %>% ungroup() %>%
  mutate(sex = sprintf("%s\n(n=%s)",sex,n))

plotSex <- ggplot(plotTab, aes(x = sex, y = value)) + 
  geom_violin(aes(fill = sex)) +
  geom_point() + 
  annotate("text", x = Inf, y = Inf, label = annoP,
           hjust=1.2, vjust =2, size = 5, parse = FALSE, col= colList[1]) +
  scale_fill_manual(values = colList) +
  ylab("F4") + xlab("Sex") +
  theme_full + theme(legend.position = "none")
plotSex

Pretreatment

corRes <- t.test(value ~ pretreat, plotTab)
annoP <- paste("italic(P)~'='~",formatNum(corRes$p.value, digits = 1, format = "e"))

plotTab <- filter(plotTab, !is.na(pretreat)) %>%
  group_by(pretreat) %>% mutate(n=n()) %>% ungroup() %>%
  mutate(pretreat = sprintf("%s\n(n=%s)",pretreat,n))

plotTreat <- ggplot(filter(plotTab,!is.na(pretreat)), aes(x = pretreat, y = value)) + 
  geom_violin(aes(fill = pretreat)) +
  geom_point() + 
  annotate("text", x = -Inf, y = Inf, label = annoP,
           hjust=-0.2, vjust =2, size = 5, parse = TRUE, col= colList[1]) +
  scale_fill_manual(values = colList[3:length(colList)]) +
  ylab("F4") + xlab("Pretreatment") +
  theme_full + theme(legend.position = "none")
plotTreat

4.5 Association between F4 and outcomes in treatment-naive patients

4.5.1 Univariate Cox regression

survival.untreated <- survival %>% mutate(pretreat = demographic[match(patientID, demographic$patientID),]$pretreat) %>%
  filter(pretreat ==0)
testTab <- left_join(allFactors, survival.untreated, by = c(sample = "patientID"))

#for OS
resOS <- filter(testTab, !is.na(OS), factor == "F4") %>%
  group_by(factor) %>%
  do(comSurv(.$value, .$OS, .$died, TRUE)) %>% ungroup() %>%
  arrange(p) %>% mutate(p.adj = p.adjust(p, method = "BH")) %>%
  mutate(Endpoint = "OS")

resOS

## # A tibble: 1 × 7
##   factor      p    HR lower higher  p.adj Endpoint
##   <chr>   <dbl> <dbl> <dbl>  <dbl>  <dbl> <chr>   
## 1 F4     0.0364  1.70  1.03   2.81 0.0364 OS

#for TTT
resTTT <- filter(testTab, !is.na(TTT), factor == "F4") %>%
  group_by(factor) %>%
  do(comSurv(.$value, .$TTT, .$treatedAfter, TRUE)) %>% ungroup() %>%
  arrange(p) %>% mutate(p.adj = p.adjust(p, method = "BH")) %>%
  mutate(Endpoint = "TTT")

resTTT

## # A tibble: 1 × 7
##   factor            p    HR lower higher        p.adj Endpoint
##   <chr>         <dbl> <dbl> <dbl>  <dbl>        <dbl> <chr>   
## 1 F4     0.0000000162  1.96  1.55   2.47 0.0000000162 TTT

4.5.2 Kaplan-Meiler plots

KM plot for overall survival (OS)

eachTab <- filter(testTab, factor == "F4") %>%
  select(value, OS, died) %>% filter(!is.na(OS))
pval <- filter(resOS, factor == "F4")$p
kmOS.untreat <- km(eachTab$value, eachTab$OS, eachTab$died, "F4 VS Overall survival time",
     stat = "maxstat", pval = pval, showTable = TRUE)

kmOS.untreat

KM plot for time to treatment (TTT)

eachTab <- filter(testTab, factor == "F4") %>%
  select(value, TTT, treatedAfter) %>% filter(!is.na(TTT))
pval <- filter(resTTT, factor == "F4")$p
kmTTT.untreat <- km(eachTab$value, eachTab$TTT, eachTab$treatedAfter, "F4 VS Time to treatment",
     stat = "maxstat", pval = pval, showTable = TRUE)

kmTTT.untreat

4.6 Multi-variate Cox regression

Prepare data for multi-variate Cox regression

survTab <- survival.untreated
facTab <- filter(allFactors, factor == "F4")
riskTab <- gene[,c("IGHV","TP53","del17p","SF3B1")] %>% 
  data.frame() %>% rownames_to_column("patientID") %>%
  mutate(`TP53.del17p` = as.numeric(TP53 | del17p)) %>%
  select(-TP53, -del17p) %>%
  mutate_if(is.numeric, as.factor) %>%
  left_join(select(demographic, patientID, age, sex), by = "patientID") %>%
  mutate(age = age/10) %>%
  mutate(F4 = facTab[match(patientID, facTab$sample),]$value,
         IGHV = factor(IGHV, levels = c(0,1))) %>%
  dplyr::rename(IGHV_mutated = IGHV, Sex_male = sex, Age = age) %>%
  filter(!is.na(F4))

Time to treatment

resTTT <- runCox(survTab, riskTab, "TTT", "treatedAfter")

#summary(surv1)
haTTT.untreat <- plotHazard(resTTT, title = "Time to treatment")
haTTT.untreat

Overall survival

resOS <- runCox(survTab, riskTab, "OS", "died")

#summary(surv1)
haOS.untreat <- plotHazard(resOS,"Overall survival")
haOS.untreat

4.7 Correlation between F4 and Lymphocyte doubling time

4.7.1 Univariate test

Pearson’s correlation test

LDT <- doublingTime %>% mutate(F4 = facTab[match(patID, facTab$sample),]$value) %>%
  filter(!is.na(F4)) %>%
  mutate(IGHV = as.factor(facTab[match(patID, facTab$sample),]$IGHV))
corRes <- cor.test(log10(LDT$doubling.time), LDT$F4)

Scatter plot of correlations

pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoN <- sprintf("n = %s", nrow(LDT))
annoP <- bquote(italic("P")~"="~.(pval))
annoCoef <- sprintf("coefficient = %1.2f",corRes$estimate)

corPlot <- ggplot(LDT, aes(x = F4, y = doubling.time/30)) + 
  geom_point(fill =colList[5], shape =21, size=3) + 
  geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) +
  annotate("text", x = max(LDT$F4), y = Inf, label = annoN,
           hjust=1, vjust =1.5, size = 5, parse = FALSE, col= colList[1]) +
  annotate("text", x = max(LDT$F4), y = Inf, label = annoP,
           hjust=1, vjust =3.5, size = 5, parse = FALSE, col= colList[1]) +
  annotate("text", x = max(LDT$F4), y = Inf, label = annoCoef,
           hjust=1, vjust =5.5, size = 5, parse = FALSE, col= colList[1]) +
  ylab(bquote("doubling time (months)")) + ggtitle("Lymphocyte doubling time") +
  scale_y_log10() +
  theme_full


corPlot

4.7.2 Consider IGHV status

4.7.2.1 IGHV as a covariate

ANOVA test

LDT <- doublingTime %>% mutate(F4 = facTab[match(patID, facTab$sample),]$value,
                               IGHV = as.factor(facTab[match(patID, facTab$sample),]$IGHV)) %>%
  filter(!is.na(F4),!is.na(IGHV))

corRes <- car::Anova(lm(log10(doubling.time) ~ F4 + IGHV, data = LDT))
corRes

## Anova Table (Type II tests)
## 
## Response: log10(doubling.time)
##            Sum Sq Df F value    Pr(>F)    
## F4         4.6978  1  24.633 3.838e-06 ***
## IGHV       5.2955  1  27.768 1.131e-06 ***
## Residuals 15.2566 80                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.7.2.2 Only M-CLL

LDT.M <- filter(LDT, IGHV == "M") 
corRes.M <- cor.test(log2(LDT.M$doubling.time), LDT.M$F4)
corRes.M

## 
##  Pearson's product-moment correlation
## 
## data:  log2(LDT.M$doubling.time) and LDT.M$F4
## t = -4.5287, df = 41, p-value = 5.034e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7480461 -0.3352300
## sample estimates:
##        cor 
## -0.5774352

4.7.2.3 Only U-CLL

LDT.U <- filter(LDT, IGHV == "U") 
corRes.U <- cor.test(log2(LDT.U$doubling.time), LDT.U$F4)
corRes.U

## 
##  Pearson's product-moment correlation
## 
## data:  log2(LDT.U$doubling.time) and LDT.U$F4
## t = -2.5255, df = 38, p-value = 0.01584
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.61767285 -0.07665077
## sample estimates:
##       cor 
## -0.379108

Scatter plot of correlations, stratified by IGHV

annoM <- sprintf("'M-CLL: n = %s, coefficient = %1.2f,'~italic(P)~'= %s'",nrow(LDT.M), corRes.M$estimate, formatNum(corRes.M$p.value, digits = 1, format = "e"))
annoU <- sprintf("'U-CLL: n = %s, coefficient = %1.2f,'~italic(P)~'= %s'", nrow(LDT.U), corRes.U$estimate,formatNum(corRes.U$p.value, digits = 1, format = "e"))

corPlot.IGHV <- ggplot(LDT, aes(x = F4, y = doubling.time/30, fill = IGHV, col = IGHV)) + 
  geom_point(shape=21, size=3, col = "black") + 
  geom_smooth(method = "lm", se=FALSE, linetype ="dashed" ) + 
  annotate("text", x = Inf, y = Inf, label = annoM,
           hjust=1.05, vjust =1.2, size = 4, parse = TRUE, color = colList[1]) +
  annotate("text", x = Inf, y = Inf, label = annoU,
           hjust=1.05, vjust =2.5, size = 4, parse = TRUE, color = colList[2]) +
  ylab("doubling time (months)") + ggtitle("Lymphocyte doubling time") +
  scale_fill_manual(values = c(M = colList[1],U=colList[2])) +
  scale_color_manual(values = c(M = colList[1],U=colList[2])) +
  scale_y_log10() +
  theme_full + theme(legend.position = "none")

corPlot.IGHV

Correlations in untreated patients only

LDT.untreat <- doublingTime %>% mutate(F4 = facTab[match(patID, facTab$sample),]$value, 
                               pretreat = demographic[match(patID, demographic$patientID),]$pretreat) %>%
  filter(!is.na(F4),!is.na(pretreat)) %>% filter(pretreat == 0)

corRes <- cor.test(LDT.untreat$doubling.time, LDT.untreat$F4)

annoText <- sprintf("'coefficient = %1.2f, '~italic(P)~'= %s'",corRes$estimate,formatNum(corRes$p.value, digits = 1, format = "e"))
corPlot.untreat <- ggplot(LDT.untreat, aes(x = F4, y = doubling.time/30)) + 
  geom_point(fill =colList[5], shape=21, size=3) + 
  geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + 
  annotate("text", x = 2.2, y = Inf, label = annoText,
           hjust=1, vjust =2, size = 4, parse = TRUE, col= colList[1]) +
  ylab("doubling time (months)") + ggtitle(sprintf("Lymphocyte doubling time\n(untreated patients only, n=%s)",nrow(LDT.untreat))) +
  scale_y_log10() +
  theme_full

corPlot.untreat

4.7.3 Variance expalined for lymphocyte doubling time

onlyIGHV <- summary(lm(log2(doubling.time) ~ IGHV, data = LDT))
onlyF4 <- summary(lm(log2(doubling.time) ~ F4, data = LDT))
combined <- summary(lm(log2(doubling.time) ~ F4 + IGHV, data = LDT))
plotTab <- tibble(model = c("IGHV only","F4 only", "IGHV + F4"),
                  R2 = c(onlyIGHV$adj.r.squared, onlyF4$r.squared, combined$adj.r.squared)) %>%
  mutate(model = factor(model, levels = model))
explainedDT <- ggplot(plotTab, aes(x=model, y = R2)) + geom_bar(stat = "identity", aes(fill = model), width = 0.8) +
  coord_flip(expand = FALSE, xlim = c(0.5,3.5)) + theme_half + scale_fill_manual(values = colList) +
  geom_text(aes(x = model, y =0.01, label = model), hjust =0, fontface = "bold", size =5) +
  theme(axis.ticks.y = element_blank(), axis.text.y = element_blank(), legend.position = "none",
        axis.title.y = element_text( size =13)) +
  xlab("Predictors") + ylab("Variance explained") 
explainedDT

5 Annotation for other four factor that only explain RNA expression variations

5.1 Factor 3

Assocations with RNAseq batch

batchTab <- filter(allFactors , factor == "F3") %>%
  mutate(batch = rna[,match(sample,colnames(rna))]$batch) %>% 
  filter(!is.na(batch)) %>%
  mutate(batch = paste0("batch", batch+1))

pval <- car::Anova(lm(value ~ factor(batch), batchTab))$`Pr(>F)`[1]
pval <- formatNum(pval, digits = 2)
pAnno <- bquote(italic("P")~"="~.(pval))
colListNew <- colList[-4]
pL3 <- ggplot(batchTab, aes(x=batch, y = value, col = batch)) +
  geom_boxplot() + 
  ggbeeswarm::geom_beeswarm() + 
  scale_color_manual(values = colListNew) +
  annotate("text", x=Inf, y=Inf, label=pAnno, hjust=1.5, vjust=1.5)+
  theme_full +
  theme(legend.position = "none") +
  ylab("F3 value") + xlab("") + ggtitle("F3 ~ RNAseq batch")
pL3

5.2 Factor 5

Assocations with CD4, CD8 expression

rna <- estimateSizeFactors(rna)
exprTab <- counts(rna[rowData(rna)$symbol %in% c("CD4","CD8A"),],normalized = TRUE) %>%
  t() %>% as_tibble(rownames = "sample") %>% 
  pivot_longer(-sample, names_to="id", values_to = "count") %>%
  mutate(symbol = rowData(rna)[id,]$symbol)
  
  
facTab <- filter(allFactors , factor == "F5")
plotTab <- left_join(exprTab, facTab, by = "sample")


pAnno <- bquote(italic("P")~"<"~.(10e-13))
pL5 <- ggplot(plotTab, aes(y=log2(count), x= value, col = symbol)) +
  geom_point() + geom_smooth(method = "lm", se = FALSE) +
  theme_half +
  scale_color_manual(values = colList) +
  annotate("text", x=-Inf, y=Inf, label=pAnno, hjust=-0.5, vjust=5, size=5)+
  xlab("F5 value") + ylab(bquote("log"[2]*"(RNAseq count)")) +
  ggtitle("T cell marker expressions ~ F5") +
  theme(legend.position = c(0.8,0.15),
        legend.text = element_text(size=15),
        legend.title = element_text(size=15)) +
  ylim(0,13)

5.3 Factor 6

fsea.results <- MOFA::runEnrichmentAnalysis(MOFAobject,view = "mRNA", factor =6 , feature.sets = reactomeGS)
enL6 <- MOFA::plotEnrichment(MOFAobject, fsea.results, factor = 6, max.pathways = 5) +
  ylab(bquote("-log"[10]*"(adjusted "*italic("P")~"value)")) +
  ggtitle("Pathways enriched for F6") + theme_half

#source table
outTab <- tibble(pathway = rownames(fsea.results$pval),
                 PValue = fsea.results$pval[,1],
                 adj.PValue = fsea.results$pval.adj[,1]) %>%
  arrange(PValue)

exprTab <- counts(rna[rowData(rna)$symbol %in% c("SOD1","GPX4"),],normalized = TRUE) %>%
  t() %>% as_tibble(rownames = "sample") %>% 
  pivot_longer(-sample, names_to="id", values_to = "count") %>%
  mutate(symbol = rowData(rna)[id,]$symbol)
  
  
facTab <- filter(allFactors , factor == "F6")
plotTab <- left_join(exprTab, facTab, by = "sample")

p1 <- cor.test(~ value + log2(count), filter(plotTab, symbol == "SOD1"))
p2 <- cor.test(~ value + log2(count), filter(plotTab, symbol == "GPX4"))

pAnno <- bquote(italic("P")~"<"~.(10e-13))
corL6 <- ggplot(plotTab, aes(y=log2(count), x= value, col = symbol)) +
  geom_point() + geom_smooth(method = "lm", se = FALSE) +
  theme_half +
  scale_color_manual(values = colList) +
  annotate("text", x=-Inf, y=Inf, label=pAnno, hjust=-0.5, vjust=5, size=5)+
  xlab("F6 value") + ylab(bquote("log"[2]*"(RNAseq count)")) +
  ggtitle("SOD1 and GPX4 ~ F6") +
  theme(legend.position = c(0.8,0.15),
        legend.text = element_text(size=15),
        legend.title  = element_text(size=15))




#source data

5.4 Factor 7

fsea.results <- MOFA::runEnrichmentAnalysis(MOFAobject,view = "mRNA", factor =7 , feature.sets = reactomeGS)
enL7 <- MOFA::plotEnrichment(MOFAobject, fsea.results, factor = 7, max.pathways = 5) +
  ylab(bquote("-log"[10]*"(adjusted "*italic("P")~"value)")) +
  ggtitle("Pathways enriched for F7") + theme_half

5.5 Arrange plots for Section 1 in the manuscript

5.5.1 Figure 1 in main text

plotTitle <- ggdraw() +draw_figure_label("Figure 1", fontface = "bold", position = "top.left",size=20)

topGrid <- plot_grid(varExpPlot, haPlot, haTTT, haOS, nrow =1, 
                     rel_widths = c(1,0.9,1,1), labels = c("a","b","e","f"), label_size = 20,
                     align = "h", axis = "l")
bottomGrid <- plot_grid(plotList[["TTT"]],plotList[["OS"]],NULL,
                        plot_grid(corPlot,explainedDT, rel_heights = c(0.70,0.30), ncol =1, labels = c("g","h"),label_size = 20, align = "v", axis = "l"),
                        nrow =1, rel_widths = c(1,1,0.02,0.85), labels = c("c","d"), label_size = 20)

plot_grid(plotTitle, topGrid, NULL, bottomGrid, rel_heights = c(0.02,0.45,0.02,0.5), ncol = 1)

5.5.2 Forest plot showing values of variance explained for each view

R2list <- calculateVarianceExplained(MOFAobject)
plotTab <- R2list$R2PerFactor %>%
  as_tibble(rownames = "factor") %>%
  pivot_longer(-factor, names_to = "view", values_to = "R2") %>%
  mutate(factor = str_replace(factor,"LF","F"))


p <- ggplot(plotTab, aes_string(x = "view", y = "R2")) +
  geom_point(size = 2) + 
  geom_segment(aes_string(xend = "view"), size = 0.75,yend = 0) + 
  expand_limits(y=0) +
  coord_flip() +
  facet_wrap(~factor, scale = "free_x") +
  xlab("") + ylab(bquote('Variance explained ('~R^2~')')) + 
  theme_full +
  theme(strip.text = element_text(size =15))
p

5.5.3 Annotation of LF1 for Supplementary figure

plot_grid(plot_grid(loadLF1, loadLF2, violinLF1_Meth, 
                    nrow =3, align = "hv", axis = "l", labels = c(" "," "," "), label_size = 20),
          NULL,
          plot_grid(pcaLF1,NULL, nrow=2, rel_heights  = c(0.8,0.2)),
          nrow = 1, rel_widths = c(0.4,.02, 0.6), labels = c("",""," "), label_size = 20)

5.5.4 KM plots for Supplementary Figure

grid.arrange(grobs = c(tttList, osList), ncol =2)

5.5.5 Lymphocyte doubling time for supplementary figure

plot_grid(corPlot.untreat, NULL,corPlot.IGHV, nrow =1, rel_widths = c(1,0.1,1))

5.5.6 Associations with demographics

plot_grid(plotAge, NULL,plotSex, NULL, plotTreat,ncol=1, rel_heights  = c(1,0.1,1,0.1,1))

5.5.7 Associations with outcomes in treatment-naive patients

plot_grid(plot_grid(kmTTT.untreat, haTTT.untreat, nrow =2, align = "h", axis = "l", rel_heights = c(0.55,0.45)), NULL,
          plot_grid(kmOS.untreat, haOS.untreat, nrow =2, align = "h", axis = "l", rel_heights = c(0.55,0.45)), nrow=1, rel_widths = c(1,0.1,1) )

5.5.8 Annotatons of other factors

p <- plot_grid(plot_grid(pL3, pL5, corL6, nrow=1, rel_widths = c(1,0.8,0.8), labels = c(" "," "," "), label_size = 20),
          plot_grid(enL6, enL7, nrow=1, labels = c(" "," "), label_size = 20, rel_widths = c(0.5,0.6)),nrow=2)
p

6 Validate CLL-PD in external, independent cohorts

6.1 Test whether CLL-PD can be computed using single-omic dataset

Load datasets

data("mofaOut","gene")
setMap <- read_tsv(system.file("externalData/setToPathway.txt", package = "mofaCLL"), col_types = "cc")

Download methylation dataset from server to a temporary folder

methPath <- file.path(tempdir(),"meth.RData")
if(!file.exists(methPath)) {
  download.file("https://www.huber.embl.de/users/jlu/data/meth.RData",
                methPath)
  load(methPath)
} else {
  load(methPath)
}

Get values for LF4

library(MOFA)
facTab <- getFactors(
  MOFAobject, 
  factors = "LF4",
  as.data.frame = TRUE
) %>% as_tibble()
trainData <- getTrainData(MOFAobject)
#unload MOFA package
detach("package:MOFA", unload = TRUE)

6.1.1 Using gene expression dataset for predicting LF4

Prepare model

y <- structure(facTab$value, names = facTab$sample)
X <- trainData[["mRNA"]]
X <- X[,complete.cases(t(X))]
sampleOverlap <- intersect(names(y), colnames(X))
y <- y[sampleOverlap]
X <- X[,sampleOverlap]

#remove highly correlated features
X <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)$reduced

Run regularised linear regression models

set.seed(5862)
rnaRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)

Export select gene list from the best model

coefMat <- rnaRes$coefMat
coefTab <- tibble(id = rownames(coefMat),
  coef = coefMat[,which.max(rnaRes$r2Test)]) %>%
  filter(coef != 0) %>%
  mutate(Symbol = sprintf("\textit{%s}",rowData(rna[id,])$symbol)) %>%
  arrange(desc(abs(coef))) %>%
  select(id, Symbol, coef)

geneCoef <- coefTab %>% 
  mutate(coef  = formatC(coef, digits=2)) %>%
  dplyr::rename(`Ensembl gene ID` = id, 
         Coefficient = coef)

geneCoef

## # A tibble: 84 × 3
##    `Ensembl gene ID` Symbol              Coefficient
##    <chr>             <chr>               <chr>      
##  1 ENSG00000164082   "\textit{GRM2}"     -0.11      
##  2 ENSG00000105426   "\textit{PTPRS}"    -0.1       
##  3 ENSG00000185338   "\textit{SOCS1}"    -0.1       
##  4 ENSG00000161835   "\textit{GRASP}"    -0.088     
##  5 ENSG00000163687   "\textit{DNASE1L3}" 0.062      
##  6 ENSG00000114013   "\textit{CD86}"     0.055      
##  7 ENSG00000132386   "\textit{SERPINF1}" 0.054      
##  8 ENSG00000154127   "\textit{UBASH3B}"  -0.052     
##  9 ENSG00000198598   "\textit{MMP17}"    -0.044     
## 10 ENSG00000007038   "\textit{PRSS21}"   0.042      
## # … with 74 more rows

6.1.2 Using methylation dataset for predicting LF4

Prepare model

y <- structure(facTab$value, names = facTab$sample)
X <- trainData[["Methylation"]]
X <- X[,complete.cases(t(X))]
sampleOverlap <- intersect(names(y), colnames(X))
y <- y[sampleOverlap]
X <- X[,sampleOverlap]

#remove highly correlated features
X <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)$reduced

Run regularised linear regression models

methRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)

Export the select gene list from best model

coefMat <- methRes$coefMat
coefTab <- tibble(id = rownames(coefMat),
  coef = coefMat[,which.max(methRes$r2Test)]) %>%
  filter(coef != 0) %>%
  arrange(desc(abs(coef))) %>%
  select(id, coef)

methCoef <- coefTab %>% 
  mutate(coef  = formatC(coef, digits=2)) %>%
  dplyr::rename(`Probe ID` = id, 
         Coefficient = coef)

methCoef

## # A tibble: 34 × 2
##    `Probe ID` Coefficient
##    <chr>      <chr>      
##  1 cg04497889 -0.46      
##  2 cg27150870 -0.38      
##  3 cg19897113 -0.34      
##  4 cg02506717 -0.21      
##  5 cg14036069 -0.2       
##  6 cg14999716 -0.19      
##  7 cg06397264 -0.18      
##  8 cg27138600 -0.16      
##  9 cg26799877 -0.15      
## 10 cg26203572 -0.15      
## # … with 24 more rows

6.1.3 Using drug responses

Prepare model

y <- structure(facTab$value, names = facTab$sample)
X <- trainData[["Drugs"]]
X <- X[,complete.cases(t(X))]
sampleOverlap <- intersect(names(y), colnames(X))
y <- y[sampleOverlap]
X <- X[,sampleOverlap]

#remove highly correlated features
X <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)$reduced

Run glm model

set.seed(5862)
drugRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)

6.1.4 Using gene mutations

Prepare model

y <- structure(facTab$value, names = facTab$sample)
X <- trainData[["Mutations"]]

#fill the missing value with majority
X <- apply(X, 1, function(x) {
  xVec <- x
  avgVal <- mean(x,na.rm= TRUE)
  if (avgVal >= 0.5) {
    xVec[is.na(xVec)] <- 1
  } else xVec[is.na(xVec)] <- 0
  xVec
})

sampleOverlap <- intersect(names(y), rownames(X))
y <- y[sampleOverlap]
X <- X[sampleOverlap,]

Run glm model

set.seed(5862)
geneRes <- runGlm(X,y, repeats = 20, folds=5, method = "lasso", lambda ="lambda.1se", testRatio = 0.3)

6.1.5 Compare the prediction accuracy (variance explained for CLL-PD)

Summarized variance explained

plotTab <- tibble(r2 = rnaRes$r2Test, set = "mRNA") %>%
  bind_rows(tibble(r2 = methRes$r2Test, set = "Methylation")) %>%
  bind_rows(tibble(r2 = drugRes$r2Test, set = "Drugs")) %>%
  bind_rows(tibble(r2 = geneRes$r2Test, set = "Mutations")) %>%
  group_by(set) %>% summarise(meanR2 = mean(r2, na.rm = TRUE), sdR2 = sd(r2, na.rm = TRUE)) %>%
  arrange(desc(meanR2)) %>% mutate(set = factor(set, levels = set))

plotTabAll <- tibble(r2 = rnaRes$r2Test, set = "mRNA") %>%
  bind_rows(tibble(r2 = methRes$r2Test, set = "Methylation")) %>%
  bind_rows(tibble(r2 = drugRes$r2Test, set = "Drugs")) %>%
  bind_rows(tibble(r2 = geneRes$r2Test, set = "Mutations")) %>%
  mutate(set = factor(set, levels = levels(plotTab$set)))

Summarize number of selected features

featureNum <-  tibble(n = nFeature(rnaRes), set = "mRNA") %>%
  bind_rows(tibble(n = nFeature(methRes), set = "Methylation")) %>%
  bind_rows(tibble(n = nFeature(drugRes), set = "Drugs")) %>%
  bind_rows(tibble(n = nFeature(geneRes), set = "Mutations")) %>%
  group_by(set) %>% summarise(meanN = mean(n, na.rm = TRUE), sdN = sd(n, na.rm = TRUE)) %>%
  mutate(set = factor(set, levels = levels(plotTab$set)))

featureNumAll <-  tibble(n = nFeature(rnaRes), set = "mRNA") %>%
  bind_rows(tibble(n = nFeature(methRes), set = "Methylation")) %>%
  bind_rows(tibble(n = nFeature(drugRes), set = "Drugs")) %>%
  bind_rows(tibble(n = nFeature(geneRes), set = "Mutations")) %>%
  mutate(set = factor(set, levels = levels(plotTab$set)))

varExpBar <- ggplot(plotTab, aes(x=set, y = meanR2, fill = set)) + 
  geom_col(width = 0.4) +
  ggbeeswarm::geom_quasirandom(data = plotTabAll, aes(x=set, y=r2), alpha =0.3, width = 0.2) +
  geom_errorbar(aes(ymin = meanR2 - sdR2, ymax = meanR2 + sdR2), width =0.2) +
  ylim(0,1) + xlab("data set") + ylab(bquote('Variance explained ('*R^2*')')) +
  scale_fill_manual(values = structure(c(colList[1:3],colList[5]),names = as.character(plotTab$set))) +
  theme_full +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90, hjust =1, vjust =0.5))

numBar <- ggplot(featureNum, aes(x=set, y = meanN)) + 
  ggbeeswarm::geom_quasirandom(data = featureNumAll, aes(x=set, y =n), width = 0.2, alpha=0.3) +
  geom_point(size=3) +
  geom_errorbar(aes(ymin = meanN - sdN, ymax = meanN + sdN), width =0.2) +
  xlab("") + ylab("Number of features") +
  theme_half + 
  theme(legend.position = "none", panel.border = element_blank(), 
        axis.line.x = element_blank(),
        axis.ticks.x = element_blank(), 
        axis.title.x = element_blank(), 
        axis.text.x = element_blank())

varPlot <- plot_grid(varExpBar, numBar, align = "v", nrow =2, rel_heights = c(0.7,0.3))
varPlot

6.2 Estimate and test F4 (CLL-PD) in external cohorts using gene expression

6.2.1 ICGC-CLL cohort with RNAseq data (EGAS00001000374)

6.2.1.1 Pre-processing data

Download ICGC dataset from EMBL server (The raw sequencing, genomic and outcome data can be queried from ICGC portatl: https://icgc.org/icgc/cgp/64/530/826)

icgcPath <- file.path(tempdir(),"rnaICGC.RData")
if(!file.exists(icgcPath)) {
  download.file("https://www.huber.embl.de/users/jlu/data/rnaICGC.RData",
                icgcPath)
  load(icgcPath)
} else {
  load(icgcPath)
}

Load and process ICGC expression dataset

rnaExt.vst <- varianceStabilizingTransformation(rnaExternal)

#Adjusted for potential batch effect
exprMat <- assay(rnaExt.vst)
batch <- as.factor(rnaExt.vst$raw_data_accession)
exprMat.combat <- sva::ComBat(exprMat, batch = batch)

## Found 426 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.

assay(rnaExt.vst) <- exprMat.combat

Load and process internal RNAseq dataset

rna$factor <- facTab[match(colnames(rna), facTab$sample),]$value
rna <- rna[,!is.na(rna$factor)] #use samples with LF4 values
rna.vst <- varianceStabilizingTransformation(rna)

Filter out low variant genes in both datasets

exprMat <- assay(rna.vst)
exprMat.ext <- assay(rnaExt.vst)

#icgc dataset
sds <- genefilter::rowSds(exprMat.ext)
exprMat.ext <- exprMat.ext[order(sds,decreasing = TRUE)[1:10000],]

#internal dataset
sds <- genefilter::rowSds(exprMat)
exprMat <- exprMat[order(sds,decreasing = TRUE)[1:10000],]

#subset to keep common genes
overGene <- intersect(rownames(exprMat), rownames(exprMat.ext))
exprMat <- exprMat[overGene,]
exprMat.ext <- exprMat.ext[overGene,]

Remove highly correlated gene in training data

X <- mscale(exprMat)
#remove highly correlated genes
reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced

6.2.2 Predict factor value in ICGC cohort

Build linear model using in-house CLL dataset

set.seed(5862)

y <- rna.vst$factor
modelTrain <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se")
useModel <- modelTrain$modelList[[which.max(modelTrain$r2Train)]] #choose the best model

Predict this factor in the new dataset

newX <- exprMat.ext[colnames(X),]
newX <- t(mscale(newX))
y.pred <- glmnet:::predict.cv.glmnet(useModel,  newx = newX)[,1]
y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))
dim(newX)

## [1]  253 5428

6.2.3 Test assocation with clincial outcome

Prepare outcome table based on patient annotations

survT <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>% 
  rownames_to_column("sampleID") %>%
  mutate(predLF = y.pred[sampleID]) %>% 
  dplyr::select(sampleID, treatedAfter, TTT, predLF, OS, died) %>%
  dplyr::rename(patientID = sampleID) %>%
  filter(!(is.na(TTT) & is.na(OS))) #remove samples with none outcome avaialble

#how many samples
nrow(survT)

## [1] 249

Prepare risk factor table

riskTab <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>% 
  rownames_to_column("sampleID") %>%
  dplyr::select(sampleID, IGHV, TP53, del17p,  age, sex, SF3B1, NOTCH1) %>%
  mutate(`TP53.del17p` = TP53 | del17p, age = age/10) %>%
  mutate_if(is.logical, as.numeric) %>%
  select(-TP53, -del17p) %>%
  dplyr::rename(patientID = sampleID) %>%
  mutate(LF4 = y.pred[patientID], IGHV =factor(IGHV, levels = c("U","M")))

6.2.3.1 Univariate test

Overall survival

#calculate p value for cox regression
pOS <- comSurv(survT$predLF, survT$OS, survT$died)
kmOS <- km(survT$predLF, survT$OS, survT$died, "Overall survival (ICGC-CLL)",
   stat = "maxstat",pval = pOS$p, showTable = TRUE)
kmOS

Time to treatment

pTTT <- comSurv(survT$predLF, survT$TTT, survT$treatedAfter)
kmTTT <- km(survT$predLF, survT$TTT, survT$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "maxstat", pval = pTTT$p, showTable = TRUE)
kmTTT

Prepare a summary table

sumOutcome <- bind_rows(mutate(pTTT, outcome = "TTT"),mutate(pOS,outcome = "OS")) %>% mutate(cohort = "ICGC-CLL", n=nrow(survT))

6.2.3.2 Combine IGHV and CLL-PD

KM plot for subgroup defined by IGHV status and median latent factor values

groupTab <- survT %>% mutate(IGHV = rnaExternal[,patientID]$IGHV) %>%
  filter(!is.na(IGHV)) %>% 
  mutate(group = ifelse(predLF > median(predLF),"highPD","lowPD")) %>%
  mutate(subgroup = paste0(IGHV,"_",group))
  
groupList <- list()
# TTT
groupList[["TTT"]] <- km(groupTab$subgroup, groupTab$TTT, groupTab$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5)

# OS
groupList[["OS"]] <- km(groupTab$subgroup, groupTab$OS, groupTab$died, "Overall survival (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5)

grid.arrange(grobs = groupList, ncol = 2)

6.2.3.3 Multivariate test

surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD= LF4, IGHV_mutated = IGHV, Sex_male=sex, Age=age), "OS", "died")

summary(surv1)

## Call:
## coxph(formula = Surv(time, endpoint) ~ ., data = testTab)
## 
##   n= 247, number of events= 81 
##    (6 observations deleted due to missingness)
## 
##                    coef exp(coef)  se(coef)      z Pr(>|z|)    
## IGHV_mutatedM -0.988694  0.372062  0.238190 -4.151 3.31e-05 ***
## Age            0.716112  2.046461  0.125042  5.727 1.02e-08 ***
## Sex_malem      0.335878  1.399168  0.263964  1.272   0.2032    
## SF3B1          0.215390  1.240346  0.375213  0.574   0.5659    
## NOTCH1        -0.003541  0.996465  0.310940 -0.011   0.9909    
## TP53.del17p    0.196096  1.216643  0.444733  0.441   0.6593    
## CLLPD          0.644055  1.904186  0.258668  2.490   0.0128 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## IGHV_mutatedM    0.3721     2.6877    0.2333    0.5934
## Age              2.0465     0.4886    1.6016    2.6148
## Sex_malem        1.3992     0.7147    0.8340    2.3472
## SF3B1            1.2403     0.8062    0.5945    2.5878
## NOTCH1           0.9965     1.0035    0.5417    1.8329
## TP53.del17p      1.2166     0.8219    0.5089    2.9088
## CLLPD            1.9042     0.5252    1.1469    3.1615
## 
## Concordance= 0.781  (se = 0.027 )
## Likelihood ratio test= 71.14  on 7 df,   p=9e-13
## Wald test            = 67.1  on 7 df,   p=6e-12
## Score (logrank) test = 70.23  on 7 df,   p=1e-12

haOS <- plotHazard(surv1, title = "OS (ICGC-CLL)") + 
  scale_y_log10(limits = c(0.1,5))

haOS

Time to treatment

surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD= LF4, IGHV_mutated = IGHV, Sex_male=sex, Age=age), "TTT", "treatedAfter")

summary(surv1)

## Call:
## coxph(formula = Surv(time, endpoint) ~ ., data = testTab)
## 
##   n= 233, number of events= 127 
##    (20 observations deleted due to missingness)
## 
##                   coef exp(coef) se(coef)      z Pr(>|z|)    
## IGHV_mutatedM -1.41100   0.24390  0.19162 -7.364 1.79e-13 ***
## Age           -0.33373   0.71625  0.07811 -4.273 1.93e-05 ***
## Sex_malem      0.15982   1.17331  0.19901  0.803  0.42191    
## SF3B1          0.77086   2.16162  0.27490  2.804  0.00504 ** 
## NOTCH1         0.32928   1.38997  0.25965  1.268  0.20474    
## TP53.del17p    0.05686   1.05850  0.35661  0.159  0.87332    
## CLLPD          0.56004   1.75074  0.19324  2.898  0.00375 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## IGHV_mutatedM    0.2439     4.1001    0.1675    0.3551
## Age              0.7162     1.3962    0.6146    0.8347
## Sex_malem        1.1733     0.8523    0.7944    1.7330
## SF3B1            2.1616     0.4626    1.2612    3.7049
## NOTCH1           1.3900     0.7194    0.8356    2.3122
## TP53.del17p      1.0585     0.9447    0.5262    2.1293
## CLLPD            1.7507     0.5712    1.1988    2.5568
## 
## Concordance= 0.738  (se = 0.02 )
## Likelihood ratio test= 98.86  on 7 df,   p=<2e-16
## Wald test            = 99.28  on 7 df,   p=<2e-16
## Score (logrank) test = 115.1  on 7 df,   p=<2e-16

haTTT <- plotHazard(surv1, title = "TTT (ICGC-CLL)") + 
  scale_y_log10(limits = c(0.1,5))
haTTT

6.2.3.4 Correlation between predicted CLL-PD and demographics

Age

plotTab <- riskTab %>%
  mutate(C1C2 = paste0("C",rnaExternal[,match(patientID, colnames(rnaExternal))]$`C1C2`),
         sex = ifelse(is.na(sex),NA, ifelse(sex =="f" , "Female","Male")))

corRes <- cor.test(plotTab$age, plotTab$LF4)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoN <- sprintf("N = %s", nrow(filter(plotTab,!is.na(age))))
annoP <- bquote(italic("P")~"="~.(pval))
annoCoef <- sprintf("coefficient = %1.2f",corRes$estimate)

plotAge <- ggplot(plotTab, aes(x = age, y = LF4)) + 
  geom_point(color =colList[3]) + 
  geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + 
  annotate("text", x = max(plotTab$age), y = Inf, label = annoN,
           hjust=1, vjust =1.5, size = 5, parse = FALSE, col= colList[1]) +
  annotate("text", x = max(plotTab$age), y = Inf, label = annoP,
           hjust=1, vjust =3.5, size = 5, parse = FALSE, col= colList[1]) +
  annotate("text", x = max(plotTab$age), y = Inf, label = annoCoef,
           hjust=1, vjust =5.5, size = 5, parse = FALSE, col= colList[1]) +
  ylab("predicted CLL-PD") + xlab("Age (years)") +
  theme_full
plotAge

Sex

corRes <- t.test(LF4 ~ sex, plotTab)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoP <- bquote(italic("P")~"="~.(pval))

plotTab <- group_by(plotTab, sex) %>% mutate(n=n()) %>% ungroup() %>%
  mutate(sex = sprintf("%s\n(N=%s)",sex,n))

plotSex <- ggplot(plotTab, aes(x = sex, y = LF4)) + 
  geom_violin(aes(fill = sex)) +
  geom_point() + 
  annotate("text", x = Inf, y = Inf, label = annoP,
           hjust=1.2, vjust =2, size = 5, parse = FALSE, col= colList[1]) +
  scale_fill_manual(values = colList) +
  ylab("predicted CLL-PD") + xlab("Sex") +
  theme_full + theme(legend.position = "none")
plotSex

Plot C1/C2

plotTab <- filter(plotTab, C1C2 != "CNA")
corRes <- t.test(LF4 ~ C1C2, plotTab)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoP <- bquote(italic("P")~"="~.(pval))

plotTab <- filter(plotTab, !is.na(C1C2)) %>%
  group_by(C1C2) %>% mutate(n=n()) %>% ungroup() %>%
  mutate(C1C2 = sprintf("%s\n(N=%s)",C1C2,n))

plotC1C2 <- ggplot(filter(plotTab,!is.na(C1C2)), aes(x = C1C2, y = LF4)) + 
  geom_violin(aes(fill = C1C2)) +
  geom_point() + 
  annotate("text", x = Inf, y = Inf, label = annoP,
           hjust=1.1, vjust =2, size = 5, parse = FALSE, col= colList[1]) +
  scale_fill_manual(values = colList[3:length(colList)]) +
  ylab("predicted CLL-PD") + xlab("C1/C2 group") +
  theme_full + theme(legend.position = "none")
plotC1C2

6.2.4 Association with genomics

#t-test
y <- y.pred
geneICGC <- geneICGC[names(y),]
dim(geneICGC)

## [1] 253  99

tRes <- apply(geneICGC, 2, function(x) {
  res <- t.test(y ~ as.factor(x), var.equal = TRUE)
  data.frame(p = res$p.value, 
             df = res$estimate[[2]] - res$estimate[[1]])
}) %>% bind_rows() %>% mutate(gene = colnames(geneICGC),
                              p.adj = p.adjust(p, method = "BH")) %>%
  arrange(p)
filter(tRes, p.adj <0.05) %>% mutate_if(is.numeric, formatNum, digits =3, format = "e") %>% DT::datatable()

plotGeneVolcano <- plotVolcano(tRes, posCol = colList[1], negCol = colList[2],
            x_lab = "Difference of mean", ifLabel = TRUE) + 
  theme(legend.position = "none",
        plot.margin = margin(8,8,8,8))
plotGeneVolcano

6.2.5 Assocation with mutation load

Function to plot and annotate assocations

plotCor <- function(plotTab, x, y, x_label, y_label, title, color = "black") {
  corRes <- cor.test(plotTab[[x]], plotTab[[y]], use = "pairwise.complete.obs")
  annoCoef <- paste("'coefficient ='~",format(corRes$estimate,digits = 2))
  annoP <- paste("italic(P)~'='~",formatNum(corRes$p.value, digits = 1, format = "e"))
  
  ggplot(plotTab, aes_string(x = x, y = y)) + 
    geom_point(shape = 21, fill =color, size=3) + 
    geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + 
    annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoCoef,
           hjust=1, vjust =1.5, size = 5, parse = TRUE, col= colList[1]) +
    annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoP,
           hjust=1, vjust =3, size = 5, parse = TRUE, col= colList[1]) +
    ylab(y_label) + xlab(x_label) + ggtitle(title) +
    theme_full + theme(plot.margin = margin(8,8,8,8))
}

plotTab <- tibble(value = y.pred,load=rnaExternal[,names(y.pred)]$mutationLoad) %>%
  filter(!is.na(load))

plotMut <- plotCor(plotTab, "value", "load", "CLL-PD", "Total number of mutations",
        sprintf("WGS dataset (ICGC, n=%s)",nrow(plotTab)), colList[[6]])
plotMut

6.2.6 Enrichment analysis

highSet <- c("MYC targets v1", "MYC targets v2", "mTORC1 signaling","Oxidative phosphorylation")
gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL"))
designMat <- model.matrix(~ 1 + y.pred)
enrichRes <- runCamera(assay(rnaExt.vst[,names(y.pred)]), designMat, gmts$H, 
                       id = rowData(rnaExt.vst)$symbol, 
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "ICGC-CLL cohort (Hallmarks, 5% FDR)", 
                       setToHighlight = highSet, setMap = setMap)
enrichICGC <- enrichRes$enrichPlot
enrichICGC

6.3 Three external cohorts with microarray data

6.3.1 Query data from GEO and format datasets

A list object to store external data

Sys.setenv("VROOM_CONNECTION_SIZE" = 131072*1000)
gseList <- list()

6.3.1.1 GSE22762 (Munic cohort)

An eight-gene expression signature for the prediction of survival and time to treatment in chronic lymphocytic leukemia

gse.all <- getGEO("GSE22762", GSEMatrix = TRUE)
gse <- gse.all[[1]]
gse$patID <- sapply(str_split(gse$title,"[_ ]"), function(x) x[4])
colnames(gse) <- gse$patID

exprs(gse) <- log2(exprs((gse)))

#only keep genes with entrenz ID presented in internal dataset
gse <- gse[!fData(gse)$ENTREZ_GENE_ID %in% c("",NA)]
gse <- gse[fData(gse)$ENTREZ_GENE_ID %in% rowData(rna)$entrezgene]


#reorder the expression matrix, so when subsetting on genes and there are multiple mapings, always put higher variant genes on top
sdsNew <- genefilter::rowSds(exprs(gse))
gse <- gse[order(sdsNew, decreasing = TRUE),]


gseList[["GSE22762"]] <- gse

6.3.1.2 GSE39671 (UCSD cohort)

Expression data from untreated CLL patients

gse <- getGEO("GSE39671", GSEMatrix = TRUE)
gse <- gse[[1]]
exprs(gse) <- log2(exprs((gse)))

#only keep genes with entrenz ID presented in internal dataset
gse <- gse[!fData(gse)$ENTREZ_GENE_ID %in% c("",NA)]
gse <- gse[fData(gse)$ENTREZ_GENE_ID %in% rowData(rna)$entrezgene]

#reorder the expression matrix, so when subsetting on genes and there are multiple mapings, always put higher variant genes on top
sdsNew <- genefilter::rowSds(exprs(gse))
gse <- gse[order(sdsNew, decreasing = TRUE),]

gseList[["GSE39671"]] <- gse

6.3.1.3 GSE10138 (Duke cohort)

A Genomic Approach to Improve Prognosis and Predict Therapeutic Response in Chronic Lymphocytic Leukemia (Duke_VA)

gse <- getGEO("GSE10138", GSEMatrix = TRUE)
gse <- gse[[1]]

patID <- sapply(str_split(gse$title," "),function(x) x[3])
patID <- sapply(str_split(patID,"-"),function(x) x[1])
gse$patID <- patID
gse <- gse[,!duplicated(gse$patID)]
exprs(gse) <- log2(limma::normalizeVSN(gse)) #the raw microarray was stored. Normalization needs to be performed first.

#only keep genes with entrenz ID presented in internal dataset
gse <- gse[fData(gse)$ENTREZ_GENE_ID %in% rowData(rna)$entrezgene]


#reorder the expression matrix, so when subsetting on genes and there are multiple mapings, always put higher variant genes on top
sdsNew <- genefilter::rowSds(exprs(gse))
gse <- gse[order(sdsNew, decreasing = TRUE),]


patAnno <- tibble(sampleID = colnames(gse),
                  type = gse$source_name_ch1)

#the object dose not have patient annotation, the clinical outcome data is obtained from the publication: PMID: 19861443
annoPath <- system.file("externalData/GSE10138_patAnno.csv", package = "mofaCLL")

csvAnno <- read.csv2(annoPath, stringsAsFactors = FALSE) %>%
  dplyr::select(Group, TTT..years., Rx) %>%
  rename(TTT = TTT..years., treatedAfter = Rx) %>%
  filter(!is.na(TTT)) %>% 
  mutate(treatedAfter = ifelse(treatedAfter == "Y ",TRUE,FALSE),
         sampleID = colnames(gse)) %>%
  as_tibble() 

patAnno <- left_join(patAnno, csvAnno)

gseList[["GSE10138"]] <- list(gse = gse, patAnno = patAnno)

6.3.2 Test associations with outcomes

6.3.2.1 Munic cohort (GSE22762)

6.3.2.1.1 Preprocessing

Filter out low variant genes in both datasets

gse <- gseList$GSE22762

#internal dataset
rnaSub <- rna.vst[! rowData(rna.vst)$entrezgene %in% c("",NA),]
exprMat <- assay(rnaSub)
sds <- genefilter::rowSds(exprMat)
rnaSub <- rnaSub[order(sds, decreasing = T)[1:10000],]

#external
gseMat <- exprs(gse)
sds <- genefilter::rowSds(gseMat)
gseSub <- gse[order(sds, decreasing = T)[1:10000],] #top variatne genes

#subset for common genes
commonGene <- intersect(fData(gseSub)$ENTREZ_GENE_ID, rowData(rnaSub)$entrezgene)
gseSub <- gseSub[match(commonGene,fData(gseSub)$ENTREZ_GENE_ID),]
rownames(gseSub) <- rownames(rnaSub[match(commonGene, rowData(rnaSub)$entrezgene),])

#get expression matrix
gseMat <- exprs(gseSub)
exprMat <- assay(rnaSub)[rownames(gseMat),]

6.3.2.1.2 Model trainning

Remove highly correlated features in the training set

X <- mscale(exprMat)
reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced

set.seed(5862)
y <- rna.vst$factor
lassoRes <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se")
useModel <- lassoRes$modelList[[which.max(lassoRes$r2Train)]]

6.3.2.1.3 Predict CLL-PD

Input matrix from new dataset

newX <- gseMat[colnames(X),]
newX <- t(mscale(newX)) 
#dimension of test set
dim(newX)

## [1]  107 5055

Predict factor using the best model

y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1]
y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))

6.3.2.1.4 Association with outcomes

pTab <- pData(gse)
patAnno <- tibble(sampleID = rownames(pTab),
                  TTT = as.numeric(pTab$`time to treatment (days):ch1`)/365,
                  treatedAfter = pTab$`treatment status (censoring day):ch1`,
                  OS = as.numeric(pTab$`overall survival (days):ch1`)/365,
                  died = pTab$`life status (censoring day):ch1`) %>%
  mutate(treatedAfter = as.logical(as.numeric(treatedAfter)),
         died = as.logical(as.numeric(died)))


plotTab <- tibble(sampleID = names(y.pred), factor = y.pred) %>%
  left_join(patAnno, by = "sampleID")

Factor vs TTT (cox regression)

pTTT <- comSurv(plotTab$factor, plotTab$TTT, plotTab$treatedAfter)

Factor vs OS (cox regression)

pOS <- comSurv(plotTab$factor, plotTab$OS, plotTab$died)

Add to summary table

sumOutcome <- mutate(bind_rows(mutate(pTTT, outcome = "TTT"),mutate(pOS,outcome = "OS")),cohort = "Munich",n = nrow(plotTab)) %>% bind_rows(sumOutcome)

KM plots

kmTTT_m1 <- km(plotTab$factor, plotTab$TTT, plotTab$treatedAfter, stat = "maxstat", pval = pTTT$p,  
   titlePlot = "Time to treatment (Munich)", showTable = TRUE)

kmOS_m1 <- km(plotTab$factor, plotTab$OS, plotTab$died, stat = "maxstat", pval =pOS$p,
   titlePlot = "Overall survival (Munich)", showTable = TRUE)
kmOS_m1

kmTTT_m1

6.3.2.2 Enrichment analysis

gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL"))
designMat <- model.matrix(~ 1 + y.pred)
enrichRes <- runCamera(exprs(gse), designMat, gmts$H, 
                       id =fData(gse)$`Gene Symbol`, 
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Munich cohort (Hallmarks, 5% FDR)", 
                       setToHighlight = highSet, setMap = setMap)
enrichMunich <- enrichRes$enrichPlot
enrichMunich

6.3.2.3 UCSD cohort (GSE39671)

6.3.2.3.1 Preprocessing

gse <- gseList[["GSE39671"]]

#internal dataset
rnaSub <- rna.vst[! rowData(rna.vst)$entrezgene %in% c("",NA),]
exprMat <- assay(rnaSub)
sds <- genefilter::rowSds(exprMat)
rnaSub <- rnaSub[order(sds, decreasing = T)[1:10000],]

#external
gseMat <- exprs(gse)
sds <- genefilter::rowSds(gseMat)
gseSub <- gse[order(sds, decreasing = T)[1:10000],] #top variatne genes

#subset for common genes
commonGene <- intersect(fData(gseSub)$ENTREZ_GENE_ID, rowData(rnaSub)$entrezgene)
gseSub <- gseSub[match(commonGene,fData(gseSub)$ENTREZ_GENE_ID),]
rownames(gseSub) <- rownames(rnaSub[match(commonGene, rowData(rnaSub)$entrezgene),])

#get expression matrix
gseMat <- exprs(gseSub)
exprMat <- assay(rnaSub)[rownames(gseMat),]

6.3.2.3.2 Model training

Remove highly correlated features

X <- mscale(exprMat)
#remove highly correlated genes
reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced

set.seed(5862)
y <- rna.vst$factor
lassoRes <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se")
useModel <- lassoRes$modelList[[which.max(lassoRes$r2Train)]]

6.3.2.3.3 Predict CLL-PD

Input matrix from new dataset

newX <- gseMat[colnames(X),]
newX <- t(mscale(newX)) 
#dimension of test set
dim(newX)

## [1]  130 4933

Predict factor using the best model

y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1]
y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))

6.3.2.3.4 Association with outcome

patAnno <- tibble(sampleID = colnames(gse),
                  TTT = as.numeric(gse$`sampling time to first treatment (years):ch1`),
                  treatedAfter = gse$`treatment event (1:ch1`) %>%
  separate(treatedAfter, c("a","b","treatedAfter"),":") %>%
  mutate(treatedAfter = as.logical(as.numeric(treatedAfter))) %>%
  dplyr::select(sampleID, TTT, treatedAfter)


plotTab <- tibble(sampleID = names(y.pred), factor = y.pred) %>%
  left_join(patAnno, by = "sampleID")

Factor vs TTT (cox regression)

pTTT <- comSurv(plotTab$factor, plotTab$TTT, plotTab$treatedAfter)

Add to summary table

sumOutcome <- mutate(pTTT, outcome = "TTT", cohort = "UCSD",n=nrow(plotTab)) %>% bind_rows(sumOutcome)

KM plots

kmTTT_m2 <- km(plotTab$factor, plotTab$TTT, plotTab$treatedAfter, stat = "maxstat", pval = pTTT$p,  
   titlePlot = "Time to treatment (UCSD)", showTable = TRUE)

kmTTT_m2

6.3.2.3.5 Enrichment

gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL"))
designMat <- model.matrix(~ 1 + y.pred)
enrichRes <- runCamera(exprs(gse), designMat, gmts$H, 
                       id =fData(gse)$`Gene Symbol`, 
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "UCSD cohort (Hallmarks, 5% FDR)", 
                       setToHighlight = highSet, setMap = setMap)
enrichUCSD <- enrichRes$enrichPlot
enrichUCSD

6.3.2.4 Duke cohort (GSE10138)

6.3.2.4.1 Preprocessing

gse <- gseList$GSE10138$gse
patAnno <- gseList$GSE10138$patAnno

#internal dataset
rnaSub <- rna.vst[! rowData(rna.vst)$entrezgene %in% c("",NA),]
exprMat <- assay(rnaSub)
sds <- genefilter::rowSds(exprMat)
rnaSub <- rnaSub[order(sds, decreasing = T)[1:10000],]

#external
gseMat <- exprs(gse)
sds <- genefilter::rowSds(gseMat)
gseSub <- gse[order(sds, decreasing = T)[1:10000],] #top variatne genes

#subset for common genes
commonGene <- intersect(fData(gseSub)$ENTREZ_GENE_ID, rowData(rnaSub)$entrezgene)
gseSub <- gseSub[match(commonGene,fData(gseSub)$ENTREZ_GENE_ID),]
rownames(gseSub) <- rownames(rnaSub[match(commonGene, rowData(rnaSub)$entrezgene),])

#get expression matrix
gseMat <- exprs(gseSub)
exprMat <- assay(rnaSub)[rownames(gseMat),]

6.3.2.4.2 Model training

Remove highly correlated features

X <- t(mscale(exprMat))
reRes <- removeCorrelated(X, cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced

set.seed(5862)
y <- rna.vst$factor

lassoRes <- runGlm(X,y, method = "lasso", repeats=20, folds=5, lambda ="lambda.1se")
useModel <- lassoRes$modelList[[which.max(lassoRes$r2Train)]]

6.3.2.4.3 Predict CLL-PD

newX <- t(mscale(gseMat[colnames(X),]))
#dimension of test set
dim(newX)

## [1]   61 3478

Predict factor using the best model

y.pred <- glmnet:::predict.cv.glmnet(useModel, newx = newX)[,1]
y.pred <- (y.pred - mean(y.pred))/(2*sd(y.pred))

6.3.2.4.4 Association with outcome

plotTab <- tibble(sampleID = names(y.pred), factor = y.pred) %>%
  left_join(patAnno, by = "sampleID")

Factor vs TTT (cox regression)

pTTT <- comSurv(plotTab$factor, plotTab$TTT, plotTab$treatedAfter)

Add information to summary table

sumOutcome <- mutate(pTTT, outcome = "TTT", cohort = "Duke",n=nrow(plotTab)) %>% 
  bind_rows(sumOutcome)

KM plots

kmTTT_m3 <- km(plotTab$factor, plotTab$TTT, plotTab$treatedAfter, stat = "maxstat", pval =pTTT$p,  
   titlePlot = "Time to treatment (Duke)", showTable = TRUE)

kmTTT_m3

6.3.2.4.5 Enrichment

gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL"))
designMat <- model.matrix(~ 1 + y.pred)
enrichRes <- runCamera(exprs(gse), designMat, gmts$H, 
                       id =fData(gse)$`Gene Symbol`, 
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Duke cohort (Hallmarks, 5% FDR)", 
                       setToHighlight = highSet, setMap = setMap)
enrichDuke <- enrichRes$enrichPlot
enrichDuke

6.4 Summarization plot of p-values and hazzard ratios in external cohorts

plotTab <- sumOutcome %>%
  mutate(cohort = sprintf("%s\n(n=%s)",cohort,n)) %>%
  mutate(cohort = factor(cohort, levels = unique(cohort)))


haSumPlot <- ggplot(plotTab, aes(x=cohort, y = HR, col = outcome, dodge = outcome)) + 
  geom_hline(yintercept = 1, linetype = "dotted") +
  geom_point(position = position_dodge(width=0.8)) +
  geom_errorbar(position = position_dodge(width =0.8), 
                aes(ymin = lower, ymax = higher), width = 0.3, size=1) + 
  geom_text(position = position_dodge2(width = 0.8),
            aes(x=as.numeric(as.factor(cohort))+0.15, 
                label = sprintf("italic(P)~'=%s'",formatNum(p, digits = 1, format = "e"))),
            color = "black",size =5, parse = TRUE) +
  xlab("Cohorts") + ylab("Hazard ratio") +
  scale_y_log10(limits = c(0.3,11)) +
  coord_flip() + theme_full + theme(legend.title = element_blank(),
                                    legend.position = c(0.2,0.1),
                                    legend.background = element_blank(),
                                    legend.key.size = unit(0.5,"cm"),
                                    legend.key.width = unit(0.6,"cm"),
                                    legend.text = element_text(size=rel(1.2))) +
  scale_color_manual(values = c(OS = colList[3], TTT = colList[5])) 

haSumPlot

6.5 Arrange plots for Section 2 in the manuscript

6.5.1 Main figure 2

title = ggdraw() + draw_figure_label("Figure 2", fontface = "bold", position = "top.left",size=22)

pout <- ggdraw() +
  draw_plot(varPlot, 0.02, 0.5, 0.22, 0.5) +
  draw_plot(groupList[["TTT"]], 0.25, 0.5 , 0.36, 0.5) +
  draw_plot(groupList[["OS"]], 0.64, 0.5, 0.36, 0.5) +
  draw_plot(haSumPlot, 0, 0, 0.32, 0.465) +
  draw_plot(haTTT, 0.33, 0, 0.30, 0.48) +
  draw_plot(haOS, 0.66, 0, 0.30, 0.48) +
  draw_plot_label(c("a", "b", "c", "d","e","f"), 
                  c(0, 0, 0.33, 0.66, 0.33, 0.65), c(1, 0.5, 1, 1, 0.5,0.5), size = 20)
plot_grid(title, pout, rel_heights = c(0.05,0.95), ncol = 1)

6.5.2 Supplementary figure for KM-plots

plot_grid(kmTTT, kmOS, kmTTT_m1, kmOS_m1, kmTTT_m2, kmTTT_m3, nrow =3)

6.5.3 Supplementary figure for the assocations between predicted CLL-PD and demographics in ICGC cohort

plot_grid(plotAge,plotSex,nrow=1)

6.5.4 Supplementary figure for the associations between predicted CLL-PD and genomics in ICGC cohort

plot_grid(plotGeneVolcano, plotMut,rel_widths = c(1,1), nrow =1, align = "h", axis="t")

6.5.5 Supplementary figure for the enrichment analysis in ICGC cohort

enrichICGC

6.5.6 Supplementary figure for the enrichment analysis in four external cohorts

plot_grid(enrichICGC, enrichMunich, enrichUCSD, enrichDuke, ncol=2,
          align = "hv", axis = "l")

7 Association between F4 (CLL-PD) and genomics

Load datasets

data("mofaOut","gene","rna","mutLoad","tabFACS")

Download methylation dataset from server to a temporary folder

methPath <- file.path(tempdir(),"meth.RData")
if(!file.exists(methPath)) {
  download.file("https://www.huber.embl.de/users/jlu/data/meth.RData",
                methPath)
  load(methPath)
} else {
  load(methPath)
}

Process MOFA model

#get factor values
facTab <- getFactors(
  MOFAobject, 
  factors = "LF4",
  as.data.frame = TRUE
) %>% as_tibble() 

#get all factors
facTab.all <- getFactors(
  MOFAobject,
  factors = "all",
  as.data.frame = TRUE
) %>% as_tibble()

#get weights
weightTab <- getWeights(
  MOFAobject,
  factors = "all",
  as.data.frame = TRUE
) %>% as_tibble() %>%
  mutate(feature = ifelse(feature == "IGHV.status", "IGHV", feature))
#get training data
trainData <- getTrainData(MOFAobject)

detach("package:MOFA", unload = TRUE)

7.1 Loading of features in the genomic view

plotGeneWeight <- plotTopWeights.m(weightTab, view =  "Mutations", factor = "LF4", nfeatures = 5) + ylab("Absolute loading on F4 (CLL-PD)") +
  theme(plot.margin = margin(10,0,10,0))
plotGeneWeight

7.2 Test of associations between genetic variations with latent factor

Student’s t-test

sampleOverlap <- intersect(rownames(gene.unfiltered), facTab$sample)
genData <- gene.unfiltered[sampleOverlap,]

#remove gene with higher than 40% missing values
genData <- genData[,colSums(is.na(genData))/nrow(genData) <= 0.4]

#remove genes with less than 3 mutated cases
genData <- genData[,colSums(genData, na.rm = TRUE) >= 3]

#t-test
y <- facTab[match(sampleOverlap, facTab$sample),]$value
tRes <- apply(genData, 2, function(x) {
  res <- t.test(y ~ as.factor(x), var.equal = TRUE)
  data.frame(p = res$p.value, 
             df = res$estimate[[2]] - res$estimate[[1]])
}) %>% bind_rows() %>% mutate(gene = colnames(genData),
                              p.adj = p.adjust(p, method = "BH")) %>%
  arrange(p)
filter(tRes, p.adj <0.05) %>% mutate_if(is.numeric, formatNum, digits =3, format = "e") %>% DT::datatable()

Volcano plot

plotGeneVolcano <- plotVolcano(tRes, posCol = colList[1], negCol = colList[2],
            x_lab = "Difference of mean", ifLabel = TRUE) + 
  theme(legend.position = "none",
        plot.margin = margin(8,8,8,8),
        axis.title.y = element_text(vjust=0))
plotGeneVolcano

Boxplots of associated genes (5% FDR)

tRes.sig <- filter(tRes, p.adj <= 0.05)
plotList <- lapply(tRes.sig$gene, function(geneName) {
   plotTab <- tibble(factor = y, 
                     status = genData[,geneName]) %>%
     filter(!is.na(status)) %>%
     mutate(status = ifelse(status ==1, "mut","wt")) %>%
     group_by(status) %>% mutate(n=n()) %>% ungroup() %>%
     mutate(status = sprintf("%s\n(N=%s)",status,n)) %>%
     arrange(desc(n)) %>% mutate(status = factor(status, levels = unique(status))) %>%
     mutate(geneName = italicizeGene(geneName))
   pval <- formatNum(filter(tRes, gene == geneName)$p, digits = 1, format="e")
   titleText <- sprintf("%s~' ('~italic(P)~'='~'%s'~')'",unique(plotTab$geneName),pval) 
   ggplot(plotTab, aes(x=status, y = factor)) +
     geom_boxplot(width=0.3, aes(fill = status), outlier.shape = NA) + 
     geom_beeswarm(col = "black", size =1,cex = 1.5, alpha=0.5) +
     ggtitle(parse(text = titleText))+
     #ggtitle(sprintf("%s (p = %s)",geneName, formatNum(pval, digits = 1, format = "e"))) + 
     ylab("F4 value") + xlab("") +
     scale_fill_manual(values = colList[3:4]) +
     theme_full +
     theme(legend.position = "none",
           plot.title = element_text(hjust = 0.5),
           plot.margin = margin(3,3,3,3))
})
grid.arrange(grobs = plotList, ncol=3)

plotGeneViolin <- plotList

Heatmap plot showing associated genetic features (5% FDR), arranged by F4 values.

annoCol <- data.frame(row.names = sampleOverlap,
                      F4 = facTab[match(sampleOverlap, facTab$sample),]$value)
annoCol <- annoCol[order(annoCol$F4),,drop = FALSE]
plotMat <- t(genData[rownames(annoCol),filter(tRes, p.adj <= 0.05)$gene])
plotMat[is.na(plotMat)] <- 0
breaks <- c(0,0.5,1)
colors <- c("white","black")
rowLabs <- italicizeGene(rownames(plotMat))
plotGeneHeatmap <- pheatmap(plotMat, cluster_cols = FALSE, cluster_rows = FALSE, 
                            annotation_col = annoCol, border_color = "black",
         breaks = breaks, color = colors, show_colnames = FALSE, 
         labels_row = parse(text = rowLabs),
         legend = FALSE, silent = TRUE)$gtable
plot_grid(plotGeneHeatmap)

7.3 Correlation with the mutation load

7.4 Scatter plot of correlation with mutations load and recurrent mutation load

compareTab <- filter(mutLoadSum, group == "Overall", subgroup == "Overall") %>%
  select(patID, platform, n)
sumReCurr.cnv <- gene.unfiltered[,c("trisomy12","del13q","del11q","del17p",
                                    "gain2p","gain8q","del8p","del6q","del18p","del20p")]
sumRecurr.snv <- gene.unfiltered[,71:113] #only select recurrent gene mutations
sumRecurr <- cbind(sumRecurr.snv, sumReCurr.cnv) %>%
  data.frame() %>% rownames_to_column("patID") %>%
  gather(key = "gene", value = "status",-patID) %>%
  group_by(patID) %>% summarise(n.recurr = sum(status,na.rm = TRUE))

compareTab <- left_join(compareTab, sumRecurr, by = "patID") %>% 
  mutate(LF4 = facTab[match(patID, facTab$sample),]$value) %>%
  filter(!is.na(LF4),!is.na(n),!is.na(n.recurr))

7.5 Assocations with total number of mutations

Function to plot and annotate assocations

plotCor <- function(plotTab, x, y, x_label, y_label, title, color = "black") {
  corRes <- cor.test(plotTab[[x]], plotTab[[y]], use = "pairwise.complete.obs")
  annoCoef <- paste("'coefficient ='~",format(corRes$estimate,digits = 2))
  annoP <- paste("italic(P)~'='~",formatNum(corRes$p.value, digits = 1, format = "e"))
  
  ggplot(plotTab, aes_string(x = x, y = y)) + 
    geom_point(shape = 21, fill =color, size=3) + 
    geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + 
    annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoCoef,
           hjust=1, vjust =1.5, size = 5, parse = TRUE, col= colList[1]) +
    annotate("text", x = max(plotTab[[x]]), y = Inf, label = annoP,
           hjust=1, vjust =3, size = 5, parse = TRUE, col= colList[1]) +
    ylab(y_label) + xlab(x_label) + ggtitle(title) +
    theme_full + theme(plot.margin = margin(8,8,8,8))
}

7.5.0.1 WES

plotTab <- filter(compareTab, platform == "WES") 
corTotal.wes <- plotCor(plotTab, "LF4", "n", "CLL-PD", "Total number of mutations",
        sprintf("WES dataset (n=%s)",nrow(plotTab)), colList[[6]])
corTotal.wes

7.5.0.2 WGS

plotTab <- filter(compareTab, platform == "WGS") 
corTotal.wgs <- plotCor(plotTab, "LF4", "n", "CLL-PD", "Total number of mutations",
        sprintf("WGS dataset (n=%s)",nrow(plotTab)), colList[[6]])
corTotal.wgs

7.5.1 Assocations with recurrent aberration

plotTab <- distinct(compareTab, patID, .keep_all = TRUE)
corRecurr <- plotCor(plotTab, "LF4", "n.recurr", "CLL-PD", "Number of recurrent aberrations",
        "",colList[[5]]) 
corRecurr

8 Assocation between F4 (CLL-PD) and DNA methylation

8.1 Correlation test using linear model

methMat <- assays(meth[,colnames(meth) %in% facTab$sample])[["beta"]]
factorMatrix <- facTab.all %>% spread(key = factor, value = value) %>%
  data.frame() %>% column_to_rownames("sample") 
factorMatrix <- factorMatrix[colnames(methMat),]
factorMatrix <- factorMatrix[,complete.cases(t(factorMatrix))]
designMat <- model.matrix(~1 + .,factorMatrix)
fit <- lmFit(methMat, designMat)
fit2 <- eBayes(fit)
corRes <- limma::topTable(fit2, number ="all", coef = "LF4", adjust.method = "BH") %>%
  rownames_to_column("id") %>% as_tibble()
corRes.LF1 <- limma::topTable(fit2, number ="all", coef = "LF1", adjust.method = "BH") %>%
  rownames_to_column("id") %>% as_tibble()

Number of tested associations

dim(methMat)

## [1] 394735    158

8.2 Compare number of significant correlations in F1 and F4

numTab.LF4 <- filter(corRes, adj.P.Val <= 0.01) %>% mutate(direction = ifelse(t>0, "pos","neg")) %>%
  group_by(direction) %>% summarise(number = length(id)) %>%
  mutate(factor = "F4 (CLL-PD)")
numTab.LF1 <- filter(corRes.LF1, adj.P.Val <= 0.01) %>% mutate(direction = ifelse(t>0, "pos","neg")) %>%
  group_by(direction) %>% summarise(number = length(id)) %>% 
  mutate(factor = "F1")

plotTab <- bind_rows(numTab.LF4, numTab.LF1)
sigNumPlot <- ggplot(plotTab, aes(x=factor, y=number, fill = direction)) + 
  geom_bar(stat = "identity", width = 0.8,
           position = position_dodge2(width = 6),
           col = "black") +
  geom_text(aes(label=number), 
            position = position_dodge(width = 0.9),
            size=4, hjust=-0.1)  +
  scale_fill_manual(name = "", labels = c("Hypomethylated","Hypermethylated"), values = colList) +
  coord_flip(ylim = c(0,70000), expand = FALSE) + xlab("") + ylab("Number of significant associations") + theme_half +
  theme(legend.position = c(0.75,0.22), legend.text = element_text(size=12), legend.background = element_rect(fill = NA))
sigNumPlot

8.3 Correlation between mean methylation values and F4

meanBeta <- colMeans(methMat)
plotTab <- tibble(patID = names(meanBeta),
                  meanBeta = meanBeta,
                  LF4 = facTab[match(patID, facTab$sample),]$value)

corBeta <- cor.test(plotTab$meanBeta, plotTab$LF4)

annoCoef <- paste("'coefficient ='~",format(corBeta$estimate,digits = 2))
annoP <- paste("italic(P)~'='~",formatNum(corBeta$p.value, digits = 1, format = "e"))

plotMeanBeta <- ggplot(plotTab, aes(x = LF4, y = meanBeta)) + 
  geom_point(shape = 21, fill =colList[3], size=3) + 
  geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) + 
  annotate("text", x = 3, y = Inf, label = annoCoef,
           hjust=1, vjust =1.5, size = 5, parse = TRUE, col= colList[1]) +
  annotate("text", x = 3, y = Inf, label = annoP,
           hjust=1, vjust =3, size = 5, parse = TRUE, col= colList[1]) +
  ylab("Mean beta values") + xlab("CLL-PD") +
  theme_full 

plotMeanBeta

8.4 Enrichment of F4 associated probes in the PMD region

8.4.1 Probes in the CLL specific PMD region

Read list of CGs that lie in the PMD region, according to Mallm et al. 2019 (PMID: 31118277)

probesPMD <- readLines(system.file("externalData/PMD_probes.txt", package = "mofaCLL"))

How many CpGs correlated with LF4 are in the PMD region?

sigProbe <- filter(corRes, adj.P.Val <=0.01, t <0)$id
table(sigProbe %in% probesPMD)

## 
## FALSE  TRUE 
## 32920 27044

Fisher’s exact test

testTab <- table(rownames(meth) %in% sigProbe, rownames(meth) %in% probesPMD)
fisher.test(testTab, alternative = "greater")

## 
##  Fisher's Exact Test for Count Data
## 
## data:  testTab
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  2.805021      Inf
## sample estimates:
## odds ratio 
##   2.847808

8.4.2 Probes in the general solo-WCGW PMD region

Download the list of CG probes in the solo-WCGW PMD region (Zhou et al., 2018, PMID: 29610480)

wcgwPath <- system.file("externalData/hm450.comPMD.probes.tsv", package = "mofaCLL")
probesPMD.wcgw <- read_delim(wcgwPath, delim = "\t", col_names = FALSE, skip = 1)[[2]]

How many probes annotated as WCGW probes are on our array?

onArray <- probesPMD.wcgw %in% rownames(meth)
table(onArray)

## onArray
## FALSE  TRUE 
##  1074  5140

#subset
probesPMD.wcgw <- probesPMD.wcgw[onArray]

How many CpGs correlated with LF4 are in the WCGW region?

table(sigProbe %in% probesPMD.wcgw)

## 
## FALSE  TRUE 
## 57089  2875

Fisher’s exact test

testTab <- table(rownames(meth) %in% sigProbe, rownames(meth) %in% probesPMD.wcgw)
fisher.test(testTab, alternative = "greater")

## 
##  Fisher's Exact Test for Count Data
## 
## data:  testTab
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  7.052028      Inf
## sample estimates:
## odds ratio 
##   7.394117

FGSEA tset and plot

sigList <- list("Solo-WCGW CpGs in PMDs" = probesPMD.wcgw)
enrichRes <- runFGSEA(corRes, sigList, pcut =1, maxSize = 10000, name = "id", stat = "t")
enrichPlot <- enrichRes$plots[[1]] + theme(plot.margin = margin(1,20,1,20))
enrichPlot

8.5 Association between CLL-PD(F4) and CLL proliferation ability (Ki-67 populaiton)

8.5.1 Paired t-test

testTab <- tabFACS %>% mutate(posKI = CD19posKI67pos)
waterTab <- filter(testTab, treat == "H2O") %>% select(patID, posKI) %>%
  dplyr::rename(popWater = posKI)
testTab <- filter(testTab, treat!="H2O") %>%
  dplyr::rename(popTreat = posKI) %>% left_join(waterTab)

resPair <- group_by(testTab, treat, groupLF) %>% nest() %>%
  mutate(m = map(data,~t.test(.$popTreat,.$popWater, paired = TRUE))) %>%
  mutate(res = map(m, broom::tidy)) %>% unnest(res) %>%
  select(treat, groupLF, p.value) %>% arrange(p.value)
resPair

## # A tibble: 2 × 3
## # Groups:   groupLF, treat [2]
##   treat groupLF   p.value
##   <chr> <fct>       <dbl>
## 1 CPG   high    0.0000584
## 2 CPG   low     0.0708

8.5.2 T-test for compare CLL-PD high and low for CpG treatment

8.5.2.1 Univariate test

testTab <- tabFACS %>% filter(treat == "CPG") %>% mutate(posKI = CD19posKI67pos)
resUni <- t.test(posKI ~ groupLF, testTab)
resUni

## 
##  Welch Two Sample t-test
## 
## data:  posKI by groupLF
## t = 4.0037, df = 22.314, p-value = 0.0005845
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   5.402964 16.995869
## sample estimates:
## mean in group high  mean in group low 
##          14.173333           2.973917

8.5.2.2 Multi-vairate test

resMulti <- car::Anova(lm(posKI ~ groupLF + IGHV, testTab))
resMulti

## Anova Table (Type II tests)
## 
## Response: posKI
##            Sum Sq Df F value    Pr(>F)    
## groupLF    789.94  1 15.2933 0.0006602 ***
## IGHV       240.09  1  4.6481 0.0413283 *  
## Residuals 1239.66 24                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

8.5.3 Box plots to show associations

plotTab <- tabFACS %>% mutate(posKI = CD19posKI67pos/(CD19posKI67pos+CD19posKI67neg)) %>%
  mutate(subtype = plyr::revalue(IGHV, c("M"="M-CLL","U"="U-CLL")),
         group = plyr::revalue(groupLF, c("high" = "high CLL-PD\n(n=12)","low"="low CLL-PD\n(n=12)")),
         treat = plyr::revalue(treat, c("H2O" = "water", "CPG" = "CpG ODN"))) %>%
  mutate(group = factor(group,levels = c("high CLL-PD\n(n=12)","low CLL-PD\n(n=12)")),
         treat = factor(treat, levels = c("water","CpG ODN"))) %>%
  arrange(posKI) %>% mutate(patID = factor(patID, levels = unique(patID)))

annoText <- tibble(tt = c(paste("italic(P)~'=",formatNum(resPair$p.value[1], digits = 2),"'"),
                          paste("italic(P)~'=",formatNum(resPair$p.value[2], digits = 2),"'")), 
                   group  = c("high CLL-PD\n(n=12)","low CLL-PD\n(n=12)"))

plotCpG <- ggplot(plotTab, aes(x=treat, y = CD19posKI67pos)) + #geom_boxplot(outlier.alpha = NA) + 
  geom_point(aes(fill = IGHV), shape =21, size =4, cex=4, alpha=0.8) +
  geom_segment(x =1, xend =2, y=40,yend=40, size=0.2) +
  geom_line(aes(x=treat,y=CD19posKI67pos, group = patID), col = "grey50", linetype = "dashed") +
  geom_text(data =annoText, aes(label = tt), x = 1.5, y =40, vjust=-1, parse=TRUE, size=5) +
  scale_fill_manual(values = colList[c(3,6)]) +
  facet_wrap(~group) +
  ylim(c(0,55)) + theme_half+
  theme(legend.position = "bottom", strip.text = element_text(size=15, face= "bold"),
        strip.background = element_rect(fill = "white",color = "white"),
        axis.text.x = element_text(size=15.5),
        legend.text = element_text(size=15),
        legend.title = element_text(size=13),
        legend.margin = margin(0,0,0,0),
        legend.box.margin=margin(-5,-5,-5,-5)) +
  ylab("% Ki-67+ CD19+ cells") + xlab("")
plotCpG

8.6 Organizing plots for Section 3 in the manuscript

8.6.1 Main Figure 3

enMyc <- pictureGrob(readPicture(system.file("externalData//MYC.eps.xml", package = "mofaCLL")))

title = ggdraw() + draw_figure_label("Figure 3", fontface = "bold", position = "top.left",size=22)

pout <- plot_grid(plot_grid(plotGeneWeight, plotGeneVolcano, ncol=1, labels = c("a","b"), 
                            rel_heights = c(0.4,0.6), align = "hv", axis = "lr", label_size = 22),
                  plot_grid(corRecurr + theme(axis.title.y = element_text(vjust =0)),
                             plotMeanBeta , ncol=1, labels = c("c","e"), rel_heights = c(5,5),align = "hv", axis = "lr", label_size = 22),
          plot_grid(NULL, sigNumPlot,enMyc, plotCpG, ncol=1, labels = c("d","","f","g"), rel_heights = c(0.05,0.3,0.2,0.45), 
                    align = "hv", label_size = 22),
          ncol=3,rel_widths = c(1,0.9,1))

plot_grid(title, pout, rel_heights = c(0.05,0.95), ncol = 1)

8.6.2 Supplementary figures

8.6.2.1 Associations between genomic variantions and LF4

gridViolin <- plot_grid(plotlist = plotGeneViolin, ncol =3)
pout <- ggdraw() +
  draw_plot(plotGeneHeatmap, 0, 0.70, 1, 0.30) +
  draw_plot(gridViolin, 0, 0 , 1, 0.69)
pout

8.6.2.2 Associations between LF4 and mutation load

plot_grid(corTotal.wes, corTotal.wgs,
          labels = c(" "," "), label_size = 22)

9 Gene expression signatures for CLL-PD (F4)

Load datasets

data("mofaOut","gene","drug","validateScreen","rna","seahorse","protein","mito")

gmts = list(H = system.file("externalData/h.all.v6.2.symbols.gmt", package = "mofaCLL"))

setMap <- read_tsv(system.file("externalData/setToPathway.txt", package = "mofaCLL"), col_types = "cc")

mitoProtList <- readxl::read_xls(system.file("externalData/Human.MitoCarta3.0.xls", package = "mofaCLL"), 
                                 sheet = 2)$Symbol

Process MOFA model

library(MOFA)
#get factor values
facTab <- getFactors(
  MOFAobject, 
  factors = "LF4",
  as.data.frame = TRUE
) %>% as_tibble()

#get all factors
facTab.all <- getFactors(
  MOFAobject,
  factors = "all",
  as.data.frame = TRUE
) %>% as_tibble()

#get weights
weightTab <- getWeights(
  MOFAobject,
  factors = "all",
  as.data.frame = TRUE
) %>% as_tibble() %>%
  mutate(feature = ifelse(feature == "IGHV.status", "IGHV", feature))
#get training data
trainData <- getTrainData(MOFAobject)

detach("package:MOFA", unload = TRUE) #detach MOFA because it's "predict" function marks the "predict" function in glmnet

9.1 Pre-processing data

Processing RNAseq data

dds<-estimateSizeFactors(rna)
dds$factor <- facTab[match(dds$PatID, facTab$sample),]$value
dds$IGHV <- gene[match(dds$PatID, rownames(gene)),]$IGHV
ddsSub <- dds[,!is.na(dds$factor)]

#vst
ddsSub.vst <- varianceStabilizingTransformation(ddsSub)

#seperate M-CLL and U-CLL
ddsM.vst <- ddsSub.vst[,ddsSub.vst$IGHV %in% 1]
ddsU.vst <- ddsSub.vst[,ddsSub.vst$IGHV %in% 0]

9.2 Identify genes associated with CLL-PD

Correlation test using DESeq2

factorMatrix <- facTab.all %>% spread(key = factor, value = value) %>%
  data.frame() %>% column_to_rownames("sample") 
factorMatrix <- factorMatrix[colnames(ddsSub),]

designMat <- model.matrix(~1 +.,factorMatrix)
deRes <- DESeq(ddsSub, full = designMat, betaPrior = FALSE)


corRes.rna <- results(deRes, name = "LF4", tidy = TRUE) %>%
  mutate(symbol = rowData(ddsSub)[row,]$symbol) %>%
  dplyr::rename(logFC = log2FoldChange, t = stat, P.Value = pvalue, adj.P.Val = padj,id=row)

P value histogram

pHis <- ggplot(corRes.rna, aes(x=P.Value)) + geom_histogram(col="black",fill = colList[2],alpha=0.5) +
  xlab("P value") + ylab("Count") + ggtitle("P value histogram") +
  theme_full
pHis

Heatmap of significantly correlated genes (1% FDR)

corRes.sig <- filter(corRes.rna, adj.P.Val < 0.01)
exprMat <- assay(ddsSub.vst)
plotMat <- exprMat[corRes.sig$id,]
plotMat <- plotMat[order(corRes.sig$logFC, decreasing = TRUE),order(designMat[,"LF4"])]

colAnno <- data.frame(row.names = colnames(exprMat),
                      LF4 = designMat[,"LF4"])
colnames(colAnno) <- c("CLL-PD")
plotMat <- mscale(plotMat, censor = 4)

breaks <- seq(-4,4,length.out = 100)

pheatmap(plotMat, scale = "none", cluster_cols = FALSE, cluster_rows = FALSE,
         annotation_col = colAnno,
         color = colorRampPalette(c(colList[2],"white",colList[1]))(length(breaks)),
         breaks = breaks,
         show_rownames = FALSE, show_colnames = FALSE)

How many genes show significant correlation?

nrow(corRes.sig)

## [1] 5227

#percentage
nrow(corRes.sig)/nrow(ddsSub)

## [1] 0.3360118

How many genes show up-regulation?

nrow(filter(corRes.sig, t>0))

## [1] 2938

9.3 Gene enrichment analysis

9.3.1 Enrichment using Hallmark genesets for all CLLs

enrichRes <- runCamera(exprMat, designMat, gmts$H, 
                       id = rowData(ddsSub.vst[rownames(exprMat),])$symbol, contrast = 5, #LF4
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Pathway enrichment on RNA level (5% FDR)", insideLegend = TRUE, setMap = setMap)
rnaEnrichHallmark <- enrichRes$enrichPlot + theme(legend.position = c(0.8,0.2))
rnaEnrichHallmark

9.3.2 Enrichment using Hallmark genesets for U-CLLs

exprMat.U <- assay(ddsU.vst)
designMat.U <- designMat[colnames(exprMat.U),]
enrichRes.U <- runCamera(exprMat.U, designMat.U, gmts$H, 
                       id = rowData(ddsSub.vst[rownames(exprMat.U),])$symbol, contrast = 5, #LF4
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Hallmarks (U-CLL, 5% FDR)", setMap = setMap)
plotEnrichHallmark.U <- enrichRes.U$enrichPlot
plotEnrichHallmark.U

9.3.3 Enrichment using Hallmark genesets for M-CLLs

exprMat.M <- assay(ddsM.vst)
designMat.M <- designMat[colnames(exprMat.M),]
enrichRes.M <- runCamera(exprMat.M, designMat.M, gmts$H, 
                       id = rowData(ddsSub.vst[rownames(exprMat.M),])$symbol, contrast = 5, #LF4
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Hallmarks (M-CLL, 5% FDR)", setMap = setMap)
plotEnrichHallmark.M <- enrichRes.M$enrichPlot
plotEnrichHallmark.M

9.3.4 Heatmap for enriched genesets

Get mitochondrial genes from mito carta

highLightList <- filter(corRes.sig, symbol %in% mitoProtList)$id

MTORC1_SIGNALING

corRes.sig <- dplyr::rename(corRes.rna, coef = "logFC") %>% dplyr::filter(adj.P.Val < 0.01, coef > 0)

colAnno <- data.frame(row.names = colnames(exprMat),
                      LF4 = designMat[,"LF4"])
col_fun = circlize::colorRamp2(c(min(colAnno$LF4),max(colAnno$LF4)), c("white",colList[4]))
annoColor <- list(`CLL-PD` = col_fun)
colnames(colAnno) <- c("CLL-PD")

plotSetHeatmapComplex(geneSigTab = corRes.sig, geneSet =gmts$H, setName = "HALLMARK_MTORC1_SIGNALING", exprMat, colAnno, plotTitle = "Hallmark mTORC1 signaling",scale = TRUE,annoCol = annoColor, highLight =  highLightList)

OXIDATIVE_PHOSPHORYLATION

plotSetHeatmapComplex(geneSigTab = corRes.sig, geneSet =gmts$H, setName = "HALLMARK_OXIDATIVE_PHOSPHORYLATION", exprMat, colAnno, plotTitle = "Hallmark Oxidative phosphorylation", scale = TRUE, annoCol = annoColor, highLight =  highLightList)

#get a percentage of mitochondiral proteins in this set
allGeneInList <- intersect(corRes.sig$symbol, loadGSC(gmts$H)$gsc$HALLMARK_OXIDATIVE_PHOSPHORYLATION)

How many genes are in the geneset of OXIDATIVE PHOSPHORYLAITON?

length(allGeneInList)

## [1] 83

How many genes are also on mitochondria?

sum(allGeneInList %in% mitoProtList)

## [1] 62

MYC_TARGETS_V1

#pvalue for MYC
filter(corRes.sig, symbol == "MYC")

##                id baseMean      coef      lfcSE       t      P.Value
## 1 ENSG00000136997 3294.441 0.2622213 0.07173709 3.65531 0.0002568713
##     adj.P.Val symbol
## 1 0.001077059    MYC

plotSetHeatmapComplex(geneSigTab = corRes.sig, geneSet =gmts$H, setName = "HALLMARK_MYC_TARGETS_V1", exprMat, colAnno, scale = TRUE, plotTitle = "Hallmark MYC targets v1", annoCol = annoColor, highLight =  highLightList)

9.4 A network plot showing the overlap of gene enriched in those three pathways and annotate mitochondrial genes

corRes.sigUp <- filter(corRes.rna, adj.P.Val < 0.01, logFC >0)

setList <- c("HALLMARK_MYC_TARGETS_V1", "HALLMARK_OXIDATIVE_PHOSPHORYLATION", "HALLMARK_MTORC1_SIGNALING")
nodeTab <- lapply(setList, function(eachSet) {
  geneList <- intersect(corRes.sig$symbol, loadGSC(gmts$H)$gsc[[eachSet]])
  tibble(symbol = geneList, setName = eachSet)
}) %>% bind_rows() %>%
  mutate(setName = str_remove(setName,"HALLMARK_")) %>%
  mutate(setName = setMap[match(setName, setMap$setName),]$pathwayName) %>%
  mutate(mitoGene = ifelse(symbol %in% mitoProtList, "yes", "no"))

#get node list
allNodes <- union(nodeTab$symbol, nodeTab$setName) 

nodeList <- data.frame(id = seq(length(allNodes))-1, name = allNodes, stringsAsFactors = FALSE) %>%
  mutate(type = ifelse(name %in% mitoProtList, 
                          "yes","no")) %>%
  mutate(type = ifelse(name %in% nodeTab$setName,"set", type)) %>%
  mutate(nodeText = ifelse(type == "no","",name)) %>%
  mutate(pathText = ifelse(type == "set", name, "")) %>%
  mutate(textSize = ifelse(type == "set",3,0)) %>%
  mutate(textPosAdjust = ifelse(name %in%"Oxidative phosphorylation",0.2,0.5))
  

#get edge list
edgeList <- nodeTab %>%
  dplyr::rename(Source = setName, Target = symbol) %>%
  mutate(Source = nodeList[match(Source,nodeList$name),]$id,
         Target = nodeList[match(Target, nodeList$name),]$id) %>%
  data.frame(stringsAsFactors = FALSE)

net <- graph_from_data_frame(vertices = nodeList, d=edgeList, directed = FALSE)

set.seed(2)
tidyNet <- as_tbl_graph(net)
enrichNet <- ggraph(tidyNet, layout = "igraph", algorithm = "nicely") + 
  geom_edge_link(color = "grey90", width=1) + 
  geom_node_point(aes(color =type), size=3, alpha=0.7) + 
  geom_node_text(aes(label = pathText, col = type, hjust = textPosAdjust), repel = FALSE, fontface= "bold", size=5, nudge_y = -0.5) +
  #scale_size_manual(values =c(set = 4, yes = 5, no =4))+
  scale_color_manual(values = c(yes = colList[5],no = "grey60", set = colList[1])) + 
  #scale_edge_color_brewer(palette = "Set2") +
  theme_graph(base_family = "sans") + theme(legend.position = "none", plot.margin =margin(-5,-5,-5,-5))

prepare legend

set.seed(2)

legTab <- tibble(x=c(1,1), y=c(1,2), group = c("mitochondrial protein coding genes",
                                               "other genes"))

legP <- ggplot(legTab, aes(x=x, y=y, col = group)) +
  geom_point(size=4) + theme_bw() +
  scale_color_manual(values = c("mitochondrial protein coding genes" = colList[5], "other genes" = "grey60"), name = "") +
  theme(legend.text = element_text(size=12),
        legend.background = element_blank())
legP <- get_legend(legP)

comNet <- ggdraw(enrichNet) + draw_plot(legP, x = 0.22, y=-0.35)
comNet

10 Compare CLL-PD expression signature with other stimulation signatures

10.1 Expression profile of U-CLL cells treated with CPG (GSE30105)

10.1.1 Query and pre-processing microarray data

CLL cells were purified by negative selection using anti-CD3, anti-CD14 and anti-CD16 mouse monoclonal antibodies and Dynabeads coated with a pan anti-mouse IgG antibody. The purity of the CLL cells after negative selection was monitored by flow-cytometry and the percentage of CD5+/CD19+ cells exceeded 98% for each sample. CLL cells were resuspended in RPMI complete medium at a density of 2 × 105 and stimulated with 7.5 μg/ml complete phosphorothioate CpG ODN oligonucleotide 2006 (5′-TCGTCGTTTTGTCGTTTTGTCGTT-3′) or left unstimulated for 18 h.

Sys.setenv("VROOM_CONNECTION_SIZE" = 131072*1000)
gse30105 <- getGEO("GSE30105", GSEMatrix = TRUE)

##GSE30105
gse <- gse30105[[1]]
gse$treatment <- factor(ifelse(grepl("Unstimulated",gse$title), "untreated","CPG"),
                        levels = c("untreated","CPG"))
gse.vst <- gse
exprs(gse.vst) <- normalizeVSN(exprs(gse))
gse.vst <- gse.vst[!fData(gse.vst)$`GENE_SYMBOL` %in% c(NA, ""),]
#invariant filtering
sds <- genefilter::rowSds(exprs(gse.vst))
gse.vst <- gse.vst[sds > genefilter::shorth(sds)]

10.2 Differential expression and enrichment analysis

Differential expression using Limma

mm <- model.matrix( ~ 1 + treatment, pData(gse.vst))
fit <- lmFit(gse.vst, mm)
fit <- eBayes(fit)
resTab <- limma::topTable(fit, coef= "treatmentCPG", number = "all")
hist(resTab$P.Value)

sigList <- list()
sigListP <- list() #for upset plot, only cut-off is adj.P.Val < 0.01
sigList[["Up-regulated by CpG ODN"]] <- unique(filter(resTab, adj.P.Val < 0.01, logFC > 1.5)$GENE_SYMBOL)
sigListP[["CpG ODN"]] <- unique(filter(resTab, adj.P.Val < 0.01, logFC > 0)$GENE_SYMBOL)

Enrichment

highSet <- c("MYC targets v1", "MYC targets v2", "mTORC1 signaling","Oxidative phosphorylation")
exprMat <- exprs(gse.vst)
designMat <- model.matrix( ~ 1 + treatment, pData(gse.vst))

enrichRes <- runCamera(exprMat, designMat, gmts$H, id = fData(gse.vst)$GENE_SYMBOL,
                       method = "camera", pCut = 0.01, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "CpG ODN treatment", insideLegend = TRUE, setToHighlight = highSet, setMap = setMap)
plotEnrich.CPG <- enrichRes$enrichPlot + theme(plot.title = element_text(size=18, face = "bold"))
plotEnrich.CPG

10.3 Signature of IL21 + CD40L

10.3.1 Pre-processing microarray data

GSE50572 <- getGEO("GSE50572", GSEMatrix = TRUE)

gse <- GSE50572[[1]]

#table(gse$`treatment:ch1`)
condi <- structure(c("none","Tcell","IL21_CD40L"),
                   names = c("none","cocultured with activated T cells",
                             "IL-21 + cocultured with CD40L- cells"))
gse <- gse[,gse$`treatment:ch1` %in% names(condi)]
gse$treatment <- condi[gse$`treatment:ch1`]
gse$treatment <- factor(gse$treatment, levels =c("none","IL21_CD40L","Tcell"))

gse.all <- gse
exprs(gse.all) <- normalizeVSN(exprs(gse))
gse.all <- gse.all[!fData(gse.all)$`Gene Symbol` %in% c(NA, ""),]

#invariant filtering
sds <- genefilter::rowSds(exprs(gse.all))
gse.all <- gse.all[sds > genefilter::shorth(sds)]

Differential expression using Limma

mm <- model.matrix( ~ 1 + treatment, pData(gse.all))
fit <- lmFit(gse.all, mm)
fit <- eBayes(fit)

10.3.2 Differential expression and enrichment analysis

Differential expression using Limma

resTab <- limma::topTable(fit, coef= "treatmentIL21_CD40L", number = "all")
hist(resTab$P.Value)

Creat gene sets

sigList[["Up-regulated by IL21+CD40L"]] <- unique(filter(resTab, adj.P.Val < 0.01, logFC > 1.5)$Gene.Symbol)
sigListP[["IL21+CD40L"]] <- unique(filter(resTab, adj.P.Val < 0.01,logFC>0)$Gene.Symbol)

Enrichment analysis

exprMat <- exprs(gse.all)
designMat <- mm

enrichRes <- runCamera(exprMat, designMat, gmts$H, id = fData(gse.all)$`Gene Symbol`,
                       contrast = "treatmentIL21_CD40L",
                       method = "camera", pCut = 0.01, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "IL21 + CD40L treatment", insideLegend = TRUE, setToHighlight = highSet, setMap = setMap)

plotEnrich.IL21CD40L <- enrichRes$enrichPlot + theme(plot.title = element_text(size=18, face = "bold"))
plotEnrich.IL21CD40L

10.4 T cell co-cultrue signatures

10.4.1 Differential expression and enrichment analysis

Differential expression using Limma

resTab <- limma::topTable(fit, coef= "treatmentTcell", number = "all")
hist(resTab$P.Value)

sigList[["Up-regulated by activated T cells"]] <- unique(filter(resTab, adj.P.Val < 0.01, logFC > 1.5)$Gene.Symbol)
sigListP[["activated T cells"]] <- unique(filter(resTab, adj.P.Val < 0.01, logFC >0)$Gene.Symbol)

Enrichment

enrichRes <- runCamera(exprMat, mm, gmts$H, id = fData(gse.all)$`Gene Symbol`, contrast = "treatmentTcell",
                       method = "camera", pCut = 0.01, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Co-culture with activated T cells", insideLegend = TRUE, setToHighlight = highSet, setMap=setMap)
plotEnrich.Tcell <- enrichRes$enrichPlot+ theme(plot.title = element_text(size=18, face = "bold"))
plotEnrich.Tcell

10.5 BCR-triggering signature by anti-IgM stimulation for 390 min (GSE3941)

gse39411 <- getGEO("GSE39411", GSEMatrix = TRUE)

gse <- gse39411[[1]]

#subset for only B-cell and CLL without transfection
gse <- gse[,gse$`transfected with:ch1` == "none" &
             gse$`time point (min):ch1` == "T390" &
             gse$`cell type:ch1` == "chronic lymphocytic leukemia B-cell"]

#vst
gse.vst <- gse
exprs(gse.vst) <- limma::normalizeVSN(gse.vst)

patAnno <- pData(gse.vst) %>% rownames_to_column("sampleID") %>%
  select(sampleID, description) %>%
  separate(description, into = c("patID","stimulation","timePoint"),sep = "_") %>%
  mutate(cellType = substr(patID,1,nchar(patID)-1)) %>%
  mutate(cellType = ifelse(cellType == "N", "B-cell",cellType)) %>%
  mutate(timePoint = abs(as.integer(gsub("T","",timePoint)))) %>%
  mutate(stimulation = factor(stimulation, levels = c("US","S")))

pData(gse.vst) <- patAnno[match(colnames(gse.vst), patAnno$sampleID),]

Differential expression (CLL cells for 390 min)

gse.test <- gse.vst

Differential expression using Limma

mm <- model.matrix( ~ 1  + patID + stimulation, pData(gse.test))
fit <- lmFit(gse.test, mm)
fit <- eBayes(fit)
resTab <- limma::topTable(fit, coef= "stimulationS", number = "all")
hist(resTab$P.Value)

sigList[["Up-regulated by anti-IgM"]] <- unique(filter(resTab, adj.P.Val < 0.01, logFC > 1.5)$Gene.Symbol)
sigListP[["anti-IgM"]] <- unique(filter(resTab, adj.P.Val < 0.01, logFC > 0)$Gene.Symbol)

Enrichment

exprMat <- exprs(gse.test)
designMat <- model.matrix( ~ 1  + patID  + stimulation, pData(gse.test))

enrichRes <- runCamera(exprMat, designMat, gmts$H, id = fData(gse.test)$`Gene Symbol`,
                       method = "camera", pCut = 0.01, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "anti-IgM treatment",insideLegend = TRUE, setToHighlight = highSet, setMap = setMap)
plotEnrich.IgM <- enrichRes$enrichPlot+ theme(plot.title = element_text(size=18, face = "bold"))
plotEnrich.IgM

10.6 Enrichment analysis using all above stimulation signatures

fgseaOut <- runFGSEA(corRes.rna, sigList, ifFDR = FALSE)
pList <- fgseaOut$plots

stiEnrich <- pList[[1]]
stiEnrich

pList[[2]] <- pList[[2]] + ylab("")
pList[[4]] <- pList[[4]] + ylab("")
stiEnrichOther <- plot_grid(plotlist = pList, ncol=2, align = "hv", axis = "tblr")
fgseaOut$table

##                              pathway         pval         padj        ES
## 1:           Up-regulated by CpG ODN 0.0001231679 0.0001337614 0.4124512
## 2:        Up-regulated by IL21+CD40L 0.0001181056 0.0001337614 0.5283995
## 3: Up-regulated by activated T cells 0.0001337614 0.0001337614 0.3906186
## 4:          Up-regulated by anti-IgM 0.0001308044 0.0001337614 0.3870246
##         NES nMoreExtreme size                               leadingEdge
## 1: 2.005809            0  352 RUVBL1,MTHFD2,PAICS,TXN,IFI44L,SEC61B,...
## 2: 2.633904            0  478  GART,RUVBL1,MTHFD2,EIF3J,AVEN,FBXO22,...
## 3: 1.795029            0  204      MTHFD2,PRDX3,PAICS,TXN,TARS,RRM2,...
## 4: 1.809572            0  242      GART,MTHFD2,PAICS,TXN,TARS,KLF10,...

stiEnrichOther

Upset plot to show gene overlaps

geneOverList <- c(list(`CLL-PD` = filter(corRes.rna, logFC>0, adj.P.Val <0.01)$symbol), sigListP)
UpSetR::upset(UpSetR::fromList(geneOverList))

11 Proteomic signatures for CLL-PD

11.1 Detect proteins correlated with CLL-PD (LF4)

11.1.1 Process datasets

Process proteomics data

protCLL <- protein
protCLL$CLLPD <- facTab[match(colnames(protCLL), facTab$sample),]$value
protCLL <- protCLL[,!is.na(protCLL$CLLPD)]
protMat <- assays(protCLL)[["QRILC"]]

**How many samples have CLL-PD value?

ncol(protCLL)

## [1] 46

11.1.2 Correlate protein expression with CLL-PD using proDA

Fit the probailistic dropout model

CLLPD <- protCLL$CLLPD
fit <- proDA::proDA(protMat, design = ~ CLLPD)

Test for differentially expressed proteins

corRes.prot <- proDA::test_diff(fit, "CLLPD") %>%
  dplyr::rename(id = name, logFC = diff, t=t_statistic,
                P.Value = pval, adj.P.Val = adj_pval) %>% 
  mutate(symbol = rowData(protCLL[id,])$hgnc_symbol,
         coef = logFC) %>%
  select(symbol, id, coef, t, P.Value, adj.P.Val, n_obs) %>%  
  arrange(P.Value) %>% 
  as_tibble()

11.1.3 P-value histogram

ggplot(corRes.prot, aes(x=P.Value)) + geom_histogram(fill = "lightblue",col="grey50") + xlim(0,1) +
  theme_full

11.1.4 List of significantly correlated proteins (10% FDR)

corRes.sig.prot <- filter(corRes.prot, adj.P.Val <= 0.1) %>% arrange(desc(t))
corRes.sig.prot %>% mutate_if(is.numeric, formatC, digits =2, format="e") %>% 
  mutate(n_obs = as.integer(n_obs)) %>%
  DT::datatable()

How many proteins in total?

nrow(corRes.sig.prot)

## [1] 62

How many up-regulated proteins?

nrow(filter(corRes.sig.prot, t>0))

## [1] 33

11.1.5 Enrichment using Hallmark genesets for all CLLs

designMat <- model.matrix(~CLLPD)
enrichRes <- runCamera(protMat, designMat, gmts$H, 
                       id = rowData(protCLL[rownames(protMat),])$hgnc_symbol, #LF4
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Pathway enrichment on protein level (5% FDR)", insideLegend = TRUE, direction = "Up", setMap = setMap)
proteinEnrichHallmark <- enrichRes$enrichPlot + theme(legend.position = c(0.8,0.2))
proteinEnrichHallmark

11.1.6 Heatmap of enrichmed MYC targets

mycTar <- unique(unlist(loadGSC(gmts$H)$gsc[c("HALLMARK_MYC_TARGETS_V2","HALLMARK_MYC_TARGETS_V1")]))

corRes.sigUp <- corRes.prot %>% dplyr::filter(P.Value < 0.05, t > 0)

colAnno <- tibble(patID = colnames(protMat), 
                 F4 = protCLL$CLLPD) %>%
  arrange(F4) %>% data.frame() %>% column_to_rownames("patID")

col_fun = circlize::colorRamp2(c(min(colAnno$F4),max(colAnno$F4)), c("white",colList[4]))
annoColor <- list(`CLL-PD` = col_fun)
colnames(colAnno) <- c("CLL-PD")

plotProtMYC <- plotSetHeatmapComplex(geneSigTab = corRes.sigUp, geneSet =mycTar, setName = "HALLMARK_MYC_TARGETS", plotTitle = "Hallmark MYC targets",
                                     exprMat = protMat,colAnno =colAnno, scale = TRUE, annoCol = annoColor)

plotProtMYC

11.2 Scatter plot of selected proteins

11.2.1 MYC tagets

seleProt <- c("MCM4","NME1","PAICS")
tTab <- filter(corRes.prot, symbol %in% seleProt)
scatterTab <- protMat[tTab$id,] %>% data.frame() %>%
  rownames_to_column("id") %>% gather(key="patID", value = "value",-id) %>%
  filter(!is.na(value)) %>%
  mutate(LF4 = facTab[match(patID, facTab$sample),]$value,
         symbol = rowData(protCLL)[id,]$hgnc_symbol)


plotProtScatter <- lapply(seleProt, function(n) {
  eachTab <- filter(scatterTab, symbol == n) 
  res <- cor.test(eachTab$value, eachTab$LF4)
  annoCoef <- paste("coefficient =",format(res$estimate,digits = 2))
  annoP <- paste("italic(P)~'=",formatNum(res$p.value, digits = 1, format = "e"),"'")
  ggplot(eachTab, aes(x=LF4, y=value)) +
    geom_point(fill = colList[6],size=2,shape=21) + geom_smooth(method="lm",se=FALSE, linetype = "dashed", color = "grey50") +
    annotate("text", x = -1.5, y = Inf, label = annoCoef,
           hjust=0, vjust =1, size = 5, col= colList[1]) +
    annotate("text", x = -1.5, y = Inf, label = annoP,
           hjust=0, vjust =3, size = 5, col= colList[1], parse= TRUE) +
    ggtitle(unique(eachTab$symbol)) + ylab("Protein expression") + xlab("CLL-PD") +
    #ylim(c(0.4,1.2)) +
    theme_half 
})
grid.arrange(grobs= plotProtScatter, ncol=3)

11.3 Differentially expressed proteins on mitochondria

11.3.1 Mitochondrial proteins

Read a table of proteins on mitochondria from mito

corRes.mito <- corRes.sig.prot %>% filter(symbol %in% mitoProtList, t>0)

How many proteins are on mitochodnria?

nrow(corRes.mito)

## [1] 19

Heatmap of significantly associated proteins (10% FDR), with mitochondrial proteins highlighted

colAnno <- tibble(patID = colnames(protMat), 
                  CLL_PD = protCLL$CLLPD) %>%
  arrange(CLL_PD) %>% data.frame() %>% column_to_rownames("patID")

col_fun = circlize::colorRamp2(c(min(colAnno$CLL_PD),max(colAnno$CLL_PD)), c("white",colList[4]))
annoColor <- list(CLL_PD = col_fun)
plotMat <- assays(protCLL[corRes.sig.prot$id,rownames(colAnno)])[["QRILC"]]
plotMat <- mscale(plotMat, censor = 6)

protHeatmap <- plotSetHeatmapComplex(geneSigTab = corRes.sig.prot, geneSet = corRes.sig.prot$symbol,
               exprMat = protMat, colAnno = colAnno, 
               setName = "Proteins associated with CLL-PD (10%FDR)", annoCol = annoColor,
               highLight = corRes.mito$id)
grobHeatmap = grid.grabExpr(draw(protHeatmap))
plot_grid(grobHeatmap)

How many up-regulated RNAs are mitochodnrial protiens?

corRes.rna.mito <- filter(corRes.rna, t>0, adj.P.Val < 0.01, symbol %in% mitoProtList) 
nrow(corRes.rna.mito)

## [1] 307

head(arrange(corRes.rna.mito, desc(t)))

##                id baseMean     logFC      lfcSE         t      P.Value
## 1 ENSG00000136521 1886.127 0.2301449 0.01968797 11.689618 1.440322e-31
## 2 ENSG00000178741 1874.537 0.2307863 0.01981524 11.646912 2.379254e-31
## 3 ENSG00000164933 3668.398 0.1874903 0.01790616 10.470717 1.177482e-25
## 4 ENSG00000065911 1138.622 0.3023715 0.02908427 10.396393 2.574954e-25
## 5 ENSG00000198130 1274.413 0.2724889 0.02926809  9.310103 1.277085e-20
## 6 ENSG00000165672 1530.073 0.2011660 0.02176611  9.242168 2.415524e-20
##      adj.P.Val   symbol
## 1 3.200807e-28   NDUFB5
## 2 4.138698e-28    COX5A
## 3 7.632045e-23 SLC25A32
## 4 1.381241e-22   MTHFD2
## 5 2.684640e-18    HIBCH
## 6 4.756443e-18    PRDX3

seleProt <- c("VDAC1","HSPD1")
tTab <- filter(corRes.prot, symbol %in% seleProt)
scatterTab <- protMat[tTab$id,] %>% data.frame() %>%
  rownames_to_column("id") %>% gather(key="patID", value = "value",-id) %>%
  filter(!is.na(value)) %>%
  mutate(LF4 = facTab[match(patID, facTab$sample),]$value,
         symbol = rowData(protCLL)[id,]$hgnc_symbol)


plotMitoProScatter <- lapply(seleProt, function(n) {
  eachTab <- filter(scatterTab, symbol == n) 
  res <- cor.test(eachTab$value, eachTab$LF4)
  annoCoef <- paste("coefficient =",format(res$estimate,digits = 2))
  annoP <- paste("italic(P)~'=",formatNum(res$p.value, digits = 1, format = "e"),"'")
  ggplot(eachTab, aes(x=LF4, y=value)) +
    geom_point(fill = colList[6],size=3,shape=21) + geom_smooth(method="lm",se=FALSE, linetype = "dashed", color = "grey50") +
    annotate("text", x = -1.5, y = Inf, label = annoCoef,
           hjust=0, vjust =1, size = 5, col= colList[1]) +
    annotate("text", x = -1.5, y = Inf, label = annoP,
           hjust=0, vjust =3, size = 5, col= colList[1], parse= TRUE) +
    ggtitle(unique(eachTab$symbol)) + ylab("Protein expression") + xlab("CLL-PD") +
    #ylim(c(0.4,1.2)) +
    theme_half 
})
corMitoMarer <- grid.arrange(grobs= plotMitoProScatter, ncol=2)

corMitoMarer

## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]

11.4 FACS validation using mitoTracker

mitoTab <- mito %>% mutate(CLLPD = facTab[match(patientID, facTab$sample),]$value) %>%
  pivot_longer(-c(patientID, CLLPD), names_to = "feature", values_to = "MFI") %>%
  mutate(logMFI = log10(MFI)) %>%
  filter(feature == "MitoTracker")


eachTab <- mitoTab
res <- cor.test(eachTab$logMFI, eachTab$CLLPD)
annoCoef <- paste("coefficient =",format(res$estimate,digits = 2))
annoP <- paste("italic(P)~'=",formatNum(res$p.value, digits = 2, format = "e"),"'")
plotMitoScatter <- ggplot(eachTab, aes(x=CLLPD, y=logMFI)) +
  geom_point(fill = colList[3],size=5,shape=21) + geom_smooth(method="lm",se=FALSE, linetype = "dashed", color = "grey50") +
  annotate("text", x = -1.5, y = Inf, label = annoCoef,
         hjust=0, vjust =1, size = 5, col= colList[1]) +
  annotate("text", x = -1.5, y = Inf, label = annoP,
         hjust=0, vjust =3, size = 5, col= colList[1], parse= TRUE) +
  ggtitle(unique(eachTab$feature)) + ylab(expression(log[10]~(MFI))) + xlab("CLL-PD") +
  theme_half 


plotMitoScatter

12 Association between CLL-PD and energy metabolism

12.1 Correlation test using linear model

seaMat <- SummarizedExperiment::assay(seahorse)
factorMatrix <- facTab.all %>% spread(key = factor, value = value) %>%
  data.frame() %>% column_to_rownames("sample") 
overSample <- intersect(colnames(seaMat), rownames(factorMatrix))
factorMatrix <- factorMatrix[overSample,]
factorMatrix <- factorMatrix[,complete.cases(t(factorMatrix))]
designMat <- model.matrix(~1 +.,factorMatrix)
seaMat <- seaMat[,rownames(designMat)]

fit <- lmFit(seaMat, designMat)
fit2 <- eBayes(fit)

corRes.LF1 <- limma::topTable(fit2, number ="all", coef = "LF1", adjust.method = "BH") %>%
  rownames_to_column("id") %>% as_tibble() %>% mutate(factor = "LF1")
corRes.LF4 <- limma::topTable(fit2, number ="all", coef = "LF4", adjust.method = "BH") %>%
  rownames_to_column("id") %>% as_tibble() %>% mutate(factor = "LF4")


dim(seaMat)

## [1]  11 136

12.2 Plot P values for associations, straitified by F1 and F4

plotTab <- bind_rows(corRes.LF1[,c("id","P.Value","factor")], corRes.LF4[,c("id","P.Value","factor")]) %>%
  mutate(P.adj = p.adjust(P.Value, method = "BH")) %>%
  mutate(id = str_replace_all(id, "[.]", " "))

orderTab <- plotTab %>% select(id, P.Value, factor) %>% spread(key = factor, value = P.Value) %>%
  arrange(LF1/LF4)

plotTab <- plotTab %>% mutate(id = factor(id, levels = orderTab$id))

fdrCut <- -log10((filter(plotTab,P.adj <= 0.05) %>% arrange(desc(P.Value)))$P.Value[1])

plotFeature <- unique(filter(plotTab, P.adj <=0.05)$id)
plotTab <- filter(plotTab, id %in% plotFeature)# %>%
 # mutate(factor = ifelse(factor == "LF1", "F1", "F4 (CLL-PD)"))
plotSeahorse <- ggplot(plotTab, aes(x=id, y=-log10(P.Value), col = factor, dodge = factor)) + 
  geom_point(position = position_dodge(width=0.5),size=3) + 
  geom_linerange(aes(ymin = 0, ymax=-log10(P.Value)),position = position_dodge2(width=0.5), size=1) + 
  scale_color_manual(values = c(LF1=colList[3],LF4=colList[4]),labels = 
                      c("F1","F4 (CLL-PD)"), name = "") +
  geom_hline(yintercept = fdrCut, linetype = "dashed", color = "grey50") +
  coord_flip() + xlab("") + ylab(bquote("-log"[10]*"("*italic("P")~"value)")) + 
  annotate("text", x = Inf, y = fdrCut, label = paste0("5% FDR"),
             size = 3, vjust =1.5, hjust=-0.1) +
  theme_full + theme(legend.position = "bottom", 
                     legend.text = element_text(size=13)) 
plotSeahorse

12.3 Scatter plot for showing assocations

plotFeature <- c("maximal.respiration", "spare.respiratory.capacity", "OCR")
scatterTab <- seaMat[plotFeature,] %>% data.frame() %>%
  rownames_to_column("feature") %>% gather(key="patID", value = "value",-feature) %>%
  filter(!is.na(value)) %>%
  mutate(LF4 = facTab[match(patID, facTab$sample),]$value)

pList <- lapply(plotFeature, function(n) {
  eachTab <- filter(scatterTab, feature == n) %>%
   mutate(feature = str_replace_all(feature, "[.]", " "))
  corRes = cor.test(eachTab$value, eachTab$LF4)
  annoCoef <- paste("'coefficient ='~",format(corRes$estimate,digits = 2))
  annoP <- paste("italic(P)~'=",formatNum(corRes$p.value, digits = 1, format = "e"),"'")
  ggplot(eachTab, aes(x=LF4, y=value)) +
    geom_point(fill = colList[6],size=2, shape =21) + geom_smooth(method="lm",se=FALSE, linetype = "dashed", color = "grey50") +
    annotate("text", x = Inf, y = Inf, label = annoCoef,
           hjust=1, vjust =1, size = 5, parse = TRUE, col= colList[1]) +
    annotate("text", x = Inf, y = Inf, label = annoP,
           hjust=1, vjust =3, size = 5, parse = TRUE, col= colList[1]) +
    ggtitle(unique(eachTab$feature)) + ylab("OCR (pMol/min)") + xlab("CLL-PD") +
    theme_half +
    theme(plot.margin = margin(5,20,5,20),
          plot.title = element_text(size = 14, hjust =0.5, face="bold"))
})
grid.arrange(grobs= pList, ncol=3)

plotSeahorseScatter <- pList

13 Associations between CLL-PD(F4) and ex-vivo drug response phenotyps

13.1 Correlation test using limma

Association test

viabSD <- group_by(drug, Drug, Concentration) %>%
  summarise(sdViab = sd(normVal), meanViab = mean(normVal)) %>%
  filter(sdViab > genefilter::shorth(sdViab), meanViab < 0.9)

viabMat <- filter(drug, paste0(Drug, Concentration) %in% paste0(viabSD$Drug, viabSD$Concentration)) %>%
  distinct(patientID, Drug, auc) %>%
  filter(patientID %in% facTab$sample) %>%
  spread(key = patientID, value = auc) %>%
  data.frame() %>% column_to_rownames("Drug")

factorMatrix <- facTab.all %>% spread(key = factor, value = value) %>%
  data.frame() %>% column_to_rownames("sample") 
factorMatrix <- factorMatrix[colnames(viabMat),complete.cases(t(factorMatrix))]

designMat <- model.matrix(~1 +.,factorMatrix)
fit <- lmFit(viabMat, designMat)
fit2 <- eBayes(fit)

corRes <- limma::topTable(fit2, number ="all", coef = "LF4", adjust.method = "BH") %>%
  rownames_to_column("name") %>% as_tibble() %>% mutate(factor = "LF4")
corRes.LF1 <- limma::topTable(fit2, number ="all", coef = "LF1", adjust.method = "BH") %>%
  rownames_to_column("name") %>% as_tibble() %>% mutate(factor = "LF1")
corRes.LF2 <- limma::topTable(fit2, number ="all", coef = "LF2", adjust.method = "BH") %>%
  rownames_to_column("name") %>% as_tibble() %>% mutate(factor = "LF2")

13.2 Compare the strength of associations to F1 (IGHV), F2 (trisomy12) and F4 (CLL-PD)

plotTab <- bind_rows(corRes, corRes.LF1, corRes.LF2) %>% 
  mutate(P.adj = p.adjust(P.Value, method = "BH")) %>% arrange(P.Value)

drugShow <- filter(plotTab, P.adj <= 0.05) %>%
  arrange(P.Value) %>% distinct(name)

plotTab <- filter(plotTab, name %in% drugShow$name) %>%
  mutate(name = factor(name, levels = drugShow$name))

fdrCut <- -log10((filter(plotTab,P.adj <= 0.05) %>% arrange(desc(P.Value)))$P.Value[1])

plotDrugPval <- ggplot(plotTab, aes(x=name, y=-log10(P.Value), col = factor)) + 
  geom_point(size=3) + 
  scale_color_manual(values = c(LF1=colList[3],LF2 = colList[5], LF4=colList[4]), labels = c("F1","F2","F4 (CLL-PD)")) +
  geom_hline(yintercept = fdrCut, linetype = "dashed", color = "grey50") +
  xlab("") + ylab(bquote("-log"[10]*"("*italic("P")~"value)")) + 
  annotate("text", x = Inf, y = fdrCut, label = paste0("5% FDR"),
             size = 3, vjust =2, hjust=-0.1) +
  theme_full +
  theme(panel.grid.major = element_line(),
        panel.grid.minor = element_line(),
        axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5))
plotDrugPval

13.3 Scatter plots significant assocations (5% FDR)

plotFeature <- as.character(filter(plotTab, factor == "LF4", P.adj < 0.05)$name)

scatterTab <- viabMat[plotFeature,] %>% data.frame() %>%
  rownames_to_column("feature") %>% gather(key="patID", value = "value",-feature) %>%
  filter(!is.na(value)) %>%
  mutate(LF4 = facTab[match(patID, facTab$sample),]$value)


plotDrugScatter <- lapply(plotFeature, function(n) {
  eachTab <- filter(scatterTab, feature == n) 
  res <- cor.test(eachTab$value, eachTab$LF4)
  annoCoef <- paste("coefficient =",format(res$estimate,digits = 2))
  annoP <- paste("italic(P)~'=",formatNum(res$p.value, digits = 1, format = "e"),"'")
  ggplot(eachTab, aes(x=LF4, y=value)) +
    geom_point(fill = colList[6],size=2,shape=21) + geom_smooth(method="lm",se=FALSE, linetype = "dashed", color = "grey50") +
    annotate("text", x = Inf, y = Inf, label = annoCoef,
           hjust=1, vjust =1, size = 5, col= colList[1]) +
    annotate("text", x = Inf, y = Inf, label = annoP,
           hjust=1, vjust =3, size = 5, col= colList[1], parse= TRUE) +
    ggtitle(unique(eachTab$feature)) + ylab("Viability") + xlab("CLL-PD") +
    #ylim(c(0.4,1.2)) +
    theme_half 
})
grid.arrange(grobs= plotDrugScatter, ncol=3)

A different arrangement

grid.arrange(grobs= plotDrugScatter, ncol=2)

13.4 Organizing figures for Section 4 in the manuscript

13.4.1 Main Figure 4

set.seed(2)
title = ggdraw() + draw_figure_label("Figure 4", fontface = "bold", position = "top.left",size=22)

leftGrid <- plot_grid(rnaEnrichHallmark + theme(plot.margin = margin(0,1,1,0,"cm")),
                      proteinEnrichHallmark + theme(plot.margin = margin(1,1,0,0,"cm")), 
                      ncol=1, 
                      rel_heights = c(1.2,1),labels = c("a","b"), label_size = 22,
                      vjust = c(1.5,1.5))

rightGrid <- plot_grid(plotSeahorse, comNet,
                       ncol=1,rel_heights = c(0.58, 0.6),
                       labels = c("c","d"), label_size = 22)

pout <- plot_grid(leftGrid, rightGrid,rel_widths = c(1,1), ncol =2)
plot_grid(title, pout, rel_heights = c(0.05,0.95), ncol = 1)

13.5 Supplementary figures

13.5.1 Enrichment for M-CLL and U-CLL separately

plot_grid(plotEnrichHallmark.U + theme(legend.position = "none"), NULL, plotEnrichHallmark.M, rel_widths = c(0.45,0.08, 0.5), ncol = 3, align = "v")

13.5.2 Asseble supplementary figures to show enrichment barplot

plot_grid(plotEnrich.CPG, plotEnrich.IL21CD40L, plotEnrich.Tcell, plotEnrich.IgM,
          ncol=2, align = "hv",axis = "tblr")

13.5.2.1 Supplementary figure of Pvalue and variance explained plot

plot_grid(plotDrugPval,
          plot_grid(plotlist = plotDrugScatter, ncol = 3),
          align = "v", nrow =2, labels = c(" "," "), 
          rel_heights = c(0.4,0.6),label_size = 22,ncol=1)

13.5.2.2 Supplementary Figure for mitochondrial biogenesis

plot_grid(grobHeatmap,
          plot_grid(plotMitoScatter, corMitoMarer, nrow=1, rel_widths = c(0.5,1), labels = c(" "," "), label_size = 22),
          ncol=1, rel_heights = c(3,1), labels = c(" ",""), label_size = 22)

14 Single cell analysis (CyTOF)

Load cyTOF dataset

rm(list = ls()) #clean environment

cytoPath <- file.path(tempdir(),"sceAll.RData")
if(!file.exists(cytoPath)) {
  download.file("https://www.huber.embl.de/users/jlu/data/sceAll.RData",
                cytoPath)
  load(cytoPath)
} else {
  load(cytoPath)
}

md <- metadata(sceAll)$experiment_info %>% as_tibble()

14.1 T-SNE plots

14.1.1 T-SNE of all samples and cells, colored by cell types

reTab <- getReducedDimTab(sceAll,"TSNE")
plotTab <- reTab %>% 
  mutate(cluster = factor(clustMap[as.character(stateCluster)], clustMap))
p<-plotDM(plotTab, sceAll,colorBy = "cluster", x = "x",y="y", plotTitle = FALSE) +
  guides(color = guide_legend(ncol= 2,override.aes = list(alpha = 1, size = 6))) +
  theme(legend.text = element_text(size=28), legend.title = element_text(size=20))
p

14.1.2 Proliferation markers, stratified by CLL-PD

14.1.2.1 Ki-67 (for main figure)

useDim <- "TSNE"
sceCpG <- sceAll[,!is.na(reducedDim(sceAll,useDim)[,1])]
sceCpG <- sceCpG[,sceCpG$condition == "CpG"]
dmTab  <- getReducedDimTab(sceCpG,useDim)

p <- plotDM(dmTab, sceCpG, "Ki-67", facetBy = "CLL-PD", facetCol = 2, x = "x", y =  "y", fast=FALSE, pointsize =3.2) +
  theme(legend.text = element_text(size=22), legend.title = element_text(size=22))
p

14.1.2.2 P-Rb & Cyclin B1

pList <- list()

pList[[1]] <- plotDM(dmTab, sceCpG, "P-Rb", facetBy = "CLL-PD", facetCol = 2, x = "x", y =  "y",  fast=FALSE, pointsize =3.2)+
  theme(legend.text = element_text(size=20), legend.title = element_text(size=20))
pList[[2]] <- plotDM(dmTab, sceCpG, "Cyclin B1", facetBy = "CLL-PD", facetCol = 2, x = "x", y =  "y", fast=FALSE, pointsize =3.2)

p <- shareLegend(pList, ncol=1, position = "bottom")

p

14.1.3 Pooled samples, colored by markers associated with proliferation, under CpG condition

useDim <- "TSNE"
sceCpG <- sceAll[,!is.na(reducedDim(sceAll,useDim)[,1])]
sceCpG <- sceCpG[,sceCpG$condition == "CpG"]
dmTab  <- getReducedDimTab(sceCpG,useDim)
markerNames <- c("CDK4","GLUT1","c-Myc","P-4E-BP1","P-AMPK alpha","NFAT1")

plotList <- lapply(markerNames, function(n) {
   plotDM(dmTab, sceCpG, n, x = "x", y =  "y", fast=FALSE, pointsize = 4) +
   theme(legend.text = element_text(size=20), 
         legend.title = element_text(size=20),
         plot.margin = margin(0,1.8,0,1.8,"cm"))
})

p<-shareLegend(plotList,position = "bottom", ncol=3)
p

14.1.3.1 Separate for CLL-PD high and low samples

plotList <- lapply(markerNames, function(n) {
   plotDM(dmTab, sceCpG, n, x = "x", y =  "y", fast=FALSE, facetBy = "CLL-PD", facetCol = 2, pointsize = 4) +
   theme(legend.text = element_text(size=20), legend.title = element_text(size=20),
         plot.margin = margin(1,1,1,1, unit = "cm"))
})

p <- shareLegend(plotList,position = "bottom", ncol=2)
p

14.2 Differential population abundance

14.2.1 Test for differential abundance

sceViable <- sceAll[,sceAll$typeCluster == "CLL"]
popTab <- getPop(sceViable,"stateCluster") %>% left_join(md, by = "sample_id")
rm(sceViable)

condiList <- levels(sceAll$condition)

resDA_CLLPD <-lapply(condiList, function(eachCondi) {
  #subset
  sceSub <- subsetSCE(sceAll, eachCondi)
  
  #define experiment design
  ei <- metadata(sceSub)$experiment_info
  ei$CLLPD <- relevel(ei$CLLPD, ref = "low")
  designMat <- createDesignMatrix(ei, cols_design = c("CLLPD"))
  contrast <- createContrast(c(0, 1))

  #test using diffcyte
  ds_res <- diffcyt(sceSub, 
    design = designMat, contrast = contrast, 
    analysis_type = "DA", method_DA = "diffcyt-DA-edgeR",
    clustering_to_use = "stateCluster", verbose = TRUE,
    transform = FALSE)
  
  #get results
  res <- rowData(ds_res$res) %>%
    data.frame() %>% mutate(condition = eachCondi)
  res
}) %>% bind_rows()

plotPopCLLPD <- lapply(condiList, function(eachCondi) {
  eachList <- lapply(unique(popTab$cluster), function(clusterName) {
    eachTab <- popTab %>% dplyr::filter(condition %in% eachCondi, cluster == clusterName) %>%
      mutate(condition = as.character(condition)) %>%
      mutate(condiName = condiMap[eachCondi],
             CLLPD = factor(paste0(CLLPD," CLL-PD"), levels = c("high CLL-PD","low CLL-PD")))
    pVal <- dplyr::filter(resDA_CLLPD, cluster_id == clusterName, condition == eachCondi)$p_val
    
    annoP <- data.frame(annoP = ifelse(pVal < 1e-12, paste("italic(P)~'<",formatNum(1e-12, digits = 1),"'"),
                    paste("italic(P)~'=",formatNum(pVal, digits = 1),"'")))
    
    yMax <- max(eachTab$popSize)
    
    eachPlot <- ggplot(eachTab, aes(x= CLLPD, y = popSize)) + 
              #geom_boxplot(width=0.5, outlier.shape = NA) +
              ggbeeswarm::geom_quasirandom(aes(fill = IGHV), shape =21, cex=5, width = 0.3) +
              ylim(c(0, yMax + yMax*0.5)) +
              geom_segment(x =1, xend = 2, y=yMax + yMax*0.1 ,yend= yMax + yMax*0.1, size=0.8) +
              scale_fill_manual(values = colorMap, name = "IGHV status") + 
              geom_text(data = annoP, aes(label = annoP),x = 1.5, y= yMax + yMax*0.1, vjust=-1, parse=TRUE, size=6) +
              ylab("% in live CLL cells") + xlab("") + ggtitle(clusterName) +
              theme_half  +
              theme(legend.position = "bottom",
                    axis.text = element_text(size=15),
                    axis.title = element_text(size=15),
                    legend.box.margin=margin(-20,0,0,0),
                    legend.margin=margin(0,0,0,0),
                    legend.text = element_text(size=15),
                    legend.title = element_text(size=15)) 
    })
  names(eachList) <- unique(popTab$cluster)
  eachList
})
names(plotPopCLLPD) <- condiList

14.2.2 Proliferating fraction under CpG condition

p <- plotPopCLLPD$CpG$CLL_proliferating
pPopCLLPD <- p+ggtitle("proliferating CLL cells\n(CLL-PD high vs low)")  + ylim(0,50)

#pPopCLLPD

14.2.3 Proliferating fraction change under all four conditons

plotPopCondition <- function(popTab, condiList, clusterList, colorBy = "CLL-PD",
                             yLabel = "% in all cells", plotTitle = "", yLim  = NULL) {
  
  plotTab <- popTab %>% dplyr::filter(condition %in% condiList, cluster %in% clusterList) %>%
    mutate(condition = as.character(condition)) %>% 
    mutate(condiName = factor(condiMapS[condition],levels = condiMapS[condiList])) %>%
    mutate(clusterName = factor(clustMap[as.character(cluster)], levels = clustMap[clusterList]))
  
  eachPlot <- ggplot(plotTab, aes(x= condiName, y = popSize))
      
  if (colorBy == "CLL-PD") {
    eachPlot <- eachPlot + 
      geom_line(aes(group = patient_id, col = CLLPD), linetype = "solid", size=1) +
      geom_point(aes(col = CLLPD), size = 3) 
  } else {
    eachPlot <- eachPlot + geom_point(aes(col = IGHV), size=3)  +
            geom_line(aes(group = patient_id, col = IGHV), linetype = "dashed", size=1) 
  }
      
  eachPlot <- eachPlot + 
      #scale_fill_manual(values = colorMap, name = colorBy) + 
      scale_color_manual(values = colorMap, name = colorBy) + 
      expand_limits(y=c(0,yLim)) +
      ylab(yLabel) + xlab("")
  
  if (length(clusterList) > 1) {
    eachPlot <- eachPlot + facet_wrap(~clusterName, ncol=2) 
  } else {
    eachPlot <- eachPlot + ggtitle(plotTitle)
  }
  eachPlot <- eachPlot  + theme_half  +
      theme(legend.position = "bottom", 
            strip.text = element_text(size=15, face= "bold"),
            strip.background = element_rect(fill = "white",color = "white"),
            axis.text.x = element_text(size=14),
            legend.box.margin=margin(-20,0,0,0),
            legend.margin=margin(0,0,0,0),
            legend.text = element_text(size=15),
            legend.title = element_text(size=15)) 
  

  
  return(eachPlot)
}

clusterList <- c("CLL_proliferating")
condiList <- c("DMSO","CpG","CpG_Ever","Ever")
pPopCondi <- plotPopCondition(popTab, condiList, clusterList, yLabel = "% in live CLL cells", plotTitle = "proliferating CLL cells\n(across conditions)") + ylim(0,50)
#pPopCondi

Combine and align axis for figure c and d

plot_grid(pPopCondi, 
          pPopCLLPD + theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank(),
                            axis.title.y = element_blank()),
          align = "hv", axis = "rb", rel_widths = c(0.65,0.35))

14.3 Differential marker expression

Subset for CLL population

sceCLL <- subsetSCE(sceAll, cluster_id = "CLL", k = "typeCluster")
#rm(sceAll)
#select functional marker (non-linage and non-proliferating)
markerUse <- setdiff(state_markers(sceCLL),c("Ki-67","Cyclin B1","P-Rb"))
sceCLL <- sceCLL[rownames(sceCLL) %in% markerUse, ]

14.3.1 Heatmap plot to summarise changes

getMedian <- function(sceObj, useCluster) {
  clustTab <- metadata(sceObj)$cluster_code
  allCluster <- clustTab[match(sceObj$cluster_id, clustTab$som100),][[useCluster]]
  exprMat <- assays(sceObj)[["exprs"]]

  sumTab <- lapply(seq(nrow(exprMat)), function(i) {
    data.frame(expr = exprMat[i,],
               sample_id = sceObj$sample_id,
               cluster = allCluster) %>%
      group_by(sample_id, cluster) %>%
      summarise(medVal = median(expr, na.rm=TRUE),
                .groups = "drop") %>%
      mutate(antigen = rownames(exprMat)[i])
  }) %>% bind_rows()
  return(sumTab)
}

exprTab <- getMedian(sceCLL, useCluster = "typeCluster") %>% left_join(md) %>%
  mutate(exprVal = medVal)
sumTab <- group_by(exprTab, condition, CLLPD, antigen) %>%
  summarise(val = median(exprVal)) %>%
  mutate(colID = paste0(condition, "_",CLLPD)) %>%
  ungroup()

colAnno <- distinct(sumTab, condition, CLLPD, colID) %>%
  mutate(condition = factor(condition, levels = c("DMSO","CpG","CpG_Ever","Ever")),
         CLLPD = factor(CLLPD, levels = c("high","low"))) %>%
  arrange(condition, CLLPD) %>%
  mutate(Condition = condiMapS[as.character(condition)],
         `CLL-PD` = CLLPD) %>%
  select(colID, Condition, `CLL-PD`) %>%
  tibble() %>% column_to_rownames("colID")

exprMat <- select(sumTab, colID, antigen, val) %>%
  pivot_wider(names_from = colID, values_from = val) %>%
  data.frame() %>% column_to_rownames("antigen")
exprMat <- exprMat[,rownames(colAnno)]

Prepare annotation rows

annoColor <- list(`CLL-PD` = c(high = colList[3], low=colList[6]))
annoTab <- colAnno %>% as_tibble(rownames = "colID") %>%
  mutate(Condition = factor(Condition, levels = unique(Condition)))

pCondi <- ggplot(annoTab, aes(x=Condition, y= "  Condition")) +
  geom_tile(fill = "white",color = "black", size=0.5, alpha=0) +
  scale_y_discrete(position = "right", expand = c(0,0)) +
  coord_cartesian(expand = FALSE) +
  geom_text(aes(label = Condition),size=4.5, fontface = "plain", lineheight =0.8) + theme_void() +
  theme(axis.text.y = element_text(size=12, face = "bold"),
        panel.background = element_rect(size=1, color = "black"))

pPd <- ggplot(annoTab, aes(x=colID, y= "  CLL-PD")) +
  geom_tile(aes(fill = `CLL-PD`),color = "black", size=0.5, alpha=0.5) +
  scale_y_discrete(position = "right", expand = c(0,0)) +
  coord_cartesian(expand=FALSE) +
  scale_fill_manual(values = annoColor$`CLL-PD`) +
  geom_text(aes(label = `CLL-PD`),size=4.5, fontface = "plain") + theme_void() +
  theme(axis.text.y = element_text(size=12, face = "bold"), legend.position = "none",
        panel.background = element_rect(size=1, color = "black"),
        plot.margin = margin(1,0,2,0))


pColAnno <- plot_grid(pCondi, pPd, ncol=1, align = "hv",axis = "lr", rel_heights = c(1.4,1))
pColAnno

exprMat <- mscale(exprMat, center = TRUE, scale = FALSE, useMad = TRUE)
pHeat <- pheatmap::pheatmap(as.matrix(exprMat), scale = "none", cluster_cols = FALSE, 
                   gaps_col = c(2,4,6,8), border_color = "grey30",
                   treeheight_row = 20, clustering_method = "ward.D2", fontsize = 12, fontsize_row = 15,
                   show_colnames = FALSE, breaks = seq(-0.8,0.8, length.out = 100),
                   color = colorRampPalette(c(colList[2],"white",colList[1]))(101), silent = TRUE)$gtable
plot_grid(plot_grid(NULL, pColAnno,  NULL, rel_widths = c(0.47, 8, 2.1), nrow=1),
          pHeat, nrow=2, rel_heights = c(0.15,1))

14.3.2 Differential marker expression related to condition

14.3.2.1 In all CLL cells

Define all the necessary comparison groups and clusters to use

#comparison list
compareList <- list(c("DMSO","CpG"),
                 c("CpG", "CpG_Ever"))

#cluster to use
clusterUse <- "typeCluster"

14.3.2.2 Performing test

resDE_condition <- diffCondiDE(sceCLL, compareList, clusterUse) %>%
  mutate(p_adj = p.adjust(p_val, method = "BH"))

14.3.2.3 Volcano plots

plotVolcanoList <- function(resDE_condition, compareList, fdrCut = 0.05, clusterPlot = NULL, useFdr = TRUE,
                            globalPcut = NULL) {

  if (is.null(clusterPlot)) clusterPlot <- unique(resDE_condition$cluster_id)

  plotList <- lapply(clusterPlot, function(eachCluster) {

    eachList <- lapply(compareList, function(eachPair) {
      plotTab <- dplyr::filter(resDE_condition, cluster_id == eachCluster, control == eachPair[1], treat == eachPair[2])
      p <- plotCyToVolcano(plotTab, fdrCut = fdrCut, labSize = 5.2,
                           plotTitle = sprintf("%s vs %s",condiMap[eachPair[2]], condiMap[eachPair[1]]), 
                           useFdr=useFdr, globalPcut = globalPcut)
      p
    })

  })
  names(plotList) <- clusterPlot
  return(plotList)
}

14.3.2.3.1 DMSO, CpG, CpG + Ever

fdrCut = 0.05
globalPcut <- max(filter(resDE_condition, p_adj <= fdrCut)$p_val)
volList <- plotVolcanoList(resDE_condition, compareList, globalPcut = globalPcut)

shareLegend(list(volList$CLL[[1]] + ylim(0, 9) + xlim(-1.7,1.7),
                 volList$CLL[[2]] + ylim(0, 9) + xlim(-1.7,1.7) + 
                   theme(axis.text.y = element_blank(),
                         axis.ticks.y = element_blank(),
                         axis.title.y = element_blank())), 
            position = "bottom", ratio = c(1,0.05),
            rel_widths = c(0.55,0.5))

plotExprConditionFacet <- function(exprTab, markers, conditions, resDE_condition, clusterPlot = "CLL",ncol=3) {
    
  #prepare table for plot
  plotTab <- dplyr::filter(exprTab, cluster == clusterPlot, antigen %in% markers, condition %in% conditions) %>%
    mutate(antigen = factor(antigen, levels = markers),
           condiName = factor(condiMapS[as.character(condition)],levels = condiMapS[conditions]))
  
  pTab <- lapply(seq(length(conditions) -1), function(i) {
    tab <- dplyr::filter(resDE_condition, marker_id %in% markers, control == conditions[i], treat == conditions[i+1]) %>%
      mutate(x1 = i+0.1, x2 = i+0.9, xp = i+0.5)
    return(tab)
  }) %>% bind_rows() %>%
    select(marker_id, p_val, control, treat, x1, x2, xp)
  
  rangeTab <- group_by(plotTab, antigen) %>% summarise(yMax = max(medVal), yMin = min(medVal)) %>%
    mutate(yRange = yMax-yMin)
  
  pTab <- left_join(pTab, rangeTab, by = c(marker_id="antigen")) %>%
          mutate(annoP = ifelse(p_val < 1e-12, paste("italic(P)*'<",formatNum(1e-12, digits = 1),"'"),
                                paste("italic(P)*'=",formatNum(p_val, digits = 1),"'"))) %>%
    mutate(antigen = marker_id)
  
  scaleFUN <- function(x) sprintf("%.1f", x)

  p<-ggplot(plotTab, aes(x=condiName, y=exprVal)) + 
    geom_point(aes(col = CLLPD), size=3) +
    geom_line(aes(group = patient_id, col = CLLPD), linetype = "solid",size=1) +
    scale_y_continuous(labels=scaleFUN, limits = c(0,NA), expand = expand_scale(add = c(0.1,0.5)))+
    geom_blank(data = pTab, aes(x=1, y=yMax + yRange*0.5)) +
    geom_segment(data = pTab, aes(x=x1, xend =x2, y = yMax+yRange*0.3, yend = yMax+yRange*0.3)) +
    geom_text(data = pTab, aes(label = annoP, x=xp, y=yMax+yRange*0.3), vjust=-1, parse=TRUE, size=4) +
    guides(col = guide_legend(override.aes = list(alpha = 1, size = 3))) +
    scale_color_manual(values = colorMap, name = "CLL-PD") + 
    ylab("Median intensity") + xlab("")+
    facet_wrap(~antigen, scale = "free_y") +
    theme_half  + theme(legend.position = "bottom", 
                        strip.text = element_text(size=15, face= "bold"),
                        strip.background = element_rect(fill = "white",color = "white"),
                        axis.text.x = element_text(size=12),
                        legend.text = element_text(size=13),
                        legend.title = element_text(size=13),
                        legend.box.margin=margin(-20,0,0,0),
                        legend.margin=margin(0,0,0,0))
  
  return(p)
}

14.3.2.4 Key markers

plotExprConditionFacet(exprTab, c("CDK4","GLUT1","c-Myc","P-4E-BP1","P-S6K","P-AMPK alpha"),c("DMSO","CpG","CpG_Ever"), resDE_condition)

14.4 Differential marker expression related to CLL-PD

14.4.1 Using all cell type populations

#cluster to use
clusterUse <- "typeCluster"

#cluster to plot 
clusterPlot <- c("CLL")

#sce object for testing
sceTest <- subsetSCE(sceCLL, condiList = c("CpG","DMSO"))

exprTab <- filter(exprTab, condition %in% c("CpG","DMSO"))

14.4.1.1 Performing test

resDiffPd <- diffPdDE(sceTest,clusterUse) %>% mutate(p_adj = p.adjust(p_val, method = "BH"))

14.4.1.2 Summary of results

14.4.2 Visualize selected samples

markerList <- c("CDK4","GLUT1","c-Myc", "P-4E-BP1","P-S6K","P-AMPK alpha")
p <- plotDiffExpr(exprTab, markerList, resDiffPd, useCondi = "CpG")
p

14.4.2.1 Volcano plots (Supplementary)

Volcano plot

plotVolcanoListPD <- function(resDE_CLLPD, fdrCut = 0.05, clusterPlot = NULL, useFdr = FALSE) {
  
  if (is.null(clusterPlot)) clusterPlot <- unique(resDE_CLLPD$cluster_id)
  
  plotList <- lapply(clusterPlot, function(eachCluster) {
    
    eachList <- lapply(unique(resDE_CLLPD$condition), function(eachCondi) {
      plotTab <- dplyr::filter(resDE_CLLPD, cluster_id == eachCluster, condition == eachCondi)
      p <- plotCyToVolcano(plotTab, fdrCut = fdrCut, 
                       plotTitle = sprintf("high vs low CLL-PD (%s)", condiMap[eachCondi]),
                       useFdr = useFdr)
      p
    })
    names(eachList) <- unique(resDE_CLLPD$condition)
    return(eachList)
  })
  names(plotList) <- clusterPlot
  return(plotList)
}
pVolPD <- plotVolcanoListPD(resDiffPd)

14.4.2.2 Upon CpG treatment

resDiffPd.cpg <- filter(resDiffPd, condition == "CpG") %>% 
  mutate(p_adj = p.adjust(p_val, method = "BH"))

pVolPD <- plotVolcanoListPD(resDiffPd.cpg, fdrCut = 0.1, useFdr = TRUE)

p1 <- pVolPD$CLL$CpG + ylim(0, 3) + xlim(-0.8,0.8)

p1

rm(sceTest)
rm(sceCLL)

14.5 Identify markers (other than proliferating markers) that differentiate proliferatign and non-proliferating cells

cellSample <- function(sceObj, useCluster, nCell, poolSample = FALSE) {
  
  sceObj$index <- seq(ncol(sceObj))
  
  if (!poolSample) {
    sampleTab <- colData(sceObj) %>% data.frame() %>%
      group_by(sample_id, !!sym(useCluster)) %>% sample_n(nCell) 
  } else{
    sampleTab <- colData(sceObj) %>% data.frame() %>%
      group_by(!!sym(useCluster)) %>% sample_n(nCell) 
  }
  
  sceObj <- sceObj[,sceObj$index %in% sampleTab$index]
  sceObj <- sceObj[, sample(seq(ncol(sceObj)))]
  
  return(sceObj)
}

#Functions for running glm
identifyMarker <- function(sceObj, useCluster, useMarker, nCell = 500, poolSample=FALSE, 
                            repeats=20, folds = 10, lambda = "lambda.1se") {
  
  sceObj <- sceObj[rownames(sceObj) %in% useMarker,]
  
  modelList <- list()
  lambdaList <- c()
  alpha = 1
  coefMat <- matrix(NA, nrow(sceObj), repeats)
  rownames(coefMat) <- rownames(sceObj)
  
  for (i in seq(repeats)) {
      
    #sample cells
    sceSub <- cellSample(sceObj, useCluster,nCell,poolSample)

    
    y <- droplevels(sceSub[[useCluster]])
    X <- t(assay(sceSub))
    
    res <- cv.glmnet(X,y, type.measure = "auc", family="binomial", 
                     nfolds = folds, alpha = alpha, standardize = TRUE)
    
    lambdaList <- c(lambdaList, res[[lambda]])
    
    modelList[[i]] <- res
      
    coefModel <- coef(res, s = lambda)[-1] #remove intercept row
    coefMat[,i] <- coefModel
  }
  list(modelList = modelList, lambdaList = lambdaList, coefMat = coefMat)
}

#Function for the heatmap plot
lassoPlotMarker <- function(lassoOut, sceObj, annoColor, freqCut = 0, coefCut = 0.01, useCluster = "stateCluster", 
                            annoMarker = c("Ki-67","Cyclin B1","P-Rb")) {
  
  
  #for the barplot on the left of the heatmap
  barValue <- rowMeans(lassoOut$coefMat)
  freqSeq <- rowMeans(lassoOut$coefMat !=0)
  
  #filter
  barValue <- barValue[barValue > coefCut & freqSeq >= freqCut]
  
  barValue <- barValue[order(barValue, decreasing = TRUE)]

  #for the heatmap and scatter plot below the heatmap
  colnames(sceObj) <- paste0("c",seq(ncol(sceObj)))
  exprMat <- t(scaleExprs(t(assay(sceObj))))
  #exprMat <- mscale(exprMat, censor =5)
  
  #prepare column annotation table
  
  colAnnoTab <- data.frame(row.names = colnames(exprMat),
                           state = sceObj[[useCluster]])
  
  annoMarkerTab <- exprMat[annoMarker,]
  colAnnoTab <- cbind(colAnnoTab, t(annoMarkerTab))
  colAnnoTab <- colAnnoTab[order(colAnnoTab$state),]
  
  #remove annotation markers
  exprMat <- exprMat[!rownames(exprMat) %in% annoMarker, ]
  exprMat <- exprMat[names(barValue),] #reorder row by coefficient
  exprMat <- exprMat[,rownames(colAnnoTab)] #order column by cluster
  #construct heatmap
  
  
  
  myCol <- colorRampPalette(c('blue','yellow','red'), 
                 space = "Lab")
  
  haCol <- ComplexHeatmap::HeatmapAnnotation(df = colAnnoTab,  col = annoColor,
                                             which = "column",
                                             show_legend = c(FALSE,TRUE,TRUE,TRUE),
                                             annotation_name_gp = gpar(fontface = "bold"),
                                             annotation_legend_param = list(
                                               grid_height = unit(0.6,"cm"),
                                               title_gp = gpar(cex=1.1)
                                             ), simple_anno_size = unit(0.7,"cm"))
  
  #bar annotation on the left
  annoBar = rowAnnotation(`importance score` = anno_barplot(barValue, baseline = 0, border = FALSE,width = unit(3.5,"cm"),
                                               gp = gpar(fill = "grey80"),
                                               axis_param = list(direction = "reverse"),
                                               at = c(0,0.5,1)))
  
  ComplexHeatmap::Heatmap(exprMat, col = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100),
                          name = "scaled intensity", top_annotation = haCol,
                          left_annotation = annoBar, show_column_names = FALSE,
                          #top_annotation = haCol, left_annotation = haRow, show_column_names = FALSE,
                          cluster_columns  = FALSE, cluster_rows = FALSE,
                          column_title = "",
                          heatmap_legend_param = list(title_position =  "lefttop-rot", grid_height = unit(0.8,"cm"),
                                                      title_gp = gpar(cex=1.1)),
                          use_raster = TRUE)
                        
}

14.5.1 In CLL-PD high samples with proliferation

Sample equal number of cells from proliferating and non-proliferating compartment

cMap <- c(CLL_resting = "non-proliferating",CLL_proliferating = "proliferating")
sceSub <- sceAll[,sceAll$typeCluster =="CLL" & sceAll$condition == "CpG" & sceAll$CLLPD == "high" & sceAll$patient_id!="H272"]
sceSub$stateCluster <- factor(cMap[as.character(sceSub$stateCluster)], levels = cMap)

useMarker <- setdiff(state_markers(sceSub),c("Ki-67","Cyclin B1","P-Rb"))
rm(sceAll)

set.seed(1118)
lassoOut <- identifyMarker(sceSub, "stateCluster", useMarker, nCell = 500, repeats = 100, poolSample = FALSE)

sceVis <- cellSample(sceSub, "stateCluster", 500)

col_fun1 = circlize::colorRamp2(c(0,1), c("white",colList[4]))
col_fun2 = circlize::colorRamp2(c(0,1), c("white",colList[6]))
col_fun3 = circlize::colorRamp2(c(0,1), c("white",colList[5]))
annoColor <- list(state = c(proliferating = "#BD3D2A4E", `non-proliferating` = "#0071B87E"), 
                  `Ki-67` = col_fun3,
                  `Cyclin B1` = col_fun3,
                  `P-Rb` = col_fun3)

lassoPlotMarker(lassoOut, sceVis, annoColor = annoColor, coefCut = 0, freqCut = 0.8)

15 End of session

sessionInfo()

## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS  10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] MOFA_1.4.0                  forcats_0.5.1              
##  [3] stringr_1.4.0               dplyr_1.0.7                
##  [5] purrr_0.3.4                 readr_2.0.0                
##  [7] tidyr_1.1.3                 tibble_3.1.3               
##  [9] tidyverse_1.3.1             lubridate_1.7.10           
## [11] diffcyt_1.8.8               circlize_0.4.13            
## [13] RColorBrewer_1.1-2          CATALYST_1.12.2            
## [15] SingleCellExperiment_1.10.1 scattermore_0.7            
## [17] ComplexHeatmap_2.4.3        ggraph_2.0.5               
## [19] igraph_1.2.6                tidygraph_1.2.0            
## [21] glmnet_4.1-2                Matrix_1.3-4               
## [23] UpSetR_1.4.0                pheatmap_1.0.12            
## [25] GEOquery_2.56.0             limma_3.44.3               
## [27] fgsea_1.14.0                grImport_0.9-3             
## [29] XML_3.99-0.6                ggbeeswarm_0.6.0           
## [31] ggrepel_0.9.1               egg_0.4.5                  
## [33] cowplot_1.1.1               car_3.0-11                 
## [35] carData_3.0-4               gridExtra_2.3              
## [37] maxstat_0.7-25              survminer_0.4.9            
## [39] ggpubr_0.4.0                ggplot2_3.3.5              
## [41] survival_3.2-11             MultiAssayExperiment_1.14.0
## [43] sva_3.36.0                  BiocParallel_1.22.0        
## [45] genefilter_1.70.0           mgcv_1.8-36                
## [47] nlme_3.1-152                DESeq2_1.28.1              
## [49] SummarizedExperiment_1.18.2 DelayedArray_0.14.1        
## [51] matrixStats_0.60.0          Biobase_2.48.0             
## [53] GenomicRanges_1.40.0        GenomeInfoDb_1.24.2        
## [55] IRanges_2.22.2              S4Vectors_0.26.1           
## [57] BiocGenerics_0.34.0         reticulate_1.20            
## [59] mofaCLL_0.0.9               BiocStyle_2.16.1           
## 
## loaded via a namespace (and not attached):
##   [1] rsvd_1.0.5                  ps_1.6.0                   
##   [3] foreach_1.5.1               rprojroot_2.0.2            
##   [5] crayon_1.4.1                MASS_7.3-54                
##   [7] backports_1.2.1             reprex_2.0.0               
##   [9] rlang_0.4.11                XVector_0.28.0             
##  [11] readxl_1.3.1                irlba_2.3.3                
##  [13] nloptr_1.2.2.2              callr_3.7.0                
##  [15] scater_1.16.2               rjson_0.2.20               
##  [17] bit64_4.0.5                 glue_1.4.2                 
##  [19] processx_3.5.2              vipor_0.4.5                
##  [21] AnnotationDbi_1.50.3        vsn_3.56.0                 
##  [23] haven_2.4.1                 tidyselect_1.1.1           
##  [25] km.ci_0.5-2                 usethis_2.0.1              
##  [27] rio_0.5.27                  zoo_1.8-9                  
##  [29] nnls_1.4                    xtable_1.8-4               
##  [31] magrittr_2.0.1              evaluate_0.14              
##  [33] cli_3.0.1                   zlibbioc_1.34.0            
##  [35] rstudioapi_0.13             bslib_0.2.5.1              
##  [37] fastmatch_1.1-3             BiocSingular_1.4.0         
##  [39] xfun_0.24                   clue_0.3-59                
##  [41] pkgbuild_1.2.0              cluster_2.1.2              
##  [43] ggtext_0.1.1                png_0.1-7                  
##  [45] withr_2.4.2                 bitops_1.0-7               
##  [47] ggforce_0.3.3               RBGL_1.64.0                
##  [49] plyr_1.8.6                  cellranger_1.1.0           
##  [51] ncdfFlow_2.34.0             pillar_1.6.2               
##  [53] RcppParallel_5.1.4          GlobalOptions_0.1.2        
##  [55] cachem_1.0.5                multcomp_1.4-17            
##  [57] fs_1.5.0                    CytoML_2.0.5               
##  [59] GetoptLong_1.0.5            DelayedMatrixStats_1.10.1  
##  [61] vctrs_0.3.8                 ellipsis_0.3.2             
##  [63] generics_0.1.0              devtools_2.4.2             
##  [65] tools_4.0.2                 foreign_0.8-81             
##  [67] beeswarm_0.4.0              munsell_0.5.0              
##  [69] tweenr_1.0.2                fastmap_1.1.0              
##  [71] compiler_4.0.2              pkgload_1.2.1              
##  [73] abind_1.4-5                 sessioninfo_1.1.1          
##  [75] GenomeInfoDbData_1.2.3      edgeR_3.30.3               
##  [77] lattice_0.20-44             utf8_1.2.2                 
##  [79] jsonlite_1.7.2              affy_1.66.0                
##  [81] scales_1.1.1                graph_1.66.0               
##  [83] doParallel_1.0.16           latticeExtra_0.6-29        
##  [85] rmarkdown_2.9               openxlsx_4.2.4             
##  [87] sandwich_3.0-1              Rtsne_0.15                 
##  [89] yaml_2.2.1                  plotrix_3.8-1              
##  [91] extraDistr_1.9.1            cytolib_2.0.3              
##  [93] flowWorkspace_4.0.6         htmltools_0.5.1.1          
##  [95] memoise_2.0.0               locfit_1.5-9.4             
##  [97] graphlayouts_0.7.1          viridisLite_0.4.0          
##  [99] digest_0.6.27               assertthat_0.2.1           
## [101] KMsurv_0.1-5                RSQLite_2.2.7              
## [103] remotes_2.4.0               data.table_1.14.0          
## [105] blob_1.2.2                  flowCore_2.0.1             
## [107] preprocessCore_1.50.0       survMisc_0.5.5             
## [109] drc_3.0-1                   splines_4.0.2              
## [111] labeling_0.4.2              Rhdf5lib_1.10.1            
## [113] proDA_1.2.0                 gridtext_0.1.4             
## [115] RCurl_1.98-1.3              broom_0.7.9                
## [117] hms_1.1.0                   modelr_0.1.8               
## [119] rhdf5_2.32.4                colorspace_2.0-2           
## [121] ConsensusClusterPlus_1.52.0 base64enc_0.1-3            
## [123] BiocManager_1.30.16         shape_1.4.6                
## [125] sass_0.4.0                  Rcpp_1.0.7                 
## [127] bookdown_0.22               mvtnorm_1.1-2              
## [129] FlowSOM_1.20.0              RProtoBufLib_2.0.0         
## [131] fansi_0.5.0                 tzdb_0.1.2                 
## [133] R6_2.5.0                    ggridges_0.5.3             
## [135] lifecycle_1.0.0             zip_2.2.0                  
## [137] curl_4.3.2                  ggsignif_0.6.2             
## [139] affyio_1.58.0               minqa_1.2.4                
## [141] testthat_3.0.4              jquerylib_0.1.4            
## [143] desc_1.3.0                  TH.data_1.0-10             
## [145] iterators_1.0.13            htmlwidgets_1.5.3          
## [147] polyclip_1.10-0             markdown_1.1               
## [149] crosstalk_1.1.1             rvest_1.0.1                
## [151] exactRankTests_0.8-32       codetools_0.2-18           
## [153] gtools_3.9.2                prettyunits_1.1.1          
## [155] dbplyr_2.1.1                gtable_0.3.0               
## [157] tsne_0.1-3                  DBI_1.1.1                  
## [159] httr_1.4.2                  highr_0.9                  
## [161] stringi_1.7.3               vroom_1.5.3                
## [163] reshape2_1.4.4              farver_2.1.0               
## [165] annotate_1.66.0             viridis_0.6.1              
## [167] hexbin_1.28.2               Rgraphviz_2.32.0           
## [169] magick_2.7.2                DT_0.18                    
## [171] xml2_1.3.2                  ggcyto_1.16.0              
## [173] boot_1.3-28                 BiocNeighbors_1.6.0        
## [175] lme4_1.1-27.1               geneplotter_1.66.0         
## [177] bit_4.0.4                   jpeg_0.1-9                 
## [179] pkgconfig_2.0.3             rstatix_0.7.0              
## [181] corrplot_0.90               knitr_1.33

Multi-omics reveals clinically relevant proliferative drive associated with mTOR-MYC-OXPHOS activity in chronic lymphocytic leukemia

4 August 2021

Contents

1 Introduction

2 Prepare MOFA model

2.1 Prepare each single omic dataset

2.1.1 Drug screening

2.1.2 RNA sequencing

2.1.3 DNA methylation array

2.1.4 Genetics

2.1.5 Assemble multiAssayExperiment object

2.2 Build MOFA object

2.3 Setup MOFA training parameters

2.4 Run MOFA model

3 Overview of MOFA output

3.1 Variance explained by MOFA for each omic data

3.2 The first two latent factors represents IGHV and trisomy12

3.2.1 Plot the separation of the samples by the first two factors

3.2.2 Loadings of genomic variations on the first two factors

3.2.3 Associations between Factor 1 and epigenetic subtypes

4 Associations of latent factors to clinical behaviors

4.1 Univariate Cox regression

4.2 Kaplan-Meiler plots

4.2.1 KM plot for overall survival (OS)

4.2.2 KM plot for time to treatment (TTT)

4.2.3 KM plot for subgroup defined by IGHV status and median latent factor values

4.2.4 KM plot for subgroup defined by median latent factor values of F1 and F4 (Supplementary Figure)

4.3 Multi-variate Cox regression for Factor 4 (F4)

4.4 Correlation between F4 and demographics

4.5 Association between F4 and outcomes in treatment-naive patients

4.5.1 Univariate Cox regression

4.5.2 Kaplan-Meiler plots

4.6 Multi-variate Cox regression

4.7 Correlation between F4 and Lymphocyte doubling time

4.7.1 Univariate test

4.7.2 Consider IGHV status

4.7.2.1 IGHV as a covariate

4.7.2.2 Only M-CLL

4.7.2.3 Only U-CLL

4.7.3 Variance expalined for lymphocyte doubling time

5 Annotation for other four factor that only explain RNA expression variations

5.1 Factor 3

5.2 Factor 5

5.3 Factor 6

5.4 Factor 7

5.5 Arrange plots for Section 1 in the manuscript

5.5.1 Figure 1 in main text

5.5.2 Forest plot showing values of variance explained for each view

5.5.3 Annotation of LF1 for Supplementary figure

5.5.4 KM plots for Supplementary Figure

5.5.5 Lymphocyte doubling time for supplementary figure

5.5.6 Associations with demographics

5.5.7 Associations with outcomes in treatment-naive patients

5.5.8 Annotatons of other factors

6 Validate CLL-PD in external, independent cohorts

6.1 Test whether CLL-PD can be computed using single-omic dataset

6.1.1 Using gene expression dataset for predicting LF4

6.1.2 Using methylation dataset for predicting LF4

6.1.3 Using drug responses

6.1.4 Using gene mutations

6.1.5 Compare the prediction accuracy (variance explained for CLL-PD)

6.2 Estimate and test F4 (CLL-PD) in external cohorts using gene expression

6.2.1 ICGC-CLL cohort with RNAseq data (EGAS00001000374)

6.2.1.1 Pre-processing data

6.2.2 Predict factor value in ICGC cohort

6.2.3 Test assocation with clincial outcome

6.2.3.1 Univariate test

6.2.3.2 Combine IGHV and CLL-PD

6.2.3.3 Multivariate test

6.2.3.4 Correlation between predicted CLL-PD and demographics

6.2.4 Association with genomics

6.2.5 Assocation with mutation load

6.2.6 Enrichment analysis

6.3 Three external cohorts with microarray data

6.3.1 Query data from GEO and format datasets

6.3.1.1 GSE22762 (Munic cohort)

6.3.1.2 GSE39671 (UCSD cohort)

6.3.1.3 GSE10138 (Duke cohort)

6.3.2 Test associations with outcomes

6.3.2.1 Munic cohort (GSE22762)