Purpose: How do the TMB calculations change with adjustments to the VAF cutoff filter? More specifically, if we increase VAF filter cutoff, do the callers’ TMB stats relate to each other more?
VarDict is the least related to the other callers, but increasing the VAF filter cutoff does somewhat recover how its TMB stats relate to the other callers’ TMB stats. Regardless, VarDict’s data should still be dropped as it is too aberrant from the other callers as well as overly sensitive.
To run this from the command line, use:
Rscript -e "rmarkdown::render('analyses/snv-callers/vaf_cutoff_experiment.Rmd',
clean = TRUE)"
This assumes you are in the top directory of the repository.
Read in set up script.
if (!("GGally" %in% installed.packages())) {
install.packages("GGally")
}
if (!("ggupset" %in% installed.packages())) {
install.packages("ggupset")
}
# Magrittr pipe
`%>%` <- dplyr::`%>%`
Set up output directories.
base_results_dir <- "results"
base_plots_dir <- "plots"
Make new directories for the comparison analysis.
vaf_cutoff_results_dir <- file.path(base_results_dir, "vaf_cutoff_data")
# Make caller specific plots folder
if (!dir.exists(vaf_cutoff_results_dir)) {
dir.create(vaf_cutoff_results_dir)
}
Function for creating a TMB correlation matrix amongst the callers for a particular cutoff.
cor_matrix_plot <- function(cutoff) {
# For a given VAF cutoff, calculate TMB correlations amongst the callers and plot it using
# GGally::ggpairs
#
# Args:
# cutoff: The VAF cutoff set to analyze
#
# Returns:
# A correlation matrix plot amongst the callers
# Isolate the TMB stats for this cutoff
cutoff_df <- tmb_filter_long %>%
dplyr::filter(vaf_cutoff == cutoff) %>%
dplyr::select("lancet", "mutect2", "strelka2", "vardict")
# Make the plot
cor_plot <- GGally::ggpairs(cutoff_df, mapping = ggplot2::aes(alpha = 0.05)) +
ggplot2::theme_classic() +
ggplot2::ggtitle(paste0("VAF cutoff: ", cutoff))
# Return the plot
return(cor_plot)
}
Function for correlating TMB statistics for each VAF cutoff set.
cor_by_vaf <- function(cutoff = 0.1) {
# For a given VAF cutoff, isolate the TMB statistics and correlate them.
#
# Args:
# cutoff: the VAF cutoff to calculate the correlation by.
#
# Returns:
# R values for Spearman's correlation amongst the callers
cutoff_df <- tmb_filter_long %>%
dplyr::filter(vaf_cutoff == cutoff) %>%
dplyr::select(-Tumor_Sample_Barcode, -vaf_cutoff)
# Do correlations between callers
cor_res <- cor(cutoff_df, method = "spearman", use = "na.or.complete")
# This is to remove redundancy as upper correlation matrix == lower
cor_res[upper.tri(cor_res, diag = TRUE)] <- NA
# Format into long format data.frame
cor_df <- reshape2::melt(cor_res, na.rm = TRUE, value.name = "cor") %>%
dplyr::mutate(
compare = paste0(Var1, "-", Var2),
cutoff
)
# Return a data.frame ready for being added to the bigger data.frame for plotting.
return(cor_df)
}
We need to re-calculate TMB statistics with various VAF filter cutoffs. We will run this script to do it: system("bash scripts/run_caller_evals_vaf_filter_exp.sh")
Get the lists of each type of file.
# Make a list of the vaf filter experiment tmb files
filter_tmb_files <- list.files(vaf_cutoff_results_dir,
pattern = "_tmb.rds$", recursive = TRUE,
full.names = TRUE
)
Take a look at the TMBs from different VAF filter cutoffs.
filter_tmb_files
[1] "results/vaf_cutoff_data/cutoff_0.10/lancet_tmb.rds" "results/vaf_cutoff_data/cutoff_0.10/mutect2_tmb.rds"
[3] "results/vaf_cutoff_data/cutoff_0.10/strelka2_tmb.rds" "results/vaf_cutoff_data/cutoff_0.10/vardict_tmb.rds"
[5] "results/vaf_cutoff_data/cutoff_0.20/lancet_tmb.rds" "results/vaf_cutoff_data/cutoff_0.20/mutect2_tmb.rds"
[7] "results/vaf_cutoff_data/cutoff_0.20/strelka2_tmb.rds" "results/vaf_cutoff_data/cutoff_0.20/vardict_tmb.rds"
[9] "results/vaf_cutoff_data/cutoff_0.30/lancet_tmb.rds" "results/vaf_cutoff_data/cutoff_0.30/mutect2_tmb.rds"
[11] "results/vaf_cutoff_data/cutoff_0.30/strelka2_tmb.rds" "results/vaf_cutoff_data/cutoff_0.30/vardict_tmb.rds"
[13] "results/vaf_cutoff_data/cutoff_0.40/lancet_tmb.rds" "results/vaf_cutoff_data/cutoff_0.40/mutect2_tmb.rds"
[15] "results/vaf_cutoff_data/cutoff_0.40/strelka2_tmb.rds" "results/vaf_cutoff_data/cutoff_0.40/vardict_tmb.rds"
Convert the VAF cutoffs into their own vector.
vaf_filter <- stringr::word(filter_tmb_files, sep = "/", 3)
vaf_filter <- as.numeric(gsub("cutoff_", "", vaf_filter))
vaf_filter
[1] 0.1 0.1 0.1 0.1 0.2 0.2 0.2 0.2 0.3 0.3 0.3 0.3 0.4 0.4 0.4 0.4
Get caller information as a vector.
caller_filter <- stringr::word(filter_tmb_files, sep = "/", 4)
caller_filter <- gsub("_tmb.rds", "", caller_filter)
caller_filter
[1] "lancet" "mutect2" "strelka2" "vardict" "lancet" "mutect2" "strelka2" "vardict" "lancet" "mutect2"
[11] "strelka2" "vardict" "lancet" "mutect2" "strelka2" "vardict"
Read in the data. Name each with their caller and VAF filter cutoff.
Turn this into a data frame.
tmb_filter_df <- dplyr::bind_rows(filter_tmb_list, .id = "NAME") %>%
dplyr::mutate(
caller = stringr::word(NAME, sep = "_", 1),
vaf_cutoff = as.numeric(stringr::word(NAME, sep = "_", 2))
)
Make this into a long form data.frame for plotting purposes.
tmb_filter_long <- tmb_filter_df %>%
dplyr::distinct(Tumor_Sample_Barcode, caller, tmb, vaf_cutoff) %>%
tidyr::spread(caller, tmb)
First we’ll make a series of corrletion matrix plots.
lapply(unique(vaf_filter), cor_matrix_plot)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
[[1]]
[[2]]
[[3]]
[[4]]
Let’s make a summary plot of the Spearman’s correlations. Get correlations by vaf cutoff.
# Run this on each set
cor_by_vaf_df <- do.call(
"rbind.data.frame",
lapply(unique(vaf_filter), cor_by_vaf)
)
Plot the correlations and the vaf_cutoff per each caller combination.
ggplot2::ggplot(cor_by_vaf_df, ggplot2::aes(x = reorder(compare, -cor), fill = as.factor(cutoff), y = cor)) +
ggplot2::geom_point(shape = 21, size = 2.5, stroke = .75, color = "black") +
ggplot2::theme_classic() +
ggupset::scale_x_mergelist(sep = "-") +
ggupset::axis_combmatrix(sep = "-") +
ggplot2::xlab("") +
ggplot2::ylab("TMB Spearman Correlation") +
ggplot2::scale_fill_brewer("VAF Filter Cutoff", palette = "YlGnBu")
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 rstudioapi_0.10 GGally_1.4.0 knitr_1.23 magrittr_1.5
[6] progress_1.2.2 hms_0.4.2 ggupset_0.1.0 munsell_0.5.0 tidyselect_0.2.5
[11] colorspace_1.4-1 R6_2.4.0 rlang_0.4.0 plyr_1.8.4 stringr_1.4.0
[16] dplyr_0.8.3 tools_3.6.0 grid_3.6.0 gtable_0.3.0 xfun_0.8
[21] htmltools_0.3.6 lazyeval_0.2.2 assertthat_0.2.1 digest_0.6.20 tibble_2.1.3
[26] crayon_1.3.4 reshape2_1.4.3 purrr_0.3.2 readr_1.3.1 RColorBrewer_1.1-2
[31] ggplot2_3.2.0 glue_1.3.1 labeling_0.3 stringi_1.4.3 compiler_3.6.0
[36] pillar_1.4.2 prettyunits_1.0.2 scales_1.0.0 reshape_0.8.8 pkgconfig_2.0.2