This analysis resulted from an idea in a comment here. In short we were seeing particularly low VAF for Lancet TCGA data and were wondering whether this had to do with the data being WXS instead of WGS.
Lancet was doing poorly with WXS samples when compared on a within participant basis.
Declare names of input and output directories.
# magrittr pipe
`%>%` <- dplyr::`%>%`
data_dir <- file.path("..", "..", "..", "data")
scratch_dir <- file.path("..", "..", "..", "scratch")
Read in the metadata.
metadata <- readr::read_tsv(file.path(data_dir, "pbta-histologies.tsv"))
Parsed with column specification:
cols(
.default = col_character(),
OS_days = [32mcol_double()[39m,
age_last_update_days = [32mcol_double()[39m,
normal_fraction = [32mcol_double()[39m,
tumor_fraction = [32mcol_double()[39m,
tumor_ploidy = [32mcol_double()[39m,
molecular_subtype = [33mcol_logical()[39m
)
See spec(...) for full column specifications.
493 parsing failures.
row col expected actual file
2334 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../../data/pbta-histologies.tsv'
2335 molecular_subtype 1/0/T/F/TRUE/FALSE Group4 '../../../data/pbta-histologies.tsv'
2336 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../../data/pbta-histologies.tsv'
2337 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../../data/pbta-histologies.tsv'
2338 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../../data/pbta-histologies.tsv'
.... ................. .................. ...... ....................................
See problems(...) for more details.
Retrieve all the participant IDs for participants that have both WGS and WXS data.
matched_participants <- metadata %>%
dplyr::filter(experimental_strategy != "RNA-Seq") %>%
dplyr::group_by(Kids_First_Participant_ID) %>%
dplyr::summarize(strategies = paste0(experimental_strategy, collapse = ",")) %>%
dplyr::filter(grepl("WXS", strategies) & grepl("WGS", strategies)) %>%
dplyr::pull(Kids_First_Participant_ID)
Get the biospecimen IDS for these participants.
biospecimens <- metadata %>%
dplyr::filter(Kids_First_Participant_ID %in% matched_participants) %>%
dplyr::pull(Kids_First_Biospecimen_ID)
Connect to SQLite database.
# Start up connection
con <- DBI::dbConnect(
RSQLite::SQLite(),
file.path(scratch_dir, "v11_snv_db.sqlite")
)
Make a list of columns we will keep.
cols_to_keep <- c(
"Chromosome",
"Start_Position",
"End_Position",
"Reference_Allele",
"Allele",
"Tumor_Sample_Barcode",
"Variant_Classification",
"VAF"
)
Set up the Lancet data from the SQL database and only keep the biospecimens we identified.
lancet <- dplyr::tbl(con, "lancet") %>%
dplyr::select(cols_to_keep) %>%
dplyr::inner_join(
dplyr::tbl(con, "samples") %>%
dplyr::select(
Tumor_Sample_Barcode = Kids_First_Biospecimen_ID,
experimental_strategy, short_histology,
Kids_First_Participant_ID
)
) %>%
dplyr::filter(Tumor_Sample_Barcode %in% biospecimens) %>%
as.data.frame()
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
Joining, by = "Tumor_Sample_Barcode"
Plot this as a violin plot
vaf_plot <- ggplot2::ggplot(lancet, ggplot2::aes(
x = experimental_strategy,
y = VAF,
fill = experimental_strategy
)) +
ggplot2::geom_violin() +
ggplot2::theme_classic() +
ggplot2::ggtitle("Lancet participants with WXS and WGS")
Print out plot with all participants together.
vaf_plot
Split up data by participant.
vaf_plot + ggplot2::facet_wrap(~ Kids_First_Participant_ID)
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 dbplyr_1.4.2 pillar_1.4.2 compiler_3.6.0 base64enc_0.1-3
[6] tools_3.6.0 digest_0.6.20 bit_1.1-14 jsonlite_1.6 RSQLite_2.1.1
[11] evaluate_0.14 memoise_1.1.0 tibble_2.1.3 gtable_0.3.0 pkgconfig_2.0.2
[16] rlang_0.4.0 DBI_1.0.0 rstudioapi_0.10 xfun_0.8 dplyr_0.8.3
[21] knitr_1.23 hms_0.4.2 bit64_0.9-7 grid_3.6.0 tidyselect_0.2.5
[26] glue_1.3.1 R6_2.4.0 rmarkdown_1.13 readr_1.3.1 purrr_0.3.2
[31] ggplot2_3.2.0 blob_1.1.1 magrittr_1.5 scales_1.0.0 htmltools_0.3.6
[36] rsconnect_0.8.13 assertthat_0.2.1 colorspace_1.4-1 labeling_0.3 lazyeval_0.2.2
[41] munsell_0.5.0 crayon_1.3.4