Imputation workflow for low coverage whole genome sequencing data

public 1yr ago Version: v0.2.0 0 bookmarks

View Workflow

lcwgs-imputation-workflow — View Workflow

This workflow performs imputation from low-coverage whole-genome sequencing data with QUILT. It can also benchmark both QUILT and GLIMPSE under different scenarios.

Dependencies

QUILT (QUILT_prepare_reference.R, QUILT.R)
GLIMPSE v2.0 (GLIMPSE2_split_reference, GLIMPSE2_phase, GLIMPSE2_ligate)
GLIMPSE v1.1.1 (GLIMPSE_chunk, GLIMPSE_phase, GLIMPSE_ligate)
samtools
bcftools

Usage

The usage of this workflow is described in the Snakemake Workflow Catalog.

If you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this (original) repository, https://github.com/Zilong-Li/lcWGS-imputation-workflow, and its DOI (see above).

Code Snippets

shell:
    """
    (
    if [ -s {params.af} ];then perl -lane 'print join(":",@F[0..3])."\\t$F[4]"' {params.af} > {output.tmp}; \
    else \
        bcftools +fill-tags {input.sites[0]} -- -t AF |  bcftools query -f '{params.ql1}' > {output.tmp}; \
    fi
    awk '{params.awk}' <(bcftools query -f '{params.ql0}' {input.sites[0]}) {output.tmp} >{output.af}
    bcftools view -s {params.samples} {params.truth} | bcftools query -f '{params.ql2}' | sed -E 's/\/|\|/\\t/g' > {output.tmp2}
    awk '{params.awk2}' <(bcftools query -f '{params.ql0}' {input.sites[0]}) {output.tmp2} >{output.gt}
    ) &> {log}
    """

SnakeMake BCFtools From line 30 of rules/accuracy.smk

shell:
    """
    bcftools query -f '{params.ql2}' -s {params.samples} {input} | sed -E 's/\/|\|/\\t/g' > {output}
    """

SnakeMake BCFtools From line 63 of rules/accuracy.smk

shell:
    """
    bcftools query -f '{params.ql2}' -s {params.samples} {input} | sed -E 's/\/|\|/\\t/g' > {output}
    """

SnakeMake BCFtools From line 87 of rules/accuracy.smk

shell:
    """
    bcftools query -f '{params.ql2}' -s {params.samples} {input} | sed -E 's/\/|\|/\\t/g' > {output}
    """

SnakeMake BCFtools From line 111 of rules/accuracy.smk

script:
    "../scripts/accuracy_single.R"

SnakeMake From line 143 of rules/accuracy.smk

script:
    "../scripts/accuracy_single.R"

SnakeMake From line 171 of rules/accuracy.smk

script:
    "../scripts/accuracy_single.R"

SnakeMake From line 199 of rules/accuracy.smk

script:
    "../scripts/accuracy_quilt.R"

SnakeMake From line 230 of rules/accuracy.smk

shell:
    """
    bcftools query -f '{params.ql2}' -s {params.samples} {input} | sed -E 's/\/|\|/\\t/g' > {output}
    """

SnakeMake BCFtools From line 252 of rules/accuracy.smk

shell:
    """
    bcftools query -f '{params.ql2}' -s {params.samples} {input} | sed -E 's/\/|\|/\\t/g' > {output}
    """

SnakeMake BCFtools From line 276 of rules/accuracy.smk

script:
    "../scripts/accuracy_single.R"

SnakeMake From line 306 of rules/accuracy.smk

script:
    "../scripts/accuracy_single.R"

SnakeMake From line 330 of rules/accuracy.smk

script:
    "../scripts/accuracy_panelsize.R"

SnakeMake From line 367 of rules/accuracy.smk

script:
    "../scripts/accuracy_depth.R"

SnakeMake From line 404 of rules/accuracy.smk

shell:
    """
    (
    if [ {wildcards.depth} == 0 ];then \
        samtools view -o {output} {params.bam} {wildcards.chrom} && samtools index {output} \
    ;else\
        FRAC=$(echo "scale=4 ; {wildcards.depth} / {params.depth}" | bc -l) && \
        samtools view -s $FRAC -o {output} {params.bam} {wildcards.chrom} && samtools index {output} \
    ; fi
    ) &> {log}
    """

SnakeMake SAMtools From line 15 of rules/downsample.smk

shell:
    """ echo {input} | tr ' ' '\\n' > {output} """

SnakeMake From line 39 of rules/downsample.smk

shell:
    """
    (
    if [ -s {params.gmap} ];then \
        {params.time} -v GLIMPSE2_split_reference \
        --keep-monomorphic-ref-sites \
        --reference {input.refvcf} \
        --map '{params.gmap}' \
        --input-region {params.irg} \
        --output-region {params.org} \
        --output {output} \
        --threads 4 && \
        mv {output}_*.bin {output} \
    ; else \
        {params.time} -v GLIMPSE2_split_reference \
        --keep-monomorphic-ref-sites \
        --reference {input.refvcf} \
        --input-region {params.irg} \
        --output-region {params.org} \
        --output {output} \
        --threads 4 && \
        mv {output}_*.bin {output} \
    ; fi \
    ) &> {log}
    """

SnakeMake From line 19 of rules/glimpse.smk

shell:
    """
    (
        {params.time} -v GLIMPSE2_phase \
        --bam-list {input.bams} \
        --reference {input.refbin} \
        --burnin {params.burnin} \
        --main {params.main} \
        --pbwt-depth {params.pbwtL} \
        --pbwt-modulo {params.pbwtS} \
        --ne {params.ne} \
        --output {output} \
    ) &> {log}
    """

SnakeMake From line 78 of rules/glimpse.smk

shell:
    """
    echo {input} | tr ' ' '\\n' > {output.lst}
    GLIMPSE2_ligate --input {output.lst} --output {output.tmp} --threads 2 && \
    awk 'NR>1 {{ print $1 }}' {params.sample} > {output.sample} && \
    bcftools reheader -s {output.sample} -o {output.vcf} {output.tmp} && \
    bcftools index -f {output.vcf}
    """

SnakeMake BCFtools From line 135 of rules/glimpse.smk

shell:
    """
    (
    {params.time} -v bcftools mpileup -q {params.bq} -Q {params.mq} -f {params.fasta} -I -E -A -a 'FORMAT/DP' -r {wildcards.chrom} -T {input.sites[0]} -b {input.bams} -Ou | \
        bcftools call -Aim -C alleles -T {input.tsv[0]} -Ob -o {output.vcf} && bcftools index -f {output.vcf} \
    ) &> {log}
    """

SnakeMake BCFtools From line 173 of rules/glimpse.smk

shell:
    """
    (
    if [ -s {params.gmap} ];then \
        {params.time} -v GLIMPSE_phase \
        --input {input.glvcf} \
        --reference {input.refvcf} \
        --map '{params.gmap}' \
        --input-region {params.irg} \
        --output-region {params.org} \
        --burnin {params.burnin} \
        --main {params.main} \
        --pbwt-depth {params.pbwtL} \
        --pbwt-modulo {params.pbwtS} \
        --ne {params.ne} \
        --output {output} && \
        bcftools index -f {output} \
    ; else \
        {params.time} -v GLIMPSE_phase \
        --input {input.glvcf} \
        --reference {input.refvcf} \
        --input-region {params.irg} \
        --output-region {params.org} \
        --burnin {params.burnin} \
        --main {params.main} \
        --pbwt-depth {params.pbwtL} \
        --pbwt-modulo {params.pbwtS} \
        --ne {params.ne} \
        --output {output} && \
        bcftools index -f {output} \
    ; fi \
    ) &> {log}
    """

SnakeMake BCFtools From line 215 of rules/glimpse.smk

shell:
    """
    echo {input} | tr ' ' '\\n' > {output.lst}
    GLIMPSE_ligate --input {output.lst} --output {output.vcf} && bcftools index -f {output.vcf}
    """

SnakeMake BCFtools From line 273 of rules/glimpse.smk

shell:
    """
    (
    if [ -s {params.gmap} ];then \
    {params.time} -v QUILT_prepare_reference.R \
        --genetic_map_file='{params.gmap}' \
        --reference_vcf_file={input.vcf} \
        --chr={wildcards.chrom} \
        --regionStart={wildcards.start} \
        --regionEnd={wildcards.end} \
        --buffer={params.buffer} \
        --nGen={params.nGen} \
        --use_hapMatcherR={params.lowram} \
        --use_mspbwt=FALSE \
        --impute_rare_common={params.impute_rare_common} \
        --rare_af_threshold={params.rare_af_threshold} \
        --outputdir={params.outdir} \
    ; else \
    {params.time} -v QUILT_prepare_reference.R \
        --reference_vcf_file={input.vcf} \
        --chr={wildcards.chrom} \
        --regionStart={wildcards.start} \
        --regionEnd={wildcards.end} \
        --buffer={params.buffer} \
        --use_hapMatcherR={params.lowram} \
        --nGen={params.nGen} \
        --use_mspbwt=FALSE \
        --impute_rare_common={params.impute_rare_common} \
        --rare_af_threshold={params.rare_af_threshold} \
        --outputdir={params.outdir} \
    ; fi
    ) &> {log}
    """

SnakeMake From line 36 of rules/quilt.smk

shell:
    """
    (
    if [ -s {params.gmap} ];then \
    {params.time} -v QUILT_prepare_reference.R \
        --genetic_map_file='{params.gmap}' \
        --reference_vcf_file={input.vcf} \
        --chr={wildcards.chrom} \
        --regionStart={wildcards.start} \
        --regionEnd={wildcards.end} \
        --use_hapMatcherR={params.lowram} \
        --buffer={params.buffer} \
        --nGen={params.nGen} \
        --use_mspbwt=TRUE \
        --impute_rare_common={params.impute_rare_common} \
        --rare_af_threshold={params.rare_af_threshold} \
        --mspbwt_nindices={params.nindices} \
        --outputdir={params.outdir} \
    ; else \
    {params.time} -v QUILT_prepare_reference.R \
        --reference_vcf_file={input.vcf} \
        --chr={wildcards.chrom} \
        --regionStart={wildcards.start} \
        --regionEnd={wildcards.end} \
        --buffer={params.buffer} \
        --use_hapMatcherR={params.lowram} \
        --nGen={params.nGen} \
        --use_mspbwt=TRUE \
        --rare_af_threshold={params.rare_af_threshold} \
        --impute_rare_common={params.impute_rare_common} \
        --mspbwt_nindices={params.nindices} \
        --outputdir={params.outdir} \
    ; fi \
    ) &> {log}
    """

SnakeMake From line 106 of rules/quilt.smk

shell:
    """
    (
    if [ -s {params.gmap} ];then \
    {params.time} -v QUILT_prepare_reference.R \
        --genetic_map_file='{params.gmap}' \
        --reference_vcf_file={input.vcf} \
        --chr={wildcards.chrom} \
        --regionStart={wildcards.start} \
        --regionEnd={wildcards.end} \
        --buffer={params.buffer} \
        --use_hapMatcherR={params.lowram} \
        --impute_rare_common={params.impute_rare_common} \
        --rare_af_threshold={params.rare_af_threshold} \
        --nGen={params.nGen} \
        --use_zilong=TRUE \
        --use_mspbwt=FALSE \
        --mspbwt_nindices={params.nindices} \
        --mspbwtB={params.mspbwtB} \
        --outputdir={params.outdir} \
    ; else \
    {params.time} -v QUILT_prepare_reference.R \
        --reference_vcf_file={input.vcf} \
        --chr={wildcards.chrom} \
        --regionStart={wildcards.start} \
        --regionEnd={wildcards.end} \
        --buffer={params.buffer} \
        --use_hapMatcherR={params.lowram} \
        --impute_rare_common={params.impute_rare_common} \
        --rare_af_threshold={params.rare_af_threshold} \
        --nGen={params.nGen} \
        --use_zilong=TRUE \
        --use_mspbwt=FALSE \
        --mspbwt_nindices={params.nindices} \
        --mspbwtB={params.mspbwtB} \
        --outputdir={params.outdir} \
    ; fi \
    ) &> {log}
    """

SnakeMake From line 179 of rules/quilt.smk

shell:
    """
    {params.time} -v QUILT.R \
        --reference_vcf_file={input.vcf} \
        --prepared_reference_filename={input.rdata} \
        --bamlist={input.bams} \
        --chr={wildcards.chrom} \
        --regionStart={wildcards.start} \
        --regionEnd={wildcards.end} \
        --buffer={params.buffer} \
        --nGen={params.nGen} \
        --zilong=FALSE \
        --use_mspbwt=FALSE \
        --Ksubset={params.Ksubset} \
        --Knew={params.Ksubset} \
        --nGibbsSamples={params.nGibbsSamples} \
        --use_hapMatcherR={params.lowram} \
        --impute_rare_common={params.impute_rare_common} \
        --rare_af_threshold={params.rare_af_threshold} \
        --n_seek_its={params.n_seek_its} \
        --n_burn_in_seek_its={params.n_burnin_its} \
        --small_ref_panel_block_gibbs_iterations='{params.block_gibbs}' \
        --small_ref_panel_gibbs_iterations={params.gibbs_iters} \
        --output_filename={output} &> {log}
    """

SnakeMake From line 258 of rules/quilt.smk

shell:
    """
    ( \
       echo {input} | tr ' ' '\n' > {output.lst} && \
       bcftools concat --file-list {output.lst} --output-type b --threads 4 -o {output.vcf} && \
       bcftools index -f {output.vcf} \
    ) &> {log}
    """

SnakeMake BCFtools From line 314 of rules/quilt.smk

shell:
    """
    {params.time} -v QUILT.R \
        --reference_vcf_file={input.vcf} \
        --prepared_reference_filename={input.rdata} \
        --bamlist={input.bams} \
        --use_hapMatcherR={params.lowram} \
        --impute_rare_common={params.impute_rare_common} \
        --chr={wildcards.chrom} \
        --regionStart={wildcards.start} \
        --regionEnd={wildcards.end} \
        --buffer={params.buffer} \
        --nGen={params.nGen} \
        --zilong=FALSE \
        --use_mspbwt=TRUE \
        --Ksubset={params.Ksubset} \
        --Knew={params.Ksubset} \
        --nGibbsSamples={params.nGibbsSamples} \
        --n_seek_its={params.n_seek_its} \
        --n_burn_in_seek_its={params.n_burnin_its} \
        --rare_af_threshold={params.rare_af_threshold} \
        --small_ref_panel_block_gibbs_iterations='{params.block_gibbs}' \
        --small_ref_panel_gibbs_iterations={params.gibbs_iters} \
        --output_filename={output} &> {log}
    """

SnakeMake From line 362 of rules/quilt.smk

shell:
    """
    ( \
       echo {input} | tr ' ' '\n' > {output.lst} && \
       bcftools concat --file-list {output.lst} --output-type b --threads 4 -o {output.vcf} && \
       bcftools index -f {output.vcf} \
    ) &> {log}
    """

SnakeMake BCFtools From line 418 of rules/quilt.smk

shell:
    """
    {params.time} -v QUILT.R \
        --reference_vcf_file={input.vcf} \
        --prepared_reference_filename={input.rdata} \
        --bamlist={input.bams} \
        --use_hapMatcherR={params.lowram} \
        --impute_rare_common={params.impute_rare_common} \
        --chr={wildcards.chrom} \
        --regionStart={wildcards.start} \
        --regionEnd={wildcards.end} \
        --buffer={params.buffer} \
        --nGen={params.nGen} \
        --mspbwtL={params.mspbwtL} \
        --mspbwtM={params.mspbwtM} \
        --zilong=TRUE \
        --use_mspbwt=FALSE \
        --Ksubset={params.Ksubset} \
        --Knew={params.Ksubset} \
        --nGibbsSamples={params.nGibbsSamples} \
        --n_seek_its={params.n_seek_its} \
        --n_burn_in_seek_its={params.n_burnin_its} \
        --rare_af_threshold={params.rare_af_threshold} \
        --small_ref_panel_block_gibbs_iterations='{params.block_gibbs}' \
        --small_ref_panel_gibbs_iterations={params.gibbs_iters} \
        --output_filename={output} &> {log}
    """

SnakeMake From line 468 of rules/quilt.smk

shell:
    """
    ( \
       echo {input} | tr ' ' '\n' > {output.lst} && \
       bcftools concat --file-list {output.lst} --output-type b --threads 4 -o {output.vcf} && \
       bcftools index -f {output.vcf} \
    ) &> {log}
    """

SnakeMake BCFtools From line 526 of rules/quilt.smk

script:
    "../scripts/subset_samples.R"

SnakeMake From line 14 of rules/refpanels.smk

shell:
    """
    ( \
        bcftools view -v snps -m2 -M2 --samples-file {input} --threads 4 {params.vcf}| bcftools norm - -d snps -Ob -o {output.vcf} --threads 4 && bcftools index -f {output.vcf} && \
        touch -m {output.vcf}.csi && \
        bcftools view -G {output.vcf} -Oz -o {output.sites} --threads 4 && tabix -f {output.sites} && \
        bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' {output.sites} | bgzip -c > {output.tsv} && tabix -s1 -b2 -e2 {output.tsv}
    )  &> {log}
    """

SnakeMake BCFtools tabix From line 36 of rules/refpanels.smk

shell:
    """
    ( \
        bcftools view -v snps -m2 -M2 --samples-file {input} --threads 4 {params.vcf} {wildcards.chrom}:{params.start}-{params.end}| bcftools norm - -d snps -Ob -o {output.vcf} --threads 4 && bcftools index -f {output.vcf} && \
        touch -m {output.vcf}.csi && \
        bcftools view -G {output.vcf} -Oz -o {output.sites} --threads 4 && tabix -f {output.sites} && \
        bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' {output.sites} | bgzip -c > {output.tsv} && tabix -s1 -b2 -e2 {output.tsv}
    )  &> {log}
    """

SnakeMake BCFtools tabix From line 83 of rules/refpanels.smk

# Writes the input paths to a list file, concatenates them with
# `bcftools concat -Da` into one bgzipped sites file, indexes it with tabix,
# and exports a tabix-indexed CHROM/POS/REF,ALT table.
# The redirection must be `&>` (stdout and stderr to the log). The previous
# `& >` put the subshell in the background and only truncated the log, so the
# command output was lost and failures were not reported to Snakemake.
shell:
    """
    ( \
        echo {input} | tr ' ' '\n' > {output.sites}.list && \
        bcftools concat -f {output.sites}.list -Da --threads 4 -Oz -o {output.sites} && tabix -f {output.sites} && \
        bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' {output.sites} | bgzip -c > {output.tsv} && tabix -s1 -b2 -e2 {output.tsv}
    ) &> {log}
    """

SnakeMake BCFtools tabix From line 120 of rules/refpanels.smk

shell:
    """
    echo {input} | tr ' ' '\\n' | xargs grep -E 'Elaps|Maximum' | awk '{{print $NF}}' | sed 'N;s/\\n/ /' > {output}
    """

SnakeMake From line 19 of rules/speed.smk

shell:
    """
    echo {input} | tr ' ' '\\n' | xargs grep -E 'Elaps|Maximum' | awk '{{print $NF}}' | sed 'N;s/\\n/ /' > {output}
    """

SnakeMake From line 42 of rules/speed.smk

shell:
    """
    echo {input} | tr ' ' '\\n' | xargs grep -E 'Elaps|Maximum' | awk '{{print $NF}}' | sed 'N;s/\\n/ /' > {output}
    """

SnakeMake From line 65 of rules/speed.smk

shell:
    """
    echo {input} | tr ' ' '\\n' | xargs grep -E 'Elaps|Maximum' | awk '{{print $NF}}' | sed 'N;s/\\n/ /' > {output}
    """

SnakeMake From line 88 of rules/speed.smk

shell:
    """
    echo {input} | tr ' ' '\\n' | xargs grep -E 'Elaps|Maximum' | awk '{{print $NF}}' | sed 'N;s/\\n/ /' > {output}
    """

SnakeMake From line 111 of rules/speed.smk

script:
    "../scripts/speed_single.R"

SnakeMake From line 137 of rules/speed.smk

script:
    "../scripts/speed_single.R"

SnakeMake From line 161 of rules/speed.smk

script:
    "../scripts/speed_single.R"

SnakeMake From line 185 of rules/speed.smk

script:
    "../scripts/speed_single.R"

SnakeMake From line 205 of rules/speed.smk

script:
    "../scripts/speed_single.R"

SnakeMake From line 225 of rules/speed.smk

script:
    "../scripts/speed_panelsize.R"

SnakeMake From line 260 of rules/speed.smk

script:
    "../scripts/speed_depth.R"

SnakeMake From line 294 of rules/speed.smk

snakemake@source("common.R")


# Sequencing depths listed under "downsample" in the workflow config. The
# per-method input lists below are indexed in parallel with this vector.
groups <- as.numeric(snakemake@config[["downsample"]])

# Parse the truth table once; it provides both the values and the row ids.
truth.tbl <- read.table(snakemake@input[["truth"]])
# Column 1 is used as the row id. The remaining columns are summed in
# consecutive pairs, giving one column per pair.
df.truth <- sapply(seq(1, ncol(truth.tbl) - 1, 2), function(i) {
  rowSums(truth.tbl[, (i + 1):(i + 2)])
}) # matrix: nsnps x nsamples
rownames(df.truth) <- truth.tbl[, 1]

# Allele-frequency table, parsed once: column 1 = id, column 2 = frequency.
af.tbl <- read.table(snakemake@input[["af"]])
af <- as.numeric(af.tbl[, 2])
names(af) <- af.tbl[, 1]

# One parsed genotype table per depth and per method.
# parse.quilt.gts is not defined in this script (presumably in common.R).
dl.quilt1 <- lapply(snakemake@input[["regular"]], parse.quilt.gts)
dl.quilt2 <- lapply(snakemake@input[["zilong"]], parse.quilt.gts)
dl.glimpse1 <- lapply(snakemake@input[["glimpse1"]], parse.quilt.gts)
dl.glimpse2 <- lapply(snakemake@input[["glimpse2"]], parse.quilt.gts)

# Allele-frequency bin edges: 0, then 1-2-5 steps from 1e-4 up to 0.05,
# then 0.1 to 0.5 in steps of 0.1.
bins <- sort(unique(c(
  c(0, 0.01 / 100, 0.02 / 100, 0.05 / 100),
  c(0, 0.01 / 10, 0.02 / 10, 0.05 / 10),
  c(0, 0.01 / 1, 0.02 / 1, 0.05 / 1),
  seq(0.1, 0.5, length.out = 5)
)))

# Per-depth table of r2 by allele-frequency bin for the four methods.
# acc_r2_by_af is not defined in this script (presumably in common.R).
accuracy_by_af <- lapply(seq_along(groups), function(i) {
  d <- acc_r2_by_af(df.truth, dl.quilt2[[i]], dl.glimpse2[[i]], dl.quilt1[[i]], dl.glimpse1[[i]], af, bins)
  colnames(d) <- c("bin","QUILT2", "GLIMPSE2", "QUILT1", "GLIMPSE1")
  d
})
names(accuracy_by_af) <- paste0(as.character(groups), "x")

saveRDS(accuracy_by_af, snakemake@output[["rds"]])

## (rds <- readRDS("/maps/projects/alab/people/rlk420/quilt2/human/UKBB_GEL_CEU/bench-speed/results/summary/all.accuracy.panelsize0.chr20.rds"))

# Every figure of this script goes into one PDF stored next to the RDS output.
pdf(paste0(snakemake@output[["rds"]], ".pdf"), width = 12, height = 6)

# The AF bins with a non-NA value in the first method column of the first
# depth define the x positions (log10 of the `bin` column) and the tick labels
# (bin edge in percent) shared by every panel.
a1 <- accuracy_by_af[[1]]
has.value <- !sapply(a1[, 2], is.na) # remove AF bin with NULL results
x <- log10(as.numeric(a1$bin[has.value]))
labels <- (100 * bins[-1])[has.value]
ymin <- min(sapply(accuracy_by_af, function(d) {
  m <- as.matrix(apply(d[, -1], 2, unlist))
  min(m, na.rm = TRUE)
}))
nd <- length(groups)
method.names <- c("QUILT2", "GLIMPSE2", "QUILT1", "GLIMPSE1")

# Open an empty panel over the shared x range; extra arguments go to plot().
new_af_panel <- function(ylim, ...) {
  plot(1, col = "transparent", axes = FALSE, xlim = c(min(x), max(x)), ylim = ylim, ylab = "Aggregated R2 within each AF bin", xlab = "Allele Frequency", ...)
}

# Draw one curve per method and depth; the line width grows with the depth
# index. rmna and mycols are not defined in this script (presumably common.R).
# https://stackoverflow.com/questions/33004238/r-removing-null-elements-from-a-list
add_method_lines <- function(acc) {
  for (i in seq_len(nd)) {
    d <- acc[[i]]
    for (m in method.names) {
      lines(x, rmna(d[[m]]), type = "l", lwd = i / nd * 2.5, pch = 1, col = mycols[m])
    }
  }
}

# Legend mapping line width to sequencing depth.
add_depth_legend <- function() {
  legend("bottomright", legend = paste0(groups, "x"), lwd = seq_len(nd) * 2.5 / nd, bty = "n")
}

par(mfrow = c(1, 2))

# Left panel: y axis starts at 0.9 * the smallest observed r2.
new_af_panel(c(0.9 * ymin, 1.0))
add_method_lines(accuracy_by_af)
axis(side = 1, at = x, labels = labels)
axis(side = 2, at = seq(0, 1, 0.2))
add_depth_legend()

# Right panel: same curves with the y axis restricted to [0.90, 1].
new_af_panel(c(0.90, 1.0))
add_method_lines(accuracy_by_af)
axis(side = 1, at = x, labels = labels)
axis(side = 2)
legend("bottomleft", legend = c("QUILT2", "GLIMPSE2", "QUILT1", "GLIMPSE1"), col = mycols, pch = 1, lwd = 1.5, cex = 1.0, xjust = 0, yjust = 1, bty = "n")


## chunkfile <- "/maps/projects/alab/people/rlk420/quilt2/human/HRC_CEU/quilt-rare-common/results/refpanels/chr20.glimpse.chunks"
# Per-chunk accuracy. Column 4 of the chunk file is parsed as
# "<anything>:<start>-<end>"; the text after the last ":" gives the bounds.
chunkfile <- snakemake@params[["chunks"]]
chunk.names <- read.table(chunkfile)[,4]
chunk <- lapply(strsplit(gsub(".*:","",chunk.names),"-"), as.integer)
# The position is the second ":"-separated field of each AF id.
pos <- as.integer(vapply(strsplit(names(af), ":"), "[[", character(1), 2))
# Sites strictly inside each chunk (both bounds are excluded).
chunk_af <- lapply(chunk, function(bounds) {
  af[which(pos > bounds[1] & pos < bounds[2])]
})
names(chunk_af) <- chunk.names

accuracy_by_af_chunk <- lapply(chunk_af, function(af.sub) {
  per.depth <- lapply(seq_along(groups), function(i) {
    d <- acc_r2_by_af(df.truth, dl.quilt2[[i]], dl.glimpse2[[i]], dl.quilt1[[i]], dl.glimpse1[[i]], af.sub, bins)
    colnames(d) <- c("bin","QUILT2", "GLIMPSE2", "QUILT1", "GLIMPSE1")
    d
  })
  names(per.depth) <- paste0(as.character(groups), "x")
  per.depth
})

# One panel per chunk, two panels per page; the title shows the chunk name and
# the number of sites in it.
# NOTE(review): x comes from the whole-region result. If rmna() drops NA bins
# and a chunk has a different set of NA bins, x and y would differ in length
# inside lines() -- confirm against rmna in common.R.
for (k in seq_along(chunk.names)) {
  if (k %% 2 == 1) par(mfrow = c(1, 2))
  panel.title <- paste(names(chunk_af)[k], "#", length(chunk_af[[k]]))
  new_af_panel(c(0, 1.0), main = panel.title)
  add_method_lines(accuracy_by_af_chunk[[k]])
  axis(side = 1, at = x, labels = labels)
  axis(side = 2, at = seq(0, 1, 0.2))
  add_depth_legend()
}

dev.off()

R From line 2 of scripts/accuracy_depth.R

snakemake@source("common.R")


# Number of samples in the reference VCF, counted with `bcftools query -l`.
refsize0 <- as.integer(system(paste("bcftools query -l", snakemake@params$vcf, "|", "wc", "-l"), intern = TRUE))
# Panel sizes from the config; an entry of 0 is replaced by the sample count of
# the full panel.
groups <- as.numeric(snakemake@config[["refsize"]])
groups[groups == 0] <- refsize0
# Doubled before plotting -- presumably samples to haplotypes; TODO confirm.
groups <- groups * 2
nd <- length(groups)

# Parse the truth table once; it provides both the values and the row ids.
truth.tbl <- read.table(snakemake@input[["truth"]])
# Column 1 is used as the row id. The remaining columns are summed in
# consecutive pairs, giving one column per pair.
df.truth <- sapply(seq(1, ncol(truth.tbl) - 1, 2), function(i) {
  rowSums(truth.tbl[, (i + 1):(i + 2)])
}) # matrix: nsnps x nsamples
rownames(df.truth) <- truth.tbl[, 1]

# Allele-frequency table, parsed once: column 1 = id, column 2 = frequency.
af.tbl <- read.table(snakemake@input[["af"]])
af <- as.numeric(af.tbl[, 2])
names(af) <- af.tbl[, 1]

# One parsed genotype table per panel size and per method.
# parse.quilt.gts is not defined in this script (presumably in common.R).
dl.quilt1 <- lapply(snakemake@input[["regular"]], parse.quilt.gts)
dl.quilt2 <- lapply(snakemake@input[["zilong"]], parse.quilt.gts)
dl.glimpse1 <- lapply(snakemake@input[["glimpse1"]], parse.quilt.gts)
dl.glimpse2 <- lapply(snakemake@input[["glimpse2"]], parse.quilt.gts)

# Allele-frequency bin edges: 0, then 1-2-5 steps from 1e-4 up to 0.05,
# then 0.1 to 0.5 in steps of 0.1.
bins <- sort(unique(c(
  c(0, 0.01 / 100, 0.02 / 100, 0.05 / 100),
  c(0, 0.01 / 10, 0.02 / 10, 0.05 / 10),
  c(0, 0.01 / 1, 0.02 / 1, 0.05 / 1),
  seq(0.1, 0.5, length.out = 5)
)))

# Per-panel-size table of r2 by allele-frequency bin for the four methods.
# acc_r2_by_af is not defined in this script (presumably in common.R).
accuracy_by_af <- lapply(seq_along(groups), function(i) {
  d <- acc_r2_by_af(df.truth, dl.quilt2[[i]], dl.glimpse2[[i]], dl.quilt1[[i]], dl.glimpse1[[i]], af, bins)
  colnames(d) <- c("bin","QUILT2", "GLIMPSE2", "QUILT1", "GLIMPSE1")
  d
})

names(accuracy_by_af) <- paste0("refsize", as.character(groups))
saveRDS(accuracy_by_af, snakemake@output[["rds"]])

# Colour palette; the first four entries are used for the four methods.
wong <- c("#e69f00", "#d55e00", "#56b4e9", "#cc79a7", "#009e73", "#0072b2", "#f0e442")
mycols <- wong

# Portrait PDF stored next to the RDS output, two stacked panels.
pdf(paste0(snakemake@output[["rds"]], ".pdf"), width = 6, height = 12)

# The AF bins with a non-NA value in the first method column of the first
# panel size define the x positions (log10 of the `bin` column) and the tick
# labels (bin edge in percent).
a1 <- accuracy_by_af[[1]]
has.value <- !sapply(a1[, 2], is.na)
x <- log10(as.numeric(a1$bin[has.value]))
labels <- (100 * bins[-1])[has.value]
ymin <- min(sapply(accuracy_by_af, function(d) {
  m <- as.matrix(apply(d[, -1], 2, unlist))
  min(m, na.rm = TRUE)
}))
nd <- length(groups)
method.names <- c("QUILT2", "GLIMPSE2", "QUILT1", "GLIMPSE1")

# Open an empty panel over the shared x range with the given y range.
new_af_panel <- function(ylim) {
  plot(1, col = "transparent", axes = FALSE, xlim = c(min(x), max(x)), ylim = ylim, ylab = "Aggregated R2 within each AF bin", xlab = "Allele Frequency")
}

# Draw one curve per method and panel size. The colour identifies the method
# and the line type identifies the panel size (last size = lty 1).
# rmna is not defined in this script (presumably in common.R).
add_method_lines <- function(acc) {
  for (i in seq_len(nd)) {
    d <- acc[[i]]
    for (j in seq_along(method.names)) {
      lines(x, rmna(d[[method.names[j]]]), type = "l", lty = nd - i + 1, pch = 1, col = mycols[j])
    }
  }
}

par(mfrow = c(2, 1))

# Top panel: y axis restricted to [0.90, 1], with the method legend.
new_af_panel(c(0.90, 1.0))
add_method_lines(accuracy_by_af)
axis(side = 1, at = x, labels = labels)
axis(side = 2)
legend("bottomleft", legend = c("QUILT2", "GLIMPSE2", "QUILT1", "GLIMPSE1"), col = mycols, pch = 1, lwd = 1.5, cex = 1.0, xjust = 0, yjust = 1, bty = "n")

# Bottom panel: y axis from 0.9 * the smallest observed r2, with the
# panel-size legend.
new_af_panel(c(0.9 * ymin, 1.0))
add_method_lines(accuracy_by_af)
axis(side = 1, at = x, labels = labels)
axis(side = 2, at = seq(0, 1, 0.2))
legend("bottomright", legend = paste0("N=", groups), lty = nd:1, bty = "n")

dev.off()

R BCFtools From line 2 of scripts/accuracy_panelsize.R

snakemake@source("common.R")

# Squared Pearson correlation between the truth matrix `d0` and each of the
# three test matrices `d1`, `d2`, `d3`, restricted to the row names present in
# all four. Each matrix is flattened to a vector before correlating.
# Returns an unnamed numeric vector of length 3, in the order d1, d2, d3.
acc_r2_all <- function(d0, d1, d2, d3) {
  shared <- Reduce(intersect, list(rownames(d0), rownames(d1), rownames(d2), rownames(d3)))
  truth <- as.vector(d0[shared, ])
  vapply(list(d1, d2, d3), function(test) {
    cor(truth, as.vector(test[shared, ]), use = "pairwise.complete")^2
  }, numeric(1))
}

# r2 per allele-frequency bin for three test sets against the truth `d0`.
# Only ids present in the row names of all four matrices and in names(af) are
# passed to r2_by_freq (not defined in this script; presumably in common.R).
# Returns a data frame with columns bin (= bins[-1]), regular, mspbwt and
# zilong, each taken from the "simple" column of the r2_by_freq result.
local_r2_by_af <- function(d0, d1, d2, d3, af, bins) {
  shared <- Reduce(
    intersect,
    list(rownames(d0), rownames(d1), rownames(d2), rownames(d3), names(af))
  )
  simple_r2 <- function(test) {
    r2_by_freq(breaks = bins, af, truthG = d0, testDS = test, which_snps = shared)[, "simple"]
  }
  as.data.frame(cbind(
    bin = bins[-1],
    regular = simple_r2(d1),
    mspbwt = simple_r2(d2),
    zilong = simple_r2(d3)
  ))
}

# Sequencing depths listed under "downsample" in the workflow config. The
# per-method input lists below are indexed in parallel with this vector.
groups <- as.numeric(snakemake@config[["downsample"]])

# Parse the truth table once; it provides both the values and the row ids.
truth.tbl <- read.table(snakemake@input[["truth"]])
# Column 1 is used as the row id. The remaining columns are summed in
# consecutive pairs, giving one column per pair.
df.truth <- sapply(seq(1, ncol(truth.tbl) - 1, 2), function(i) {
  rowSums(truth.tbl[, (i + 1):(i + 2)])
}) # matrix: nsnps x nsamples
rownames(df.truth) <- truth.tbl[, 1]

# Allele-frequency table, parsed once: column 1 = id, column 2 = frequency.
af.tbl <- read.table(snakemake@input[["af"]])
af <- as.numeric(af.tbl[, 2])
names(af) <- af.tbl[, 1]

# One parsed genotype table per depth for each of the three QUILT modes.
# parse.quilt.gts is not defined in this script (presumably in common.R).
dl.regular <- lapply(snakemake@input[["regular"]], parse.quilt.gts)
dl.mspbwt <- lapply(snakemake@input[["mspbwt"]], parse.quilt.gts)
dl.zilong <- lapply(snakemake@input[["zilong"]], parse.quilt.gts)

# Allele-frequency bin edges: 0, then 1-2-5 steps from 1e-4 up to 0.05,
# then 0.1 to 0.5 in steps of 0.1.
bins <- sort(unique(c(
  c(0, 0.01 / 100, 0.02 / 100, 0.05 / 100),
  c(0, 0.01 / 10, 0.02 / 10, 0.05 / 10),
  c(0, 0.01 / 1, 0.02 / 1, 0.05 / 1),
  seq(0.1, 0.5, length.out = 5)
)))

# Overall r2: one row per mode (regular, mspbwt, zilong), one column per depth.
accuracy <- matrix(vapply(seq_along(groups), function(i) {
  acc_r2_all(df.truth, dl.regular[[i]], dl.mspbwt[[i]], dl.zilong[[i]])
}, numeric(3)), ncol = length(groups))


# Per-depth table of r2 by allele-frequency bin for the three modes.
accuracy_by_af <- lapply(seq_along(groups), function(i) {
  d <- local_r2_by_af(df.truth, dl.regular[[i]], dl.mspbwt[[i]], dl.zilong[[i]], af, bins)
  colnames(d) <- c("bin", "regular", "mspbwt", "zilong" )
  d
})
names(accuracy_by_af) <- paste0(as.character(groups), "x")
saveRDS(accuracy_by_af, snakemake@output[["rds"]])

## accuracy_by_af <- readRDS("/maps/projects/alab/people/rlk420/quilt2/human/HRC_CEU/quilt-rare-common/results/summary/quilt.accuracy.panelsize0.chr20.rds" )

# Colour palette; the first three entries identify regular, mspbwt and zilong.
wong <- c("#e69f00", "#d55e00", "#56b4e9", "#cc79a7", "#009e73", "#0072b2", "#f0e442")
method.cols <- wong[1:3]

pdf(paste0(snakemake@output[["rds"]], ".pdf"), width = 12, height = 6)

par(mfrow = c(1, 2))

# Left panel: overall r2 against sequencing depth, one curve per mode.
plot(groups, accuracy[1, ], type = "b", lwd = 1.0, pch = 1, col = method.cols[1], ylab = "Aggregated R2 for the chromosome", xlab = "Samples sequencing depth", ylim = c(0.9 * min(accuracy), 1.0))
for (j in 2:3) {
  lines(groups, accuracy[j, ], type = "b", lwd = 1.0, pch = 1, col = method.cols[j])
}
# The legend uses the same colours as the curves above.
legend("bottomright", legend = c("QUILT-regular", "QUILT-mspbwt", "QUILT-zilong"), col = method.cols, pch = 1, lwd = 1.5, cex = 1.1, xjust = 0, yjust = 1, bty = "n")

# The AF bins with a non-NA value in the "regular" column of the first depth
# define the x positions (log10 of the bin edge) and the tick labels (percent).
a1 <- accuracy_by_af[[1]]
has.value <- !sapply(a1[, 2], is.na) # remove AF bin with NULL results
x <- log10(as.numeric(a1$bin[has.value]))
labels <- (100 * bins[-1])[has.value]

# Right panel: r2 per bin. The colour identifies the mode and the line type
# identifies the depth (last depth = lty 1).
plot(1, col = "transparent", axes = FALSE, xlim = c(min(x), max(x)), ylim = c(0, 1.0), ylab = "Aggregated R2 within each MAF bin", xlab = "Minor Allele Frequency")

nd <- length(groups)
method.names <- c("regular", "mspbwt", "zilong")
for (i in seq_len(nd)) {
  d <- accuracy_by_af[[i]]
  for (j in seq_along(method.names)) {
    # rmna is not defined in this script (presumably in common.R).
    lines(x, rmna(d[[method.names[j]]]), type = "l", lty = nd - i + 1, pch = 1, col = method.cols[j])
  }
}
axis(side = 1, at = x, labels = labels)
axis(side = 2)

# Depths are encoded by line type in the loop above, so the legend shows line
# types rather than line widths.
legend("bottomright", legend = paste0(groups, "x"), lty = nd:1, bty = "n")

dev.off()

R From line 2 of scripts/accuracy_quilt.R

snakemake@source("common.R")

# Squared Pearson correlation between the truth matrix `d0` and the test
# matrix `d1`, restricted to the row names they share. Both matrices are
# flattened to vectors before correlating.
# Returns a single numeric value, visibly: the previous body ended in an
# assignment to an unused local, so the result was returned invisibly.
acc_r2_all <- function(d0, d1) {
  id <- intersect(rownames(d0), rownames(d1))
  cor(as.vector(d0[id, ]), as.vector(d1[id, ]), use = "pairwise.complete")^2
}

# r2 per allele-frequency bin for one test set against the truth `d0`, using
# the row names the two matrices share. r2_by_freq is not defined in this
# script (presumably in common.R).
# Returns a data frame with columns bin (= bins[-1]), single and orphan.
# NOTE(review): `orphan` is filled with the same values as `single` -- confirm
# whether this is intentional.
acc_r2_by_af <- function(d0, d1, af, bins) {
  shared <- intersect(rownames(d0), rownames(d1))
  simple <- r2_by_freq(
    breaks = bins, af[shared],
    truthG = d0[shared, ], testDS = d1[shared, ]
  )[, "simple"]
  as.data.frame(cbind(bin = bins[-1], single = simple, orphan = simple))
}

# Sequencing depths listed under "downsample" in the workflow config. The
# input list below is indexed in parallel with this vector.
groups <- as.numeric(snakemake@config[["downsample"]])

# Parse the truth table once; it provides both the values and the row ids.
truth.tbl <- read.table(snakemake@input[["truth"]])
# Column 1 is used as the row id. The remaining columns are summed in
# consecutive pairs, giving one column per pair.
df.truth <- sapply(seq(1, ncol(truth.tbl) - 1, 2), function(i) {
  rowSums(truth.tbl[, (i + 1):(i + 2)])
}) # matrix: nsnps x nsamples
rownames(df.truth) <- truth.tbl[, 1]

# Allele-frequency table, parsed once: column 1 = id, column 2 = frequency.
af.tbl <- read.table(snakemake@input[["af"]])
af <- as.numeric(af.tbl[, 2])
names(af) <- af.tbl[, 1]

## SNPs with (1-af) > 0.0005 & (1-af) < 0.001 are all imputed hom ALT and truth hom ALT, but those are trivially easy to impute and don't tell you anything
## af <- ifelse(af>0.5, 1-af, af)

# One parsed genotype table per depth.
# parse.quilt.gts is not defined in this script (presumably in common.R).
dl.single <- lapply(snakemake@input[["single"]], parse.quilt.gts)

# Allele-frequency bin edges: 0, then 1-2-5 steps from 1e-4 up to 0.05,
# then 0.1 to 0.5 in steps of 0.1.
bins <- sort(unique(c(
  c(0, 0.01 / 100, 0.02 / 100, 0.05 / 100),
  c(0, 0.01 / 10, 0.02 / 10, 0.05 / 10),
  c(0, 0.01 / 1, 0.02 / 1, 0.05 / 1),
  seq(0.1, 0.5, length.out = 5)
)))

# Overall r2 as a 1 x length(groups) matrix.
accuracy <- matrix(vapply(seq_along(groups), function(i) {
  acc_r2_all(df.truth, dl.single[[i]])
}, numeric(1)), ncol = length(groups))

# Per-depth table of r2 by allele-frequency bin.
accuracy_by_af <- lapply(seq_along(groups), function(i) {
  acc_r2_by_af(df.truth, dl.single[[i]], af, bins)
})
saveRDS(accuracy_by_af, snakemake@output[["rds"]])

wong <- c("#e69f00", "#d55e00", "#56b4e9", "#cc79a7", "#009e73", "#0072b2", "#f0e442")
mycols <- wong[1:4]

# Two side-by-side panels written to the PDF output.
pdf(snakemake@output[["pdf"]], width = 12, height = 6)
par(mfrow = c(1, 2))

# Panel 1: overall r^2 against sequencing depth.
plot(groups, accuracy[1, ], type = "b", lwd = 1.0, pch = 1, col = mycols[1], ylab = "Aggregated R2 for the chromosome", xlab = "Samples sequencing depth", ylim = c(0.9 * min(accuracy), 1.0))
legend("bottomright", legend = c(snakemake@params[["N"]]), col = mycols, pch = 1, lwd = 1.5, cex = 1.1, xjust = 0, yjust = 1, bty = "n")

# Panel 2: r^2 per allele-frequency bin, one curve per depth.
# Bins whose result is NA for the first depth are dropped from the x axis.
a1 <- accuracy_by_af[[1]]
keep <- !sapply(a1[, 2], is.na) # remove AF bin with NA results
x <- log10(as.numeric(a1$bin[keep]))
# Tick labels are the bin upper bounds expressed in percent.
labels <- (100 * bins[-1])[keep]

plot(1, col = "transparent", axes = FALSE, xlim = c(min(x), max(x)), ylim = c(0, 1.0), ylab = "Aggregated R2 within each MAF bin", xlab = "Minor Allele Frequency")
nd <- length(groups)
for (i in seq_len(nd)) {
  d <- accuracy_by_af[[i]]
  y <- rmna(d$single)
  # Depth i is drawn with line type nd - i + 1, matching the legend below.
  lines(x, y, type = "l", lty = nd - i + 1, pch = 1, col = mycols[1])
}
axis(side = 1, at = x, labels = labels)
axis(side = 2)
legend("bottomright", legend = paste0(groups, "x"), lty = nd:1, bty = "n")

dev.off()

R From line 2 of scripts/accuracy_single.R

# Convert colon-separated durations ("h:mm:ss", "m:ss" or "s") to seconds.
#
# ss: character vector (or factor) of durations; fields are read right to
#     left as seconds, minutes, hours, i.e. field k from the right is
#     weighted by 60^(k - 1). Fractional fields are allowed.
# Returns a numeric vector with one value per element of `ss`
# (numeric(0) for empty input).
gettimes <- function(ss) {
  # strsplit() only accepts character input; a column read as a factor would
  # otherwise fail.
  if (is.factor(ss)) {
    ss <- as.character(ss)
  }
  vapply(strsplit(ss, ":"), function(s) {
    s <- as.numeric(s)
    sum(s * 60^(rev(seq_along(s)) - 1))
  }, numeric(1))
}

# Total run time per benchmark table.
#
# dl: list of tables whose first column holds colon-separated durations.
# Returns a numeric vector (named like `dl`) with the summed duration of each
# table in seconds; numeric(0) for an empty list.
gnutime <- function(dl) {
  vapply(dl, function(d) {
    sum(gettimes(d[, 1]))
  }, numeric(1))
}

# Peak memory per benchmark table.
#
# dl: list of tables whose second column holds memory readings.
# Returns a numeric vector (named like `dl`) with the maximum of column 2
# divided by 1024; numeric(0) for an empty list.
# NOTE(review): the result is labelled MB, so the input is presumably in KB --
# confirm against the benchmark log format.
gunram <- function(dl) {
  vapply(dl, function(d) {
    max(d[, 2]) / 1024 # MB units
  }, numeric(1))
}

# Sequencing depths; one benchmark table per depth and per method.
groups <- as.numeric(snakemake@config[["downsample"]])

dl.regular <- lapply(snakemake@input[["regular"]], read.table)
dl.zilong <- lapply(snakemake@input[["zilong"]], read.table)
dl.glimpse1 <- lapply(snakemake@input[["glimpse1"]], read.table)
dl.glimpse2 <- lapply(snakemake@input[["glimpse2"]], read.table)
# Collect the tables per method and label each one with its depth.
rds <- list(QUILT2 = dl.zilong, GLIMPSE2 = dl.glimpse2, QUILT1 = dl.regular, GLIMPSE1 = dl.glimpse1)
rds <- lapply(rds, function(l) {
  names(l) <- paste0("depth=", groups, "x")
  l
})

saveRDS(rds, snakemake@output[["rds"]])
rds <- readRDS(snakemake@output[["rds"]])


# Rows are depths, columns are methods. Times are in seconds, memory in the
# unit returned by gunram().
times <- data.frame(QUILT2 = gnutime(rds$QUILT2), GLIMPSE2 = gnutime(rds$GLIMPSE2), QUILT1 = gnutime(rds$QUILT1), GLIMPSE1 = gnutime(rds$GLIMPSE1))
rownames(times) <- groups
rams <- data.frame(QUILT2 = gunram(rds$QUILT2), GLIMPSE2 = gunram(rds$GLIMPSE2), QUILT1 = gunram(rds$QUILT1), GLIMPSE1 = gunram(rds$GLIMPSE1))
rownames(rams) <- groups

# Install the colours as the palette so that `col = 1:4` picks the first four.
mycols <- c("#e69f00", "#d55e00", "#56b4e9", "#cc79a7", "#009e73", "#0072b2", "#f0e442")
palette(mycols)

# The PDF is written next to the RDS output, with ".pdf" appended to its path.
pdf(paste0(snakemake@output[["rds"]], ".pdf"), width = 12, height = 6)
par(mfrow = c(1, 2))
# Transposed so that each depth is a group of bars, one bar per method.
barplot(t(times) / 60, beside = TRUE, col = 1:4, ylab = "Runtime in Minutes", xlab = "Sequencing depth")
legend("topleft", legend = colnames(times), fill = 1:4)
barplot(t(rams) / 1024, beside = TRUE, col = 1:4, ylab = "Maximum RAM in GBs", xlab = "Sequencing depth")
dev.off()

R From line 1 of scripts/speed_depth.R

# Parse colon-separated durations into seconds.
#
# ss: character vector (or factor) such as "1:02:03", "2:03" or "5". The
#     rightmost field counts as seconds and each field to its left is worth
#     60 times more. Fractional fields are allowed.
# Returns one numeric value per element of `ss`; numeric(0) for empty input.
gettimes <- function(ss) {
  # strsplit() rejects factors, so convert them first.
  if (is.factor(ss)) {
    ss <- as.character(ss)
  }
  vapply(strsplit(ss, ":"), function(s) {
    s <- as.numeric(s)
    sum(s * 60^(rev(seq_along(s)) - 1))
  }, numeric(1))
}

# Sum the durations in the first column of every table in `dl`.
#
# dl: list of tables; column 1 holds colon-separated durations.
# Returns a numeric vector of totals in seconds, named like `dl`;
# numeric(0) for an empty list.
gnutime <- function(dl) {
  vapply(dl, function(d) {
    sum(gettimes(d[, 1]))
  }, numeric(1))
}

# Largest value of the second column of every table in `dl`, divided by 1024.
#
# dl: list of tables; column 2 holds memory readings.
# Returns a numeric vector named like `dl`; numeric(0) for an empty list.
# NOTE(review): the result is labelled MB, so the input is presumably in KB --
# confirm against the benchmark log format.
gunram <- function(dl) {
  vapply(dl, function(d) {
    max(d[, 2]) / 1024 # MB units
  }, numeric(1))
}

## saveRDS(snakemake,  snakemake@output[["rds"]])
## print(refsize0)
## q()

# Number of samples in the reference VCF, counted from `bcftools query -l`.
# The path is shell-quoted so that spaces or shell metacharacters in it do not
# break the pipeline.
refsize0 <- as.integer(system(
  paste("bcftools query -l", shQuote(snakemake@params$vcf), "|", "wc", "-l"),
  intern = TRUE
))

# Configured panel sizes; 0 stands for the full panel. Sizes are doubled
# before plotting.
# NOTE(review): presumably samples -> haplotypes; confirm.
groups <- as.numeric(snakemake@config[["refsize"]])
groups[groups == 0] <- refsize0
groups <- groups * 2

dl.regular <- lapply(snakemake@input[["regular"]], read.table)
dl.zilong <- lapply(snakemake@input[["zilong"]], read.table)
dl.glimpse1 <- lapply(snakemake@input[["glimpse1"]], read.table)
dl.glimpse2 <- lapply(snakemake@input[["glimpse2"]], read.table)
# Collect the tables per method and label each one with its panel size.
rds <- list(QUILT2 = dl.zilong, GLIMPSE2 = dl.glimpse2, QUILT1 = dl.regular, GLIMPSE1 = dl.glimpse1)
rds <- lapply(rds, function(l) {
  names(l) <- paste0("size=", groups)
  l
})

saveRDS(rds, snakemake@output[["rds"]])
rds <- readRDS(snakemake@output[["rds"]])


# Rows are panel sizes; columns follow the order of `rds`.
# Seconds -> minutes, and gunram() units / 1024.
times <- cbind(gnutime(rds$QUILT2), gnutime(rds$GLIMPSE2), gnutime(rds$QUILT1), gnutime(rds$GLIMPSE1)) / 60
rams <- cbind(gunram(rds$QUILT2), gunram(rds$GLIMPSE2), gunram(rds$QUILT1), gunram(rds$GLIMPSE1)) / 1024


wong <- c("#e69f00", "#d55e00", "#56b4e9", "#cc79a7", "#009e73", "#0072b2", "#f0e442")
mycols <- wong


# The PDF is written next to the RDS output, with ".pdf" appended to its path.
pdf(paste0(snakemake@output[["rds"]], ".pdf"), width = 12, height = 6)
par(mfrow = c(1, 2))

# Panel 1: run time (log-scaled y axis), one curve per method.
plot(groups, times[, 1], type = "b", lwd = 1.0, pch = 1, col = mycols[1], ylab = "Runtime in Minutes", xlab = "Reference panel size", ylim = c(min(times) * 0.9, max(times) * 1.1), log = "y")
for (j in 2:4) {
  lines(groups, times[, j], type = "b", lwd = 1.0, pch = 1, col = mycols[j])
}
# One legend colour per method, in the same order as the columns above.
legend("topleft", legend = names(rds), col = mycols[seq_along(rds)], pch = 1, lwd = 1.5, cex = 1.1, xjust = 0, yjust = 1, bty = "n")

# Panel 2: peak memory, one curve per method.
plot(groups, rams[, 1], type = "b", lwd = 1.0, pch = 1, col = mycols[1], ylab = "Maximum RAM in GBs", xlab = "Reference panel size", ylim = c(min(rams) * 0.9, max(rams) * 1.1))
for (j in 2:4) {
  lines(groups, rams[, j], type = "b", lwd = 1.0, pch = 1, col = mycols[j])
}
dev.off()

R BCFtools From line 1 of scripts/speed_panelsize.R

# Turn colon-separated durations into a number of seconds.
#
# ss: character vector (or factor) of durations with one to three (or more)
#     fields; the last field is seconds and every field further left is
#     multiplied by another factor of 60. Fractional fields are allowed.
# Returns a numeric vector as long as `ss`; numeric(0) for empty input.
gettimes <- function(ss) {
  # Factors must be converted because strsplit() requires character input.
  if (is.factor(ss)) {
    ss <- as.character(ss)
  }
  vapply(strsplit(ss, ":"), function(s) {
    s <- as.numeric(s)
    sum(s * 60^(rev(seq_along(s)) - 1))
  }, numeric(1))
}

# Total duration, in seconds, of each benchmark table in `dl`.
#
# dl: list of tables with colon-separated durations in column 1.
# Returns a numeric vector named like `dl`; numeric(0) for an empty list.
gnutime <- function(dl) {
  vapply(dl, function(d) {
    sum(gettimes(d[, 1]))
  }, numeric(1))
}

# Peak of column 2 of each benchmark table in `dl`, divided by 1024.
#
# dl: list of tables with memory readings in column 2.
# Returns a numeric vector named like `dl`; numeric(0) for an empty list.
# NOTE(review): the result is labelled MB, so the input is presumably in KB --
# confirm against the benchmark log format.
gunram <- function(dl) {
  vapply(dl, function(d) {
    max(d[, 2]) / 1024 # MB units
  }, numeric(1))
}

# Sequencing depths, used as the x axis of both panels.
groups <- as.numeric(snakemake@config[["downsample"]])
nd <- length(groups)

# One benchmark table per input file; summarise each into a total time
# (seconds) and a peak memory value (gunram() units).
dl.regular <- lapply(snakemake@input, read.table)
times <- data.frame(gnutime(dl.regular))
rams <- data.frame(gunram(dl.regular))

saveRDS(list(time = times, ram = rams), snakemake@output[["rds"]])


wong <- c("#e69f00", "#d55e00", "#56b4e9", "#cc79a7", "#009e73", "#0072b2", "#f0e442")
mycols <- wong[1:3]

# y ranges padded by 10% below the minimum and above the maximum.
time_ylim <- c(min(times) * 0.9, max(times) * 1.1)
ram_ylim <- c(min(rams) * 0.9, max(rams) * 1.1)

pdf(snakemake@output[["pdf"]], w = 12, h = 6)
par(mfrow = c(1, 2))

# Left panel: total run time against depth.
plot(
  groups, times[, 1],
  type = "b", lwd = 1.0, pch = 1, col = mycols[1],
  ylab = "Total Time in seconds for the chromosome",
  xlab = "Sequencing depth",
  ylim = time_ylim
)
legend(
  "topleft",
  legend = c(snakemake@params[["N"]]),
  col = mycols, pch = 1, lwd = 1.5, cex = 1.1,
  xjust = 0, yjust = 1, bty = "n"
)

# Right panel: peak memory against depth.
plot(
  groups, rams[, 1],
  type = "b", lwd = 1.0, pch = 1, col = mycols[1],
  ylab = "Maximum RAM in MBs for the chromosome",
  xlab = "Sequencing depth",
  ylim = ram_ylim
)
dev.off()

R From line 2 of scripts/speed_single.R

# Write the list of reference-panel samples to use for one panel size.
# The sample names come from `bcftools query -l` on the panel VCF; the target
# samples are removed, then either all remaining samples (size == 0) or a
# random subset of `size` of them is written, one name per line.

# system2() does not quote its arguments, so quote the path explicitly.
ql <- c("query", "-l", shQuote(snakemake@params[["vcf"]]))
size <- as.integer(snakemake@wildcards[["size"]])
allsamples <- as.character(system2("bcftools", ql, stdout = TRUE))
# NOTE(review): assumes params$samples is a vector of sample names -- confirm.
targesamples <- snakemake@params[["samples"]]
# remove target sample from the panel
allsamples <- allsamples[!allsamples %in% targesamples]
if (size == 0) {
  subsets <- allsamples
} else {
  if (size > length(allsamples)) {
    stop(
      "requested panel size ", size, " exceeds the ", length(allsamples),
      " samples available after removing the target samples",
      call. = FALSE
    )
  }
  # Draw `size` samples at random, keeping them in their original order.
  subsets <- allsamples[sort(sample.int(length(allsamples), size))]
}
cat(subsets, file = snakemake@output[[1]], sep = "\n")