Demography of Birch Populations across Scandinavia

Code and supplementary material associated with a master's thesis on the population structure, demographic history and distribution of fitness effects of the diploid Betula pendula and the tetraploid B. pubescens across Scandinavia.

Workflows

All results are generated by a single Snakemake pipeline, whose subworkflows are visualised below.

Schematic of the total workflow

Subworkflows:

- SNP calling
- δaδi
- polyDFE
- Sample set derivation
- VCF filtering
- Summary statistics

Code Snippets
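
Every snippet below is attached to a Snakemake rule through the script: (or shell:) directive, with script paths resolved relative to the rules file. As a minimal sketch of how such a rule is wired up (the rule body shown here is illustrative, not copied from the pipeline), a plotting rule might look like:

rule plot_pca:
    input:
        "output/{sample_class}/snps/{sample_set}/biallelic/snps.vcf.gz"
    output:
        "output/{sample_class}/graphs/pca/{sample_set}.png"
    params:
        sample_set="{sample_set}"
    script:
        "../scripts/plot_pca.py"

Inside the script, Snakemake exposes the rule's inputs, outputs, params and config through a global snakemake object. This is why the scripts below read their arguments from snakemake and fall back to hard-coded test values in an except NameError block when run outside the pipeline.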

One entry per rule: the rule's line range within its rules file (naming the file where the catalog recorded it), followed by the script or shell command the rule runs.

- lines 64–65: script ../scripts/recode_variant_ids_plink.py
- lines 83–84: script ../scripts/create_pairwise_ld_lists.py
- lines 99–100: script ../scripts/extract_sites_plink.py
- lines 116–117: script ../scripts/run_admixture.py
- lines 129–130: script ../scripts/plot_cv_error_admixture.py
- lines 143–144: script ../scripts/create_sample_set.py
- lines 167–168: script ../scripts/plot_pca.py
- lines 185–186: script ../scripts/plot_clusters_location.py
- lines 204–205: script ../scripts/create_barplot_admixture.py
- rules/dadi.smk, lines 121–122: script ../scripts/fit_pop_scenarios_dadi.py
- rules/dadi.smk, lines 141–142: script ../scripts/determine_best_batch_dadi.py
- rules/dadi.smk, lines 158–159: script ../scripts/plot_trajectory_dadi.py
- rules/dadi.smk, lines 178–179: script ../scripts/plot_sfs_comparison_2D.py
- rules/dadi.smk, lines 197–198: script ../scripts/plot_sfs_comparison_1D.py
- rules/dadi.smk, lines 217–218: script ../scripts/bootstrap_pop_scenarios_dadi.py
- rules/dadi.smk, lines 239–240: script ../scripts/determine_cis_dadi.py
- rules/dadi.smk, lines 281–282: script ../scripts/plot_models_lrt_dadi.py
- rules/dadi.smk, lines 328–329: script ../scripts/plot_models_lrt_dadi.py
- rules/dadi.smk, lines 361–362: script ../scripts/plot_models_lrt_dadi.py
- rules/dadi.smk, lines 406–407: script ../scripts/plot_models_lrt_dadi.py
- rules/dadi.smk, lines 436–437: script ../scripts/tabulate_results_dadi.py
- rules/dadi.smk, lines 464–465: script ../scripts/tabulate_results_dadi.py
- rules/dadi.smk, lines 481–482: script ../scripts/calculate_basic_stats.py
- rules/dadi.smk, lines 501–502: script ../scripts/calculate_basic_stats.py
- lines 44–45: script ../scripts/plot_cv_error_feems.py
- lines 65–66: script ../scripts/calculate_migration_surfaces_cv.py
- lines 83–84: script ../scripts/calculate_migration_surfaces_cv.py
- rules/feems.smk, lines 101–102: script ../scripts/calculate_migration_surfaces_lambda.py
- rules/feems.smk, lines 120–121: script ../scripts/calculate_migration_surfaces_lambda.py
- rules/feems.smk, lines 136–137: script ../scripts/plot_sample_allocation_feems.py
- rules/feems.smk, lines 151–152: script ../scripts/plot_sample_locations.py
- rules/header.smk, lines 191–192: shell "cairosvg {input} -d 200 -o {output}"
- lines 202–203: shell "gatk IndexFeatureFile -I {input}"
- rules/pca.smk, lines 38–39: script ../scripts/plot_pca.py
- rules/pca.smk, lines 57–58: script ../scripts/plot_pca.py
- rules/pca.smk, lines 77–78: script ../scripts/plot_pca.py
- rules/pca.smk, lines 96–97: script ../scripts/plot_pca.py
- rules/pca.smk, lines 112–113: script ../scripts/plot_clusters_location.py
- lines 57–58: shell "wget {params.url} -O {output}"
- lines 75–76: script ../scripts/prepare_input_polydfe.py
- lines 91–92: script ../scripts/run_polydfe.py
- lines 106–107: script ../scripts/run_polydfe.py
- lines 118–119: script ../scripts/parse_dfe_output.R
- lines 130–131: script ../scripts/plot_dfe_types.py
- lines 151–152: script ../scripts/plot_dfe_bootstrapped.py
- lines 166–167: script ../scripts/create_bootstraps_dfe.R
- lines 181–182: script ../scripts/create_init_bootstraps_dfe.R
- lines 193–194: script ../scripts/parse_dfe_output.R
- lines 208–209: script ../scripts/plot_dfe_bootstrapped.py
- lines 220–221: script ../scripts/plot_dfe_param_dist.py
- lines 241–242: script ../scripts/plot_dfe_bootstrapped.py
- lines 253–254: script ../scripts/plot_dfe_gradient_error.py
- lines 266–267: script ../scripts/compare_dfe_types.R
- lines 277–278: script ../scripts/plot_props_nested_dfe_types.py
- lines 21–22: script ../scripts/create_sample_set.py
- lines 35–36: script ../scripts/create_sample_set.py
- lines 49–50: script ../scripts/create_sample_set.py
- lines 63–64: script ../scripts/create_sample_set.py
- lines 77–78: script ../scripts/create_sample_set.py
- lines 92–93: script ../scripts/create_sample_set.py
- lines 107–108: script ../scripts/create_sample_set.py
- lines 122–123: script ../scripts/create_sample_set.py
- lines 137–138: script ../scripts/create_sample_set.py
- lines 152–153: script ../scripts/create_sample_set.py
- lines 164–165: shell "cp {input} {output}"
- lines 179–180: script ../scripts/create_sample_set.py
- lines 194–195: script ../scripts/create_subpopulation_files.py
- lines 210–211: script ../scripts/create_subpopulation_files.py
- lines 69–70: shell "wget {params.url} -O {output}"
- lines 98–99: script ../scripts/create_sample_set.py
- lines 112–113: script ../scripts/create_sample_set.py
- lines 127–128: script ../scripts/trim_reads.py
- lines 144–145: shell "fastqc {input} --outdir {params.outdir}"
- lines 159–160: shell "multiqc {params.outdir} -o {params.outdir}"
- lines 174–175: shell "bwa index {input}"
- lines 188–189: script ../scripts/map_reads.py
- lines 201–202: shell "samtools coverage -m {input} > {output}"
- lines 212–213: shell "samtools coverage -m {input} > {output}"
- lines 223–224: shell "samtools faidx {input}"
- lines 235–236: script ../scripts/mark_duplicates.py
- lines 246–247: shell "samtools index {input} {output}"
- lines 257–258: shell "gatk CreateSequenceDictionary -R {input} -O {output}"
- lines 272–273: script ../scripts/generate_genomic_intervals.py
- lines 291–292: script ../scripts/call_haplotypes.py
- lines 302–303: script ../scripts/gather_variants.py
- lines 313–314: shell "gatk SortVcf -I {input} -O {output} --MAX_RECORDS_IN_RAM 50000"
- lines 327–328: script ../scripts/generate_genomic_intervals.py
- lines 341–342: script ../scripts/import_variants.py
- lines 355–356: script ../scripts/call_imported_variants.py
- lines 369–370: script ../scripts/quality_filter_snps.py
- lines 381–382: script ../scripts/filter_snps.py
- lines 392–393: script ../scripts/setup_est_sfs.py
- lines 407–408: script ../scripts/prepare_input_est_sfs.py
- lines 423–424: script ../scripts/gather_input_est_sfs.py
- lines 438–439: script ../scripts/derive_ancestral_alleles.py
- lines 450–451: script ../scripts/scatter_output_est_sfs.py
- lines 464–465: script ../scripts/recode_ancestral_alleles_vcf.py
- lines 475–476: shell "bgzip {input} -c > {output}"
- lines 487–488: script ../scripts/cleanup_annotation_file.py
- lines 498–499: shell "grep -v '#' {input} | sort -k1,1 -k4,4n -k5,5n -t$'\t' | bgzip -c > {output}"
- lines 509–510: shell "tabix -p gff {input}"
- lines 524–525: script ../scripts/predict_variant_effects_vep.py
- lines 537–538: script ../scripts/determine_degeneracy.py
- lines 548–549: shell "bgzip {input} -c > {output}"
- lines 561–562: script ../scripts/determine_synonymy.py
- lines 572–573: shell "bgzip {input} -c > {output}"
- lines 583–584: script ../scripts/gather_variants.py
- lines 594–595: shell "gatk SortVcf -I {input} -O {output} --MAX_RECORDS_IN_RAM 50000"
- lines 609–610: script ../scripts/plot_sfs_est.py
- lines 47–48: script ../scripts/call_haplotypes.py
- lines 63–64: script ../scripts/import_variants.py
- lines 37–38: script ../scripts/calculate_fst.py
- lines 51–52: script ../scripts/calculate_fst_windowed.py
- lines 64–65: script ../scripts/calculate_pi.py
- lines 77–78: script ../scripts/calculate_tajimas_d.py
- lines 90–91: script ../scripts/plot_fst.py
- lines 103–104: script ../scripts/plot_fst_windowed.py
- lines 116–117: script ../scripts/plot_tajimas_d.py
- lines 129–130: script ../scripts/plot_pi.py
- lines 144–145: script ../scripts/collect_sample_set_stats.py
- lines 160–161: script ../scripts/calculate_hwe.py
- lines 176–177: script ../scripts/calculate_hwe.py
- lines 189–190: script ../scripts/plot_hwe.py
- lines 202–203: script ../scripts/plot_hwe.py
- lines 216–217: script ../scripts/plot_heterozygosity.py
- lines 229–230: script ../scripts/plot_heterozygosity_comparison.py
- lines 244–245: script ../scripts/calculate_missingness.py
- lines 257–258: script ../scripts/plot_missingness_sites.py
- lines 270–271: script ../scripts/plot_missingness_individuals.py
- lines 284–285: script ../scripts/calculate_linkage.py
- rules/umap.smk, lines 35–36: script ../scripts/plot_umap.py
- rules/umap.smk, lines 52–53: script ../scripts/plot_umap.py
- rules/umap.smk, lines 71–72: script ../scripts/plot_umap.py
- lines 45–46: script ../scripts/generate_vcf_sample_set.py
- lines 62–63: script ../scripts/generate_vcf_sample_set.py
- lines 80–81: script ../scripts/generate_vcf_sample_set.py
- lines 96–97: script ../scripts/filter_by_info.py
- lines 112–113: script ../scripts/filter_by_info.py
- lines 127–128: script ../scripts/filter_by_info.py
- lines 142–143: script ../scripts/filter_by_info.py
- lines 157–158: script ../scripts/filter_by_info.py
- lines 172–173: script ../scripts/filter_by_info.py
- lines 187–188: script ../scripts/filter_by_info.py
- lines 203–204: script ../scripts/plot_sfs_1D.py
- lines 218–219: script ../scripts/plot_sfs_2D.py
- lines 236–237: script ../scripts/plot_sfs_1D.py
- lines 254–255: script ../scripts/plot_sfs_1D.py
- lines 273–274: script ../scripts/plot_sfs_1D.py
- lines 286–287: script ../scripts/plot_degeneracy.py
- lines 298–299: script ../scripts/plot_synonymy.py
- lines 311–312: script ../scripts/annotate_ld.py
- lines 325–326: script ../scripts/filter_by_info.py
- lines 21–22: script ../scripts/convert_plink.py

scripts/annotate_ld.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

try:
    vcf = snakemake.input[0]
    window_size_kb = snakemake.params.get('window_size_kb', 50)
    out = snakemake.output[0]
except NameError:
    # testing
    vcf = 'output/default/snps/pendula/biallelic/snps.vcf.gz'
    window_size_kb = 50
    out = 'scratch/vcf.r2.gz'

shell(f"bcftools +prune --annotate r2 {vcf} --window {window_size_kb}kb -Oz -o {out}")

scripts/bootstrap_pop_scenarios_dadi.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import dadi
import pandas as pd

import demographic_scenarios
import sfs_utils

try:
    vcf = snakemake.input.vcf
    pops = snakemake.input.pops
    result = snakemake.input.result
    scenario_name = snakemake.params.scenario
    sample_set = snakemake.params.sample_set
    n_pops = snakemake.params.n_pops
    params_out = snakemake.output.params
    params_pretty_out = snakemake.output.params_pretty
    mode = snakemake.params.mode

    n_iter = snakemake.config['dadi_n_iter_' + mode]

    generation_time = snakemake.config['generation_time']
    mu = snakemake.config['mu']
    N_e = snakemake.config['N_e'][sample_set]
except NameError:
    # testing
    vcf = "output/default/snps/pendula/synonymous/snps.vcf.gz"
    """pops = "output/default/sample_sets/subpopulations/pendula/2_pops.txt"
    scenario_name = 'split_symmetric_migration'
    n_pops = 2"""
    pops = "output/default/sample_sets/subpopulations/pendula/1_pops.txt"
    result = "output/default/dadi/one_pop_size_change_since_ice_age/pendula/synonymous/batches/1/dadi.csv"
    scenario_name = 'one_pop_size_change_since_ice_age'
    n_pops = 1
    sample_set = "pendula"
    params_out = "scratch/dadi.csv"
    params_pretty_out = "scratch/dadi.out"
    mode = 'local_optimization'

    n_iter = 10

    generation_time = 20
    mu = 1e-9
    N_e = 675000

n_proj = 20
dd = dadi.Misc.make_data_dict_vcf(vcf, pops)

# divide data into chunks of size chunk_size_bs
# we then subsample from these chunks with replacement
# to create the bootstraps
# https://dadi.readthedocs.io/en/latest/user-guide/bootstrapping/
chunks = dadi.Misc.fragment_data_dict(dd, demographic_scenarios.DemographicScenario.chunk_size_bs)

# obtain one bootstrap sample
fs_bs = dadi.Misc.bootstraps_from_dd_chunks(chunks, 1, sfs_utils.get_pop_ids(n_pops), [n_proj] * n_pops)[0]

# we pass the original data dict here as it is only
# used to create parametric bootstraps
params = dict(fs=fs_bs, dd=dd, n_pops=n_pops, n_proj=n_proj, n_iter=n_iter,
              N_e=N_e, mu=mu, generation_time=generation_time, mode=mode)

scenario = demographic_scenarios.create(scenario_name, params)

# load the best-fit params from the simulation result
fit_params = pd.read_csv(result, index_col=None, header=0).iloc[0]

# determine the initial values
scenario.p0 = [fit_params[name] for name in scenario.params]

# rerun the simulation from these initial values for a single
# iteration, for convenience
scenario.simulate()

# save to file
data = scenario.to_dataframe()
open(params_pretty_out, 'w').write(data.to_markdown())
open(params_out, 'w').write(data.to_csv())

scripts/calculate_basic_stats.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import pandas as pd

import sfs_utils
import utils

try:
    vcfs = snakemake.input.vcfs
    pops_1 = snakemake.input.pops_1
    pops_2 = snakemake.input.pops_2
    names = snakemake.params.names
    out_txt = snakemake.output.txt
    out_csv = snakemake.output.csv
except NameError:
    # testing
    sample_sets = ["pubescens"]
    flag = "0fold"
    vcfs = [f"output/tetraploid/snps/{sample_set}/{flag}/snps.vcf.gz" for sample_set in sample_sets]
    # vcf_all = [f"output/tetraploid/snps/{sample_set}/all/snps.vcf.gz" for sample_set in sample_sets]
    pops_1 = [f"output/tetraploid/sample_sets/subpopulations/{sample_set}/1_pops.txt" for sample_set in sample_sets]
    pops_2 = [f"output/tetraploid/sample_sets/subpopulations/{sample_set}/2_pops.txt" for sample_set in sample_sets]
    names = sample_sets
    out_txt = "scratch/stats.txt"
    out_csv = "scratch/stats.csv"

stats = pd.DataFrame(index=['n sites', "Tajima's D", 'pi', 'pi per site', 'S', 'FST', 'FST_0'])

for i, name in enumerate(names):
    print(f'Calculating for subset: {name}', flush=True)

    # load the full spectrum for this subset;
    # down-projecting does not appear to affect
    # any of the calculated statistics
    fs_1D = sfs_utils.get_full_spectrum(vcfs[i], pops_1[i])
    fs_2D = sfs_utils.get_full_spectrum(vcfs[i], pops_2[i])

    n_sites = utils.count_lines_vcf(vcfs[i])
    # n_sites_all = utils.count_lines_vcf(vcf_all[i])

    stats[name] = [
        n_sites,
        fs_1D.Tajima_D(),
        fs_1D.pi(),
        fs_1D.pi() / n_sites,
        fs_1D.S(),
        fs_2D.Fst(),
        fs_2D.scramble_pop_ids().Fst()
    ]

open(out_txt, 'w').write(stats.to_markdown())
stats.to_csv(out_csv)

scripts/calculate_fst.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

vcf = snakemake.input.vcf
pops = snakemake.input.pops
out = snakemake.params.prefix

shell(f"vcftools --gzvcf {vcf} --weir-fst-pop {pops[0]} --weir-fst-pop {pops[1]} --out {out}")

scripts/calculate_fst_windowed.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

vcf = snakemake.input.vcf
pops = snakemake.input.pops
out = snakemake.params.prefix

shell(f"vcftools --gzvcf {vcf} --weir-fst-pop {pops[0]} --weir-fst-pop {pops[1]} --fst-window-size 1000000000 --out {out}")

scripts/calculate_hwe.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

file = snakemake.params.input_prefix
out = snakemake.params.output_prefix
midp = snakemake.params.midp

flag = 'midp' if midp else ''

# execute command
shell(f"plink --bfile {file} --hardy {flag} --hwe 0 --allow-extra-chr --out {out}")

scripts/calculate_linkage.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

file = snakemake.params.input_prefix
out = snakemake.params.output_prefix

# execute command
# LD window options obtained from Luis Leal
shell(f"plink --bfile {file} --r2 --ld-window-kb 50 --ld-window 500 "
      f"--ld-window-r2 0 --allow-extra-chr --out {out}")

scripts/calculate_migration_surfaces_cv.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from feems_utils import *

try:
    test_mode = False
    sample_set = snakemake.params['sample_set']
    sample_class = snakemake.params['sample_class']
    flag = snakemake.params['flag']
    buffer = snakemake.params.get('buffer', 2)
    lambdas = snakemake.params.lambdas
    out_cv = snakemake.output.cv
    out_surfaces = snakemake.output.surfaces
except NameError:
    # testing
    test_mode = True
    sample_set = 'pendula'
    sample_class = 'default'
    flag = 'biallelic'
    buffer = 2
    lambdas = np.geomspace(1e-8, 1e2, 11)
    out_cv = 'scratch/cv.png'
    out_surfaces = 'scratch/surfaces.png'

sp_graph, samples, coords = get_sp_graph(sample_set, flag, sample_class=sample_class, buffer=buffer)

projection = get_projection(coords)

# determine the lambda with the lowest cross-validation error
best_lambda = determine_best_lamb(lambdas, sp_graph)

utils.save_fig(out_cv, tight_layout=True, show=test_mode, clear=True)

# fit the graph
sp_graph.fit(lamb=best_lambda)

# draw figure showing the migration surfaces
v = create_plot(sp_graph, projection)

v.draw_edge_colorbar()
v.draw_edges(use_weights=True)
v.draw_obs_nodes(use_ids=False)

utils.save_fig(out_surfaces, tight_layout=True, show=test_mode)

scripts/calculate_migration_surfaces_lambda.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from feems_utils import *

try:
    test_mode = False
    sample_set = snakemake.params['sample_set']
    sample_class = snakemake.params['sample_class']
    flag = snakemake.params['flag']
    lamb = snakemake.params.get('lamb', None)
    buffer = snakemake.params.get('buffer', 2)
    out_cv_error = snakemake.output.get('cv_error', None)
    out_surfaces = snakemake.output.surfaces
except NameError:
    # testing
    test_mode = True
    sample_set = 'pendula'
    sample_class = 'default'
    flag = 'biallelic'
    lamb = 10.0
    buffer = 2
    out_cv_error = 'scratch/cv_error.txt'
    out_surfaces = 'scratch/surfaces.png'

sp_graph, samples, coords = get_sp_graph(sample_set, flag, sample_class=sample_class, buffer=buffer)

projection = get_projection(coords)

if out_cv_error:
    # determine cross-validation error
    cv_error = determine_cv_error(lamb, sp_graph)[0, 0]

    with open(out_cv_error, 'w') as f:
        f.write(str(cv_error))

# fit the graph
sp_graph.fit(lamb=lamb)

# draw figure showing the migration surfaces
v = create_plot(sp_graph, projection)

v.draw_edge_colorbar()
v.draw_edges(use_weights=True)
v.draw_obs_nodes(use_ids=False)

plt.savefig(out_surfaces, bbox_inches='tight', pad_inches=0)

if test_mode:
    plt.show()

scripts/calculate_missingness.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

file = snakemake.params.input_prefix
out = snakemake.params.output_prefix

# execute command
shell(f"plink --bfile {file} --missing --allow-extra-chr --out {out}")

scripts/calculate_pi.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

vcf = snakemake.input.vcf
out = snakemake.params.prefix

shell(f"vcftools --site-pi --gzvcf {vcf} --out {out}")

scripts/calculate_tajimas_d.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

vcf = snakemake.input.vcf
out = snakemake.params.prefix

shell(f"vcftools --gzvcf {vcf} --TajimaD 1000 --out {out}")

scripts/call_haplotypes.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

ref = snakemake.input.ref
bam = snakemake.input.bam
intervals = snakemake.input.intervals
java_opts = get_java_opts(snakemake)
tmp_dir = snakemake.resources.tmpdir
ploidy = snakemake.params.ploidy
out = snakemake.output[0]

shell(f"gatk --java-options '{java_opts}' HaplotypeCaller "
      f"-R {ref} -I {bam} -O {out} --tmp-dir {tmp_dir} "
      f"--interval-padding 0 -L {intervals} -ERC GVCF "
      f"--sample-ploidy {ploidy}")

scripts/call_imported_variants.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

ref = snakemake.input.ref
db = snakemake.input.db
intervals = snakemake.input.intervals
java_opts = get_java_opts(snakemake)
out = snakemake.output[0]

shell(f"gatk GenotypeGVCFs --java-options '{java_opts}' -V gendb://{db} \
    -R {ref} -O {out} --include-non-variant-sites -L {intervals}")

scripts/cleanup_annotation_file.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

input = snakemake.input[0]
out = snakemake.output[0]

# cleanup file
shell(f"agat_convert_sp_gxf2gxf.pl -g {input} -o {out} --gvi 3")

scripts/collect_sample_set_stats.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import pandas as pd

import utils

try:
    vcfs = snakemake.input.vcfs
    sample_sets = snakemake.params.sample_sets
    sample_class = snakemake.params['sample_class']
    out = snakemake.output[0]
except NameError:
    sample_sets = ['pendula', 'pubescens', 'pendula_pubescens']
    sample_class = 'default'
    vcfs = ['output/default/snps/pendula/all/snps.vcf.gz',
            'output/default/snps/pubescens/all/snps.vcf.gz',
            'output/default/snps/pendula_pubescens/all/snps.vcf.gz'
            ]
    out = 'scratch/sample_set_stats.csv'

n_samples, n_sites = [], []
for i, sample_set in enumerate(sample_sets):
    n_samples.append(len(utils.get_samples(sample_set, sample_class)))
    n_sites.append(utils.count_lines_vcf(vcfs[i]))

data = pd.DataFrame(columns=[s.replace('_', ' ') for s in sample_sets])
data.loc['n samples'] = n_samples
data.loc['n sites'] = n_sites

data.T.to_csv(out)

scripts/compare_dfe_types.R

if (exists('snakemake')) {
    full_anc = snakemake@input[['full_anc']]
    full = snakemake@input[['full']]
    deleterious_anc = snakemake@input[['deleterious_anc']]
    deleterious = snakemake@input[['deleterious']]
    output = snakemake@output[[1]]
    postprocessing_source = snakemake@config$polydfe_postprocessing_source
} else {
    # testing
    full_anc = "output/default/polydfe/pendula/output/C.full_anc.out"
    full = "output/default/polydfe/pendula/output/C.full.out"
    deleterious_anc = "output/default/polydfe/pendula/output/C.deleterious_anc.out"
    deleterious = "output/default/polydfe/pendula/output/C.deleterious.out"
    output = "scratch/model_comparison.txt"
    postprocessing_source = "resources/polydfe/postprocessing/script.R"
}

source(postprocessing_source)

tt = rbind(compareModels(full_anc, full)$LRT,
           compareModels(deleterious_anc, deleterious)$LRT,
           compareModels(full, deleterious)$LRT,
           compareModels(full_anc, deleterious_anc)$LRT)

tt = data.frame(type1=c('full_anc', 'deleterious_anc', 'full', 'full_anc'),tt)
tt = data.frame(type2=c('full', 'deleterious', 'deleterious', 'deleterious_anc'),tt)

write.table(tt, output)

scripts/convert_plink.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

vcf = snakemake.input.vcf
prefix = snakemake.params.prefix

# create bed, bim and fam files
shell(f"plink --vcf {vcf} --out {prefix} --allow-extra-chr")

scripts/create_barplot_admixture.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import utils

try:
    test_mode = False
    Q = snakemake.input.Q
    sample_set = snakemake.params.sample_set
    flag = snakemake.params.flag
    sample_class = snakemake.params.sample_class
    subsample_sets = snakemake.params.subsample_sets
    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    sample_set = 'pendula_pubescens'
    flag = 'biallelic'
    sample_class = 'default'
    subsample_sets = [
        'pendula_north_admixture',
        'pendula_south_admixture',
        'pubescens_north_admixture',
        'pubescens_south_admixture'
    ]
    Q = f"output/{sample_class}/admixture/{sample_set}/{flag}/snps.admixture.4.Q"
    out = 'scratch/admixture_locations.png'

# load data
data = pd.read_csv(Q, sep=' ', header=None)
K = data.columns.shape[0]

data[['name', 'latitude']] = utils.get_samples(sample_set, sample_class=sample_class)[['name', 'latitude']]
data['sample_set'] = ''

# assign subsample sets to samples
for subsample_set in subsample_sets:
    subsamples = utils.get_samples(subsample_set, sample_class=sample_class)
    data.loc[data.name.isin(subsamples.name), 'sample_set'] = subsample_set

# sort individuals
# sort sample sets in the specified order
data['sample_set_order'] = [subsample_sets.index(s) for s in data.sample_set]
data = data.sort_values(['sample_set_order', 'latitude']).reset_index()

# plot data
ax = data[range(K)].plot(kind='bar', stacked=True, xticks=[], yticks=[0, 1], width=1, legend=None)

# define ticks
labels = [utils.get_proper_name(s) for s in subsample_sets]
tick_locs = [np.median(data[data.sample_set == s].index) for s in subsample_sets]
plt.xticks(ticks=tick_locs, labels=labels)

# plot vertical lines separating the subsample sets
plt.vlines([np.max(data[data.sample_set == s].index) for s in subsample_sets[:-1]], 0, 1, linestyles='dashed', color='black')

ax.set_aspect(data.shape[0] / 5)
utils.scale_cf(4 / 3)
ax.margins(0)

utils.save_fig(out, tight_layout=True, show=test_mode)

scripts/create_bootstraps_dfe.R

if (exists('snakemake')) {
    input = snakemake@input[[1]]
    n = snakemake@params[['n']]
    prefix = snakemake@params[['prefix']]
    postprocessing_source = snakemake@config$polydfe_postprocessing_source
} else {
    input = "output/default/polydfe/pendula.1_pops.C.full_anc.out"
    n = 10
    prefix = "scratch/polydfe"
    postprocessing_source = "resources/polydfe/postprocessing/script.R"
}

source(postprocessing_source)

bootstrapData(input, outputfile = prefix, rep = n)

scripts/create_init_bootstraps_dfe.R

if (exists('snakemake')) {
    input = snakemake@input[[1]]
    prefix = snakemake@params[['prefix']]
    type = snakemake@params[['type']]
    postprocessing_source = snakemake@config$polydfe_postprocessing_source

    source(postprocessing_source)
} else {
    input = "output/default/polydfe/pendula/out/C.deleterious.out"
    prefix = "scratch/init_C.deleterious"
    type = "deleterious"
    postprocessing_source = "https://github.com/paula-tataru/polyDFE/raw/master/postprocessing.R"

    library(devtools)
    source_url(postprocessing_source)
}

# For some reason we need to pass the parameters to be fixed explicitly.
# We only support model C here
fix = c("eps_cont")

if (type == 'deleterious_anc' | type == 'deleterious') {
    fix = c(fix, c("p_b", "S_b"))
}

if (type == 'deleterious' | type == 'full') {
    fix = c(fix, "eps_an")
}

createInitLines(input, prefix, fix = fix)

scripts/create_pairwise_ld_lists.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import os.path

from snakemake.shell import shell

try:
    bed = snakemake.input.bed
    window_size = snakemake.params.get('window_size', 50)
    step_size = snakemake.params.get('step_size', 10)
    R2 = snakemake.params.get('R2', 0.1)
    out = snakemake.output[0]
except NameError:
    # testing
    bed = 'output/default/snps/pendula/biallelic/snps.bed'
    window_size = 50
    step_size = 10
    R2 = 0.1
    out = 'scratch/'

output_prefix = os.path.dirname(out)
input_file_prefix = os.path.basename(bed).replace('.bed', '')

shell(f"""
    cd {output_prefix}
    plink --bfile {input_file_prefix} --indep-pairwise {window_size} {step_size} {R2} --allow-extra-chr
""")

scripts/create_sample_set.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import pandas as pd

import pca_utils
import utils

# The derivation of some sample sets depends on the previous derivation of
# other, more general sample sets, because the utils module looks for
# previously generated files matching the sample set name. The order of
# derivation therefore matters: it has to proceed from less specific to
# more specific sample sets.

try:
    input = snakemake.input[0]
    sample_set = snakemake.params.sample_set
    sample_class = snakemake.params.sample_class

    pendula_latitudinal_barrier = snakemake.config['latitudinal_barriers']['pendula']
    pubescens_latitudinal_barrier = snakemake.config['latitudinal_barriers']['pubescens']

    out = snakemake.output[0]
except NameError:
    # testing
    input = 'output/default/admixture/pubescens/biallelic/snps.admixture.2.Q'
    sample_set = 'pubescens_north_admixture'
    sample_class = 'default'

    pendula_latitudinal_barrier = 64
    pubescens_latitudinal_barrier = 65

    out = f'scratch/{sample_set}.args'

# get the samples to be included depending on ADMIXTURE's
# Q matrix denoting the provenance of an individual
def get_samples_admixture(condition, supersample_set):
    Q = pd.read_csv(input, index_col=None, header=None, sep=' ')

    mask = condition(Q.iloc[:, 0]).values

    return utils.get_samples(supersample_set, sample_class)[mask]


# based on PCA
pendula_pubescens_barrier = 20

birch_outliers = ['GNA02', 'GNA04', 'BUR27a', 'BUR27b']

birch_samples = pd.read_csv(utils.config['birch_samples'], index_col=0)

if sample_set == 'all':

    outgroup_samples = pd.read_csv(utils.config['outgroup_samples'], index_col=0)

    # concatenate outgroup and birch samples
    samples = pd.concat([birch_samples, outgroup_samples])

elif sample_set == 'birch':

    samples = birch_samples

elif sample_set == 'left_cluster':

    pca, pc, _ = pca_utils.get('birch', 'biallelic', sample_class)

    samples = birch_samples[pc[:, 0] < pendula_pubescens_barrier]

elif sample_set == 'right_cluster':

    pca, pc, _ = pca_utils.get('birch', 'biallelic', sample_class)

    samples = birch_samples[pc[:, 0] >= pendula_pubescens_barrier]

elif sample_set == 'pendula_pubescens':

    # remove outliers
    samples = birch_samples[~birch_samples.name.isin(birch_outliers)]

elif sample_set == 'pendula':

    samples = utils.get_samples('right_cluster', sample_class)

    samples = samples[~samples.name.isin(birch_outliers)]

elif sample_set == 'pendula_south':

    samples = utils.get_samples('pendula', sample_class)

    samples = samples[samples.latitude <= pendula_latitudinal_barrier]

elif sample_set == 'pendula_north':

    samples = utils.get_samples('pendula', sample_class)

    samples = samples[samples.latitude > pendula_latitudinal_barrier]

elif sample_set == 'pubescens':

    samples = utils.get_samples('left_cluster', sample_class)

    # remove outliers
    samples = samples[~samples.name.isin(birch_outliers)]

elif sample_set == 'pubescens_south':

    samples = utils.get_samples('pubescens', sample_class)

    samples = samples[samples.latitude <= pubescens_latitudinal_barrier]

elif sample_set == 'pubescens_north':

    samples = utils.get_samples('pubescens', sample_class)

    samples = samples[samples.latitude > pubescens_latitudinal_barrier]

elif sample_set == 'outgroups':

    samples = pd.read_csv(utils.config['outgroup_samples'], index_col=0)

elif sample_set == 'pendula_north_admixture':

    samples = get_samples_admixture(lambda Q: Q > 0.5, 'pendula')

elif sample_set == 'pendula_south_admixture':

    samples = get_samples_admixture(lambda Q: Q <= 0.5, 'pendula')

elif sample_set == 'pubescens_north_admixture':

    samples = get_samples_admixture(lambda Q: Q > 0.5, 'pubescens')

elif sample_set == 'pubescens_south_admixture':

    samples = get_samples_admixture(lambda Q: Q <= 0.5, 'pubescens')

elif sample_set == 'pubescens_admixture':

    samples = get_samples_admixture(lambda Q: Q > 0.5, 'pendula_pubescens')

elif sample_set == 'pendula_admixture':

    samples = get_samples_admixture(lambda Q: Q <= 0.5, 'pendula_pubescens')

# write samples to file
samples.name.to_csv(out, index=False, header=False)

scripts/create_subpopulation_files.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import os
import utils

sample_set = snakemake.params.sample_set
sample_class = snakemake.params.sample_class
n_pops = snakemake.params.n_pops
out = snakemake.output[0]

samples = utils.get_samples(sample_set, sample_class)
samples['pop'] = 'pop0'


def assign_pop1(subset_name, samples):
    subset = utils.get_samples(subset_name, sample_class)

    samples.loc[samples.name.isin(subset.name), 'pop'] = 'pop1'

# subpopulations:
# pubescens: pop0, pendula: pop1
# northern: pop0, southern: pop1
if n_pops == 2:
    if sample_set == 'pendula_pubescens':
        assign_pop1('pendula', samples)
    elif sample_set == 'pendula':
        assign_pop1('pendula_south', samples)
    elif sample_set == 'pubescens':
        assign_pop1('pubescens_south', samples)

# write to file
with open(out, 'w') as f:
    f.write(os.linesep.join(samples[['name', 'pop']].agg('\t'.join, axis=1)) + os.linesep)

scripts/derive_ancestral_alleles.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

data = snakemake.input.data
seed = snakemake.input.seed
config = snakemake.input.config
bin = snakemake.input.bin
out_sfs = snakemake.output.sfs
out_probs = snakemake.output.probs
tmp_dir = snakemake.resources.tmpdir

# execute command
shell(f"{bin} {config} {data} {seed} {out_sfs} {out_probs}")

scripts/determine_best_batch_dadi.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import pandas as pd

import demographic_scenarios

try:
    test_mode = False
    vcf = snakemake.input.vcf
    pops = snakemake.input.pops
    results = snakemake.input.results
    scenario_name = snakemake.params.scenario
    sample_set = snakemake.params.sample_set
    n_pops = snakemake.params.n_pops
    params_out = snakemake.output.params
    params_pretty_out = snakemake.output.params_pretty

    generation_time = snakemake.config['generation_time']
    mu = snakemake.config['mu']
    N_e = snakemake.config['N_e'][sample_set]
except NameError:
    # testing
    test_mode = True
    vcf = "output/tetraploid/snps/pendula/synonymous/snps.vcf.gz"
    sample_set = 'pendula_pubescens'
    scenario_name = 'split_symmetric_migration'
    n_pops = 2
    """scenario_name = 'one_pop_size_change_since_ice_age'
    n_pops = 1"""
    results = [f"output/tetraploid/dadi/{scenario_name}/{sample_set}/synonymous/batches/{n}/dadi.csv"
               for n in range(1, 101)]
    pops = f"output/tetraploid/sample_sets/subpopulations/pendula/{n_pops}_pops.txt"
    params_out = "scratch/dadi.csv"
    params_pretty_out = "scratch/dadi.out"
    generation_time = 20
    mu = 1e-9
    N_e = 675000

res = []
for file in results:
    df = pd.read_csv(file, index_col=None, header=0)
    res.append(df)

res = pd.concat(res, axis=0, ignore_index=True)

# determine result with highest likelihood
best_params = res[res.lnl == res.lnl.max()].iloc[0]

# determine initial value
param_names = demographic_scenarios.get_class(scenario_name).params
p0 = [best_params[name] for name in param_names]

scenario = demographic_scenarios.restore(scenario_name, p0, vcf, pops,
                                         generation_time, n_pops, N_e, mu)

# calculate standard deviation from GIM
scenario.perform_uncertainty_analysis()

# save to file
data = scenario.to_dataframe()
open(params_pretty_out, 'w').write(data.to_markdown())
open(params_out, 'w').write(data.to_csv())

scripts/determine_cis_dadi.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import numpy as np
import pandas as pd

import demographic_scenarios
import utils

try:
    vcf = snakemake.input.vcf
    pops = snakemake.input.pops
    results = snakemake.input.results
    result = snakemake.input.result_original
    scenario_name = snakemake.params.scenario
    sample_set = snakemake.params.sample_set
    n_pops = snakemake.params.n_pops
    params_out = snakemake.output.params
    params_pretty_out = snakemake.output.params_pretty

    generation_time = snakemake.config['generation_time']
    mu = snakemake.config['mu']
    N_e = snakemake.config['N_e'][sample_set]
except NameError:
    # testing
    vcf = "output/tetraploid/snps/pendula/synonymous/snps.vcf.gz"
    """pops = "output/tetraploid/sample_sets/subpopulations/pendula/2_pops.txt"
    scenario_name = 'split_symmetric_migration'
    n_pops = 2"""
    scenario_name = 'exp_pop_growth'
    sample_set = "pendula"
    pops = f"output/tetraploid/sample_sets/subpopulations/{sample_set}/1_pops.txt"
    results = [f"output/tetraploid/dadi/{scenario_name}/{sample_set}/synonymous/bootstraps/{n}/dadi.csv" for n in range(1, 11)]
    result = f"output/tetraploid/dadi/{scenario_name}/pendula/synonymous/dadi.csv"
    n_pops = 1
    params_out = "scratch/dadi.csv"
    params_pretty_out = "scratch/dadi.out"

    generation_time = 20
    mu = 1e-9
    N_e = 675000

# load bootstrap results
bootstraps = []
for file in results:
    bootstraps.append(pd.read_csv(file, index_col=None, header=0).head(1))

bootstraps = pd.concat(bootstraps, axis=0, ignore_index=True)

# load params from file
params = pd.read_csv(result, index_col=None, header=0).iloc[0]

# determine initial value
param_names = demographic_scenarios.get_class(scenario_name).params
p0 = [params[name] for name in param_names]

scenario = demographic_scenarios.restore(scenario_name, p0, vcf, pops,
                                         generation_time, n_pops, N_e, mu)

# calculate standard deviation from GIM
scenario.perform_uncertainty_analysis()

# get dataframe of params
data = scenario.to_dataframe()

# significance level for the confidence intervals
a = 0.05

# calculate additional statistics based on the bootstraps
cols = [
    'cis_bs_percentile',  # percentile bootstrap confidence interval
    'cis_bs_bca',  # bca bootstrap confidence interval
    'std_bs',  # bootstrap standard deviation
    'mean_bs',  # bootstrap mean
]

col_data = {key: [] for key in cols}
for name in ['lnl', 'theta'] + scenario.params:
    values = bootstraps[name].values.tolist()

    bs_percentile = utils.get_ci_percentile_bootstrap(values, a)
    bs_bca = utils.get_ci_bca(values, params[name], a)

    col_data['cis_bs_percentile'].append(bs_percentile)
    col_data['cis_bs_bca'].append(bs_bca)
    col_data['std_bs'].append(np.std(values, ddof=1))
    col_data['mean_bs'].append(np.mean(values))

# add to data frame
for col in cols:
    data.loc[col] = col_data[col]

# save to file
open(params_out, 'w').write(data.to_csv())
open(params_pretty_out, 'w').write(data.to_markdown())

scripts/determine_degeneracy.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import vcf
from Bio import SeqIO
from vcf.parser import _Info as Info

import codon_utils

try:
    vcf_file = snakemake.input.vcf
    gff = snakemake.input.gff
    ref = snakemake.input.ref
    out = snakemake.output[0]
except NameError:
    # testing
    vcf_file = 'testing/snps1.ancestral.vep.annotated.vcf.gz'
    gff = 'resources/reference/genome.corrected.gff.gz'
    ref = 'resources/reference/genome.fasta'
    out = 'scratch/annotated.vcf'

vcf_reader = vcf.Reader(filename=vcf_file)

# add new fields to header
vcf_reader.infos['Degeneracy'] = Info('Degeneracy', 1, 'Integer', 'n-fold degeneracy', None, None)
vcf_reader.infos['Degeneracy_Info'] = Info('Degeneracy_Info', 1, 'String',
                                           'Additional information about custom degeneracy annotation:', None, None)

vcf_writer = vcf.Writer(open(out, 'w'), vcf_reader)
ref_reader = SeqIO.parse(ref, 'fasta')


# add degeneracy annotation for given record
def annotate_degeneracy(record, cd, codon, codon_pos, codon_start, pos_codon, contig):
    degeneracy = None
    if 'N' not in codon:
        degeneracy = codon_utils.get_degeneracy(codon, pos_codon)

    record.INFO['Degeneracy'] = degeneracy
    record.INFO['Degeneracy_Info'] = f"{pos_codon},{cd.strand},{codon}"

    print(f'pos codon: {pos_codon}, pos abs: {record.POS}, '
          f'codon start: {codon_start}, codon: {codon}, '
          f'strand: {cd.strand}, ref allele: {contig[record.POS - 1]}, '
          f'degeneracy: {degeneracy}, codon pos: {str(codon_pos)}, '
          f'rec allele: {record.REF}', flush=True)

    return record


info_fields = {'Degeneracy': None, 'Degeneracy_Info': None}

# perform annotation
codon_utils.map_sites(vcf_reader, vcf_writer, gff, ref_reader, annotate_degeneracy, info_fields=info_fields)

scripts/determine_synonymy.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import numpy as np
import vcf
from Bio import SeqIO
from vcf.parser import _Info as Info

import codon_utils

try:
    vcf_file = snakemake.input.vcf
    gff = snakemake.input.gff
    ref = snakemake.input.ref
    out = snakemake.output[0]
except NameError:
    # testing
    vcf_file = 'testing/snps1.ancestral.vep.annotated.vcf.gz'
    gff = 'resources/reference/genome.corrected.gff.gz'
    ref = 'resources/reference/genome.fasta'
    out = 'scratch/annotated.vcf'

vcf_reader = vcf.Reader(filename=vcf_file)

# add new fields to header
vcf_reader.infos['Synonymy'] = Info('Synonymy', 1, 'Integer', 'Whether coding site is synonymous', None, None)
vcf_reader.infos['Synonymy_Info'] = Info('Synonymy_Info', 1, 'String',
                                         'Additional information about custom synonymy annotation:', None, None)

vcf_writer = vcf.Writer(open(out, 'w'), vcf_reader)
ref_reader = SeqIO.parse(ref, 'fasta')

# Maximum number of assertion errors before raising an exception.
# We introduce some leniency here, as VEP produced different
# codons in a very few cases for which no explanation could be found.
n_errors_tol = 3

# the erroneous records
records_err = []


# add synonymy annotation for given record
def annotate_synonymy(record, cd, codon, codon_pos, codon_start, pos_codon, contig):
    # fetch the alternative allele if present
    alt = codon_utils.get_alt_allele(record, cd.strand)

    info = ''
    synonymy, alt_codon, codons_vep = None, None, None
    if alt:
        # alternative codon
        # 'n' might not be in uppercase
        alt_codon = codon_utils.mutate(codon, alt, pos_codon).upper()

        # whether the alternative codon is synonymous
        if 'N' not in codon and 'N' not in alt_codon:
            synonymy = int(codon_utils.is_synonymous(codon, alt_codon))

        info += f'{alt_codon}'

        if alt_codon in codon_utils.stop_codons: info += ',stop_gained'
        if alt_codon in codon_utils.start_codons: info += ',start_gained'

        # fetch the codons from the VEP annotation
        codons_vep = codon_utils.parse_codons_vep(record)

        # make sure the codons determined by VEP are the same as our codons
        # we can only do the comparison for variant sites
        # only do the comparison if we could find exactly one pair of VEP codons
        # if there are several pairs then this might be because of adjacent SNPs
        # whose alternative codons are not supported at this point
        if len(codons_vep) == 1:

            # check for equality
            if not np.array_equal(codons_vep[0], [codon, alt_codon]):
                records_err.append(record)

            if len(records_err) > n_errors_tol:
                raise AssertionError(f"VEP Codons don't match at {codon_utils.format_errors(records_err)}. "
                                     f"Number of errors ({n_errors_tol}) exceeded.")

    # add to info field
    record.INFO['Synonymy'] = synonymy
    record.INFO['Synonymy_Info'] = info

    print(f'pos codon: {pos_codon}, pos abs: {record.POS}, '
          f'codon start: {codon_start}, strand: {cd.strand}, '
          f'ref allele: {contig[record.POS - 1]}, rec allele: {record.REF}, '
          f'codon pos: {str(codon_pos)}, codons: {str([codon, alt_codon])}, '
          f'VEP: {str(codons_vep)}', flush=True)

    return record


info_fields = {'Synonymy': None, 'Synonymy_Info': None}

# perform annotation
codon_utils.map_sites(vcf_reader, vcf_writer, gff, ref_reader, annotate_synonymy, info_fields=info_fields)

print(f"In total, {len(records_err)} codon assertion errors "
      f"occurred at {str([str(r) for r in records_err])}.")

scripts/extract_sites_plink.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

try:
    ids = snakemake.input.ids
    input = snakemake.input.bed
    out = snakemake.output.bed
except NameError:
    # testing
    ids = "output/default/snps/pendula/biallelic/plink.prune.in"
    input = "output/default/snps/pendula/biallelic/snps.bed"
    out = "output/default/snps/pendula/biallelic/snps.admixture.bed"

input_prefix = input.replace('.bed', '')
out_prefix = out.replace('.bed', '')

shell(f"plink --bfile {input_prefix} --extract {ids} --make-bed --out {out_prefix} --allow-extra-chr '0'")

scripts/filter_by_info.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

java_opts = get_java_opts(snakemake)
vcf = snakemake.input.vcf
filter = snakemake.params.filter
out = snakemake.output[0]

shell(f"gatk --java-options '{java_opts}' SelectVariants -V {vcf} -O {out} "
      f"--selectExpressions '{filter}'")

scripts/filter_snps.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

# from snakemake_wrapper_utils.java import get_java_opts

vcf = snakemake.input.vcf
# samples = snakemake.input.samples
# java_opts = get_java_opts(snakemake)
out = snakemake.output[0]

# keep only mono- and biallelic SNPs
# we retain non-variant sites here, which we will annotate later,
# as we need the total number of surveyed sites for predicting the DFE
shell(f"bcftools view {vcf} -Oz --apply-filters .,PASS --max-alleles 2 --exclude-types indels,mnps,other > {out}")

# this didn't work for some reason: there were still indels among the filtered variants
"""shell(f"gatk --java-options '{java_opts}' SelectVariants -V {vcf} -O {out} \
    --select-type-to-include SNP --select-type-to-include NO_VARIATION \
    --select-type-to-exclude INDEL --select-type-to-exclude SYMBOLIC \
    --select-type-to-exclude MNP --select-type-to-exclude MIXED \
    --sample-name {samples}")"""

scripts/fit_pop_scenarios_dadi.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import dadi

import demographic_scenarios
import sfs_utils

try:
    test_mode = False
    vcf = snakemake.input.vcf
    pops = snakemake.input.pops
    scenario_name = snakemake.params.scenario
    sample_set = snakemake.params.sample_set
    n_pops = snakemake.params.n_pops
    params_out = snakemake.output.params
    params_pretty_out = snakemake.output.params_pretty
    mode = snakemake.params.mode

    n_iter = snakemake.config['dadi_n_iter_' + mode]

    generation_time = snakemake.config['generation_time']
    mu = snakemake.config['mu']
    N_e = snakemake.config['N_e'][sample_set]
except NameError:
    # testing
    test_mode = True
    sample_set = "pendula"
    vcf = f"output/default/snps/{sample_set}/synonymous/snps.vcf.gz"
    n_pops = 2
    pops = f"output/default/sample_sets/subpopulations/{sample_set}/{n_pops}_pops.txt"
    scenario_name = 'split_unidirectional_migration_from_two_to_one_since_ice_age'
    params_out = "scratch/dadi.csv"
    params_pretty_out = "scratch/dadi.out"
    # mode = 'local_optimization'
    mode = 'random_hopping'

    n_iter = 1

    generation_time = 20
    mu = 1e-9
    N_e = 675000

n_proj = 20
dd = dadi.Misc.make_data_dict_vcf(vcf, pops)
fs = sfs_utils.get_spectrum_from_dd(dd, n_proj=n_proj, n_pops=n_pops)

params = dict(fs=fs, dd=dd, n_pops=n_pops, n_proj=n_proj, n_iter=n_iter,
              N_e=N_e, mu=mu, generation_time=generation_time, mode=mode)

scenario = demographic_scenarios.create(scenario_name, params)

# perform simulation
scenario.simulate()

# save to file
data = scenario.to_dataframe()
open(params_pretty_out, 'w').write(data.to_markdown())
open(params_out, 'w').write(data.to_csv())

scripts/gather_input_est_sfs.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

try:
    data_files = snakemake.input.data
    out_data = snakemake.output.data
    out_seed = snakemake.output.seed
except NameError:
    # testing
    data_files = [f"output/default/est-sfs/data/{n}.txt" for n in range(1, 10)]
    out_data = "scratch/data.combined.txt"
    out_seed = "scratch/seed.txt"

# create seed file
open(out_seed, 'w').write('0')

# combine contents of all data files
with open(out_data, 'w') as out:
    for file in data_files:
        out.writelines(open(file, 'r').readlines())

scripts/gather_variants.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

vcfs = ' '.join([f"-I {vcf}" for vcf in snakemake.input.vcfs])
java_opts = get_java_opts(snakemake)
tmp_dir = snakemake.resources.tmpdir
out = snakemake.output[0]

shell(f"gatk --java-options '{java_opts}' GatherVcfs {vcfs} -O {out} \
    --TMP_DIR {tmp_dir} --REORDER_INPUT_BY_FIRST_VARIANT")

scripts/generate_genomic_intervals.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import os

import numpy as np
import pandas as pd

import utils

try:
    targets_file = snakemake.input.targets
    gff = snakemake.input.gff
    n_files = snakemake.params.n_files
    out = snakemake.output
except NameError:
    # testing
    targets_file = "resources/reference/targeted_positions.txt"
    gff = "resources/reference/genome.corrected.gff.gz"
    n_files = 1
    out = ["scratch/intervals1.bed"]

pd.options.mode.chained_assignment = None

# the list of targeted genes whose positions are relative to the gene
targets = pd.read_csv(targets_file, sep='\t')

# load the genes from the annotation file
genes = utils.load_gff(gff)
genes = genes[genes.feature == 'gene']

# only keep rows where TanjaKeep is true
targets = targets.loc[targets.TanjaKeep]
targets = targets.reset_index(drop=True)

# initialize array
rows = []

# loop through genes and fetch their positions relative to the contigs
for i, target in targets.iterrows():
    gene = genes[genes.attribute.str.contains('ID=' + target.CHROMOSOME + ';')].iloc[0]

    contig = gene.seqname

    # We use 0-based coordinates for start and 1-based for end positions,
    # as is done in BED files. The GFF file uses 1-based coordinates, so we
    # need to offset start by 1
    start = (gene.start - 1) + target.START

    # The stop position in the BED file is non-inclusive so we
    # just add the length to the start value
    stop = start + target.LENGTH

    rows.append([contig, start, stop])

    if i % 100 == 0: print(f"Processed genes: {i}/{len(targets)}", flush=True)

# randomize order and split arrays into chunks
# we do this to obtain chunks of similar sequence length
np.random.seed(0)
np.random.shuffle(rows)

# partition set of intervals
rows = np.array_split(rows, n_files)

# write contents to files
for i, chunk in enumerate(rows):
    # sort chunk in ascending contig order
    chunk = sorted(chunk, key=lambda x: (len(x[0]), x[0]))

    with open(out[i], 'w') as f:
        f.write(os.linesep.join(['\t'.join(row) for row in chunk]) + os.linesep)

scripts/generate_vcf_sample_set.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

vcf = snakemake.input.vcf
samples = snakemake.input.samples
max_nocall = snakemake.params.max_nocall_fraction
java_opts = get_java_opts(snakemake)
out = snakemake.output[0]
biallelic = snakemake.params.get('biallelic')

flag_biallelic = '--select-type-to-include SNP --restrict-alleles-to BIALLELIC'
optional_flags = flag_biallelic if biallelic else ''

shell(f"gatk --java-options '{java_opts}' SelectVariants -V {vcf} "
      f"-O {out} --sample-name {samples} --max-nocall-fraction {max_nocall} "
      f"--remove-unused-alternates {optional_flags}")

scripts/import_variants.py

__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

intervals = snakemake.input.intervals
variants = ' '.join([f"-V {vcf}" for vcf in snakemake.input.vcfs])
java_opts = get_java_opts(snakemake)
db_dir = snakemake.output[0]
tmp_dir = snakemake.resources.tmpdir
batch_size = snakemake.params.batch_size

shell(f"gatk --java-options '{java_opts}' GenomicsDBImport {variants} \
    --genomicsdb-workspace-path {db_dir} --tmp-dir {tmp_dir} \
    --interval-padding 100 -L {intervals} --batch-size {batch_size}")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

ref = snakemake.input.ref
left = snakemake.input.left
right = snakemake.input.right
name = snakemake.wildcards.name
out = snakemake.output.bam
tmp_dir = snakemake.resources.tmpdir
threads = snakemake.threads

# only map paired end reads which account for most of the data
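# note: the process substitution <(...) assumes bash is used as the shell executable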
shell(f"samtools sort -T {tmp_dir} <(bwa mem {ref} {left} {right} -R '@RG\\tID:{name}\\tSM:{name}' -t {threads}) > {out}")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

in_bam = snakemake.input.bam
java_opts = get_java_opts(snakemake)
out_bam = snakemake.output.bam
out_metric = snakemake.output.metrics

shell(f"gatk MarkDuplicates --java-options '{java_opts}' -I {in_bam} -O {out_bam} -M {out_metric}")
if (exists('snakemake')) {
    input = snakemake@input[[1]]
    output = snakemake@output[[1]]
    postprocessing_source = snakemake@config$polydfe_postprocessing_source
} else {
    # testing
    input = "output/default/polydfe/pendula/output/C.full_anc.out"
    output = "scratch/dfe_parsed_output.txt"
    postprocessing_source = "resources/polydfe/postprocessing/script.R"
}


source(postprocessing_source)

# breakpoints of the 4*N_e*s bins used to discretize the DFE
intervals = c(-100, -10, -1, 0, 1)

est = c(sapply(input, parseOutput))
dfe = t(sapply(est, getDiscretizedDFE, intervals))

# write intervals and discretized DFE values
write(paste(intervals, collapse = " "), file = output, append = TRUE)
write(paste(dfe, collapse = " "), file = output, append = TRUE)

# write gradient
write(est[[1]]$criteria, file = output, append = TRUE)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import numpy as np

import utils

try:
    test_mode = False
    sample_set = snakemake.params.sample_set
    subsample_sets = snakemake.params.subsample_sets
    sample_class = snakemake.params['sample_class']
    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    sample_set = 'pendula'
    subsample_sets = ['pendula_south_admixture', 'pendula_north_admixture']
    sample_class = 'default'
    out = 'scratch/admixture_locations.png'

# fetch all samples
samples = utils.get_samples(sample_set=sample_set, sample_class=sample_class)

# fetch subsamples and choose colors according to subsample set
# only two subsample sets are currently supported
subsamples = utils.get_samples(subsample_sets[0], sample_class=sample_class)
mask = samples.name.isin(subsamples.name)
colors = [int(m) for m in mask]

# perturb the sample locations to make individual samples visible
np.random.seed(seed=0)
samples.latitude += np.random.normal(size=samples.shape[0], scale=0.3)
samples.longitude += np.random.normal(size=samples.shape[0], scale=0.3)

# plot samples
scatter = plt.scatter(samples.longitude, samples.latitude, c=colors, s=3.5)
plt.gca().axis('square')
plt.xlabel('longitude')
plt.ylabel('latitude')

# add legend
labels = [utils.get_proper_name(s) for s in subsample_sets]
plt.legend(handles=scatter.legend_elements()[0], labels=labels)

utils.scale_cf(2 / 3)

utils.save_fig(out, show=test_mode, tight_layout=True)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import re

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import plot_utils

try:
    test_mode = False
    log_files = snakemake.input
    K = snakemake.params.K
    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    log_files = [f"output/default/admixture/pendula/biallelic/snps.admixture.{n}.log" for n in range(1, 5)]
    K = range(1, 5)
    out = 'scratch/cv_error_admixture.svg'

cv_errors = {}
for k, file in zip(K, log_files):
    # get file contents
    contents = open(file, 'r').read()

    # match the CV error; K may have more than one digit
    match = re.search(r"CV error \(K=\d+\): (\d+\.\d+)", contents)

    cv_errors[k] = float(match.group(1))

# plot cv errors
plt.plot(cv_errors.keys(), cv_errors.values(), marker='o')
plt.xlabel('K')
plt.ylabel('CV error')

# mark the K with the lowest cv error
best_K = min(cv_errors, key=cv_errors.get)
plt.axvline(best_K, color="orange")

# only show integers on x-axis
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

# increase scaling
plot_utils.scale_cf(2 / 3)

# save plot
plot_utils.save_fig(out, show=test_mode, tight_layout=True)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import feems_utils
import utils

try:
    test_mode = False
    files = snakemake.input
    lambdas = snakemake.params.lambdas
    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    lambdas = [0.01, 1]
    files = [f"output/default/feems/pendula/biallelic/cv_error_{n}.txt" for n in lambdas]

    out = 'scratch/cv_error_admixture.svg'

cv_errors = []
for file in files:
    # get file contents
    cv_errors.append(float(open(file, 'r').read()))

feems_utils.plot_cv_errors(lambdas, cv_errors)

# save plot
utils.save_fig(out, show=test_mode, tight_layout=True)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt

import numpy as np
import utils
import warnings

try:
    sites_all = snakemake.input.sites_all
    vcf_synonymous = snakemake.input.synonymous
    vcf_nonsynonymous = snakemake.input.nonsynonymous
    out = snakemake.output[0]
except NameError:
    # testing
    sites_all = "output/default/snps/pendula/snps.vcf.gz"
    vcf_synonymous = "output/default/snps/pendula_synonymous/snps.vcf.gz"
    vcf_nonsynonymous = "output/default/snps/pendula_nonsynonymous/snps.vcf.gz"
    out = 'scratch/degeneracy.svg'

# the degeneracy values
ns = [0, 2, 4]
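# an n-fold degenerate site is one where n of the four possible bases encode the
# same amino acid (0-fold: every change is non-synonymous; 4-fold: every change
# is synonymous)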

labels_degeneracy = ['non-coding'] + [f"{n}-fold" for n in ns]
labels_synonymy = ['_', 'non-synonymous', '_', 'non-synonymous', 'synonymous', '_', 'synonymous', '_']

n_lines_total = utils.count_lines_vcf(sites_all)
degeneracy_all = {n: utils.get_n_degenerate(sites_all, n) for n in ns}
n_non_coding = n_lines_total - sum(degeneracy_all.values())

# the degeneracy classes with respect to synonymy
synonymous = {n: utils.get_n_degenerate(vcf_synonymous, n) for n in [2, 4]}
nonsynonymous = {n: utils.get_n_degenerate(vcf_nonsynonymous, n) for n in [0, 2]}

# the degeneracy classes with synonymy subclasses
degeneracy_synonymy = {
    0: [nonsynonymous[0], degeneracy_all[0] - nonsynonymous[0]],
    2: [nonsynonymous[2], synonymous[2], degeneracy_all[2] - nonsynonymous[2] - synonymous[2]],
    4: [synonymous[4], degeneracy_all[4] - synonymous[4]]
}

# all values in a nested array
degeneracy_synonymy = [[n_non_coding]] + list(degeneracy_synonymy.values())
degeneracy = [sum(s) for s in degeneracy_synonymy]

# prepare colors of disks
cmap = plt.get_cmap("tab20c")
outer_colors = cmap(np.arange(4) * 4)
inner_colors = cmap([1, 5, 6, 9, 10, 11, 13, 14])

fig, ax = plt.subplots()


# get wedge labels of outer disk
def autopict(percentage):
    n_sites = int(np.round(percentage * n_lines_total / 100))

    return str(n_sites) + '\n' + str(np.round(percentage, 1)) + '%'


widths = [0.5, 0.1]
# plot outer disk showing the degeneracy classes
ax.pie(degeneracy, labels=labels_degeneracy, autopct=autopict, startangle=90,
       wedgeprops=dict(edgecolor='w', linewidth=1, antialiased=True, width=widths[0]),
       pctdistance=0.75, colors=outer_colors, radius=1)

# plot inner disk showing the synonymy classes
wedges_synonymy, _ = ax.pie(utils.flatten(degeneracy_synonymy), startangle=90,
                            wedgeprops=dict(edgecolor='w', linewidth=1, antialiased=True, width=widths[1]),
                            colors=inner_colors, radius=1 - widths[0])

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    plt.legend(wedges_synonymy, labels_synonymy, loc="upper left")

ax.axis('equal')
utils.save_fig(out, tight_layout=True)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cycler

import plot_utils
import polydfe_utils
import utils

plt.rcParams['axes.prop_cycle'] = cycler('color', plt.get_cmap('Set2').colors)

try:
    test_mode = False
    input = snakemake.input
    bs_type = snakemake.params.bs_type
    label_type = snakemake.params.label_type
    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    input = {}
    for key in ['pendula_south', 'pendula_north', 'pubescens_south', 'pubescens_north']:
        files = [f"output/default/polydfe/{key}/output/bootstraps/dfe_C.full_anc.{str(n).zfill(3)}.txt"
                 for n in range(1, 101)]
        input[key] = files

    bs_type = 'percentile'
    label_type = 'sample_set'
    out = "scratch/dfe.svg"

width_total = 0.9
width = width_total / len(input.keys())

for i, (key, files) in enumerate(input.items()):
    # parse bootstrapped discretized DFE values
    dfe = []
    for bs in files:
        dfe.append(polydfe_utils.parse_output(bs)['discretized'])

    n = len(dfe[0])

    # determine mean and 95% CIs
    a = 0.05
    values = np.mean(dfe, axis=0)
    dfe = np.array(dfe)

    if bs_type == 'percentile':
        ci = [utils.get_ci_percentile_bootstrap(dfe[:, i], a) for i in range(n)]
    elif bs_type == 'bca':
        ci = [utils.get_ci_bca(dfe[:, i], values[i], a) for i in range(n)]
    else:
        raise ValueError(f"Unknown bootstrap CI type: {bs_type}")

    yerr = np.array([[values[i] - ci[i][0], ci[i][1] - values[i]] for i in range(n)]).T
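    # yerr has shape (2, n): distances below and above the mean, the
    # asymmetric error-bar format matplotlib expects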

    indices = np.array(range(n)) + i * width

    if label_type == 'sample_set':
        label = utils.get_proper_name(key)
    elif label_type == 'polydfe_type':
        label = polydfe_utils.get_label(key)
    else:
        raise ValueError(f"Unknown label type: {label_type}")

    # plot bar
    plt.bar(indices, values, width=width, label=label, yerr=yerr, error_kw=dict(capsize=3))

# adjust x-axis
plt.xlabel('$4N_e s$')
plt.xticks([r + (width_total - width) / 2 for r in range(n)], polydfe_utils.get_interval_names(files[0]))

plt.autoscale(tight=True)

eps = 0.01
plt.ylim(min(values - yerr[0]), max(values + yerr[1]) + eps)

plt.legend()

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

import polydfe_utils

try:
    bootstrapped = snakemake.input.bootstrapped
    out = snakemake.output[0]
except NameError:
    # testing
    bootstrapped = [f"output/default/polydfe/pendula/output/bootstraps/dfe_C.full_anc.{str(n).zfill(3)}.txt"
                    for n in range(1, 101)]
    out = "scratch/dfe.svg"

data = polydfe_utils.parse_output_files(bootstrapped)
data = data.sort_values(by=['gradient'])

names = polydfe_utils.get_interval_names(bootstrapped[0])
x = data.gradient.values

# prepare axes and colors
axs = plt.subplots(2, 3)[1].flatten()
colors = plt.get_cmap('Set2').colors

# iterate over intervals
for i, name in enumerate(names):
    y = data[i].values

    # plot points
    axs[i].scatter(x, y, c=[colors[i]] * len(x), s=10, alpha=0.5)

    # plot the regression line of y on log(delta)
    m, b = np.polyfit(np.log(x), y, 1)
    axs[i].plot(x, m * np.log(x) + b, c='black')

    # plot mean per bin
    bins = stats.binned_statistic(np.log(x), y, 'mean', bins=10)
    x_bins = np.exp(bins.bin_edges[1:] + (bins.bin_edges[:-1] - bins.bin_edges[1:]) / 2)
    y_bins = bins.statistic
    axs[i].plot(x_bins, y_bins, c='red', alpha=0.5)

    axs[i].set_xlabel(r"log($\delta$)")
    axs[i].set_ylabel(name)
    axs[i].set_xscale('log')

plt.tight_layout(pad=1)
plt.savefig(out)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import numpy as np

import polydfe_utils
import utils

try:
    input = snakemake.input
    out = snakemake.output[0]
except NameError:
    # testing
    input = [f"output/default/polydfe/pendula_south/output/bootstraps/dfe_C.full_anc.{str(n).zfill(3)}.txt"
             for n in range(1, 101)]
    out = "scratch/param_dist.svg"

# number of intervals
n = len(polydfe_utils.parse_output(input[0])['discretized'])

fig, axes = plt.subplots(n)

# parse bootstrapped discretized DFE values
values = []
for bs in input:
    values.append(polydfe_utils.parse_output(bs)['discretized'])

# determine mean and 95% CIs
a = 0.05
mean = np.mean(values, axis=0)
values = np.array(values)

for i, ax in enumerate(axes):
    ax.hist(values[:, i], bins=50)

    ci_bca = utils.get_ci_bca(values[:, i], mean[i], a)
    ci_percentile = utils.get_ci_percentile_bootstrap(values[:, i], a)

    for x in ci_bca:
        ax.axvline(x=x, c='red', linewidth=1)

    for x in ci_percentile:
        ax.axvline(x=x, c='black', linewidth=1)

plt.tight_layout(pad=0)
fig.savefig(out)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import numpy as np
import polydfe_utils
import plot_utils
from matplotlib import cycler

plt.rcParams['axes.prop_cycle'] = cycler('color', plt.get_cmap('Set2').colors)

try:
    test_mode = False
    files = snakemake.input
    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    files = {'full_anc': "output/default/polydfe/pendula/output/dfe_C.full_anc.txt",
             'full': "output/default/polydfe/pendula/output/dfe_C.full.txt",
             'deleterious': "output/default/polydfe/pendula/output/dfe_C.deleterious.txt"}
    out = "scratch/dfe.svg"

width_total = 0.9
width = width_total / len(files)

# iterate over files and add a bar series for each one
for i, (name, file) in enumerate(files.items()):
    values = polydfe_utils.parse_output(file)['discretized']
    indices = np.arange(len(values)) + i * width

    label = polydfe_utils.get_label(name)

    plt.bar(indices, values, width=width, label=label)

# adjust x-axis
plt.xlabel('$4N_e s$')
plt.xticks([r + (width_total - width) / 2 for r in range(len(values))], polydfe_utils.get_interval_names(file))

plt.autoscale(tight=True)
plt.legend()

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import plot_utils

try:
    test_mode = False
    input = snakemake.input[0]
    out = snakemake.output[0]
    log_scale = snakemake.params.log_scale
except NameError:
    # testing
    test_mode = True
    input = 'output/default/stats/pendula/biallelic/fst.weir.fst'
    out = 'scratch/fst.svg'
    log_scale = True

plot_utils.plot_hist_basic_stat(input, colname='WEIR_AND_COCKERHAM_FST', xlabel="$F_{st}$",
                                ylabel="n SNPs", color=plot_utils.get_color('tab20c', 1),
                                log_scale=log_scale)

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import plot_utils

try:
    test_mode = False
    input = snakemake.input[0]
    out = snakemake.output[0]
    log_scale = snakemake.params.log_scale
except NameError:
    # testing
    test_mode = True
    input = 'output/default/stats/pendula/biallelic/fst.windowed.weir.fst'
    out = 'scratch/fst_windowed.svg'
    log_scale = True

plot_utils.plot_hist_basic_stat(input, colname='WEIGHTED_FST', xlabel="$F_{st}$", ylabel="n windows",
                                color="turquoise", log_scale=log_scale, weights='N_VARIANTS')

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import plot_utils

try:
    test_mode = False
    input = snakemake.input[0]
    out = snakemake.output[0]
    log_scale = snakemake.params.log_scale
except NameError:
    # testing
    test_mode = True
    input = 'output/default/stats/pendula/all/hwe.hwe'
    out = 'scratch/hwe.svg'
    log_scale = True

opts = dict(xlabel="heterozygosity", delim_whitespace=True,
            ylabel="n SNPs", log_scale=log_scale)

opts_observed = dict(colname='O(HET)', color=plot_utils.get_color('tab20b', 18),
                     bins=50, opts=dict(alpha=0.3, label='observed'))

plot_utils.plot_hist_basic_stat(input, **(opts_observed | opts))

opts_expected = dict(colname='E(HET)', color='cornflowerblue',
                     bins=25, opts=dict(alpha=0.3, label='expected'))

plot_utils.plot_hist_basic_stat(input, **(opts_expected | opts))

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import plot_utils

try:
    test_mode = False
    input = snakemake.input[0]
    out = snakemake.output[0]
    observed = snakemake.params.observed
    log_scale = snakemake.params.log_scale
except NameError:
    # testing
    test_mode = True
    input = 'output/default/stats/pendula/biallelic/hwe.hwe'
    out = 'scratch/hwe.svg'
    observed = False
    log_scale = True

colname = 'O(HET)' if observed else 'E(HET)'

plot_utils.plot_hist_basic_stat(input, colname=colname, xlabel="heterozygosity", delim_whitespace=True,
                                ylabel="n SNPs", color=plot_utils.get_color('tab20b', 18), log_scale=log_scale)

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import plot_utils

try:
    test_mode = False
    input = snakemake.input[0]
    out = snakemake.output[0]
    log_scale = snakemake.params.log_scale
except NameError:
    # testing
    test_mode = True
    input = 'output/default/stats/pendula/biallelic/hwe.hwe'
    out = 'scratch/hwe.svg'
    log_scale = False

# filtering dependent on allele frequency to check for bias
'''
def filter_ac(data):
    data['AC'] = data.GENO.str.split('/').apply(lambda x: float(x[1])) / \
                 data.GENO.str.split('/').apply(lambda x: float(x[2]))

    return data[~data.AC.between(0.001, 0.999)]
'''

plot_utils.plot_hist_basic_stat(input, colname='P', xlabel="p-value", delim_whitespace=True,
                                ylabel="n SNPs", color=plot_utils.get_color('Paired', 0),
                                log_scale=log_scale)

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import plot_utils

try:
    test_mode = False
    input = snakemake.input[0]
    out = snakemake.output[0]
    log_scale = snakemake.params.log_scale
except NameError:
    # testing
    test_mode = True
    input = 'output/default/stats/pendula/biallelic/missingness.imiss'
    out = 'scratch/missingness.svg'
    log_scale = True

plot_utils.plot_hist_basic_stat(input, colname='F_MISS', xlabel="F_MISS", delim_whitespace=True,
                                ylabel="n individuals", color=plot_utils.get_color('Pastel1', 6),
                                log_scale=log_scale)

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import plot_utils

try:
    test_mode = False
    input = snakemake.input[0]
    out = snakemake.output[0]
    log_scale = snakemake.params.log_scale
except NameError:
    # testing
    test_mode = True
    input = 'output/default/stats/pendula/biallelic/missingness.lmiss'
    out = 'scratch/missingness.svg'
    log_scale = True

plot_utils.plot_hist_basic_stat(input, colname='F_MISS', xlabel="F_MISS", delim_whitespace=True,
                                ylabel="n SNPs", color=plot_utils.get_color('Pastel1', 6),
                                log_scale=log_scale)

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import pandas as pd

import plot_utils
import utils

try:
    test_mode = False
    results = snakemake.input
    df = snakemake.params.get('df', 1)
    comparisons = snakemake.params['comparisons']
    labels = snakemake.params.get('labels', list(results.keys()))
    scaling = snakemake.params.get('scaling', 1)
    transpose = snakemake.params.get('transpose', False)
    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    scenarios = [
        "split_no_migration",
        "split_unidirectional_migration_from_two_to_one",
        "split_unidirectional_migration_from_one_to_two",
        "split_symmetric_migration",
        "split_asymmetric_migration",
        "split_asymmetric_migration_one_pop_size_change",
        "split_no_migration_since_ice_age",
        "split_unidirectional_migration_from_two_to_one_since_ice_age",
        "split_unidirectional_migration_from_one_to_two_since_ice_age",
        "split_symmetric_migration_since_ice_age",
        "split_asymmetric_migration_since_ice_age",
        "split_asymmetric_migration_one_pop_size_change_since_ice_age"
    ]
    sample_set = "pendula_pubescens"
    results = {scenario: f"output/tetraploid/dadi/{scenario}/{sample_set}/synonymous/dadi.csv" for scenario in scenarios}
    # each element is a pair of models to compare
    # where the first element is the more simple model
    comparisons = [
        ['split_no_migration_since_ice_age', 'split_no_migration', 1],
        ['split_unidirectional_migration_from_two_to_one_since_ice_age', 'split_unidirectional_migration_from_two_to_one', 1],
        ['split_unidirectional_migration_from_one_to_two_since_ice_age', 'split_unidirectional_migration_from_one_to_two', 1],
        ['split_symmetric_migration_since_ice_age', 'split_symmetric_migration', 1],
        ['split_asymmetric_migration_since_ice_age', 'split_asymmetric_migration', 1],
        ['split_asymmetric_migration_one_pop_size_change_since_ice_age', 'split_asymmetric_migration_one_pop_size_change', 1]
    ]
    labels = [key.replace('_', ' ') for key in results.keys()]
    scaling = 1
    df = 1
    transpose = False
    out = 'scratch/dadi_nested.svg'


def parse_lnl(file):
    # take the last row if the file contains bootstrap CI estimates,
    # and the first row otherwise
    n_row = -1 if '_ci' in file else 0

    return float(pd.read_csv(file, index_col=None, header=0).iloc[n_row]['lnl'])


simple_models, complex_models, p_values = [], [], []
for comp in comparisons:
    simple_models.append(comp[0])
    complex_models.append(comp[1])
    df = comp[2] if len(comp) == 3 else 1

    lnl_simple = parse_lnl(results[comp[0]])
    lnl_complex = parse_lnl(results[comp[1]])
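    # likelihood-ratio test of the nested model pair: 2 * (lnl_complex - lnl_simple)
    # is compared to a chi-squared distribution with df degrees of freedom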
    p = utils.lrt(lnl_simple, lnl_complex, df)

    p_values.append(p)

types = list(results.keys())
plot_utils.plot_lrt_p_values(simple_models, complex_models, p_values, types, labels, transpose=transpose)

plt.tight_layout()
plot_utils.scale_cf(scaling)
plot_utils.save_fig(out, show=test_mode, tight_layout=True)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.cm as cm
import matplotlib.pyplot as plt

import pca_utils
import utils

try:
    test_mode = False
    sample_set = snakemake.params['sample_set']
    sample_class = snakemake.params['sample_class']
    flag = snakemake.params['flag']
    out = snakemake.output[0]
    add_names = snakemake.params.get('add_names')
    subsample_sets = snakemake.params.get('subsample_sets', [])
    demarcation_type = snakemake.params.get('demarcation_type', 'ellipse')
    scaling = snakemake.params.get('scaling', 1)
    marker_size = snakemake.params.get('marker_size', 20)
    tight_layout = snakemake.params.get('tight_layout', False)
except NameError:
    # testing
    test_mode = True
    sample_set = 'pendula'
    sample_class = 'default'
    flag = 'no_missing'
    out = "scratch/all_sites_no_missing.png"
    add_names = False
    subsample_sets = ['pendula_north', 'pendula_south']
    demarcation_type = 'gradient'
    scaling = 1
    marker_size = 20
    tight_layout = False

pca, pc, samples = pca_utils.get(sample_set, flag, sample_class)

# determine color of dots
if demarcation_type == 'color':
    # we only support two colors here
    subsamples = utils.get_samples(subsample_sets[0], sample_class=sample_class)
    mask = samples.name.isin(subsamples.name)
    colors = [int(m) for m in mask]
else:
    colors = samples.latitude

# plot the first two principal components
scatter = plt.scatter(x=pc[:, 0], y=pc[:, 1], c=colors, s=marker_size)

# draw ellipses around subsample sets if specified
if demarcation_type == 'ellipse':
    for i, subsample_set in enumerate(subsample_sets):
        pca_utils.encircle_subset(sample_set, subsample_set, sample_class, pc, i)

# add sample names if specified
if add_names:
    eps = 1
    min_lat = samples.latitude.min()
    max_lat = samples.latitude.max()

    for i in range(len(samples)):
        c = cm.viridis((samples.latitude[i] - min_lat) / (max_lat - min_lat))

        plt.text(x=pc[i, 0] + eps, y=pc[i, 1] + eps, s=samples.name[i], size=6, c=c)

plt.gca().axis('square')

# add axis labels
v1 = round(pca.explained_variance_ratio_[0] * 100, 2)
v2 = round(pca.explained_variance_ratio_[1] * 100, 2)
plt.xlabel(f'{v1}%')
plt.ylabel(f'{v2}%')

# show a legend when samples are colored by subsample set,
# otherwise show a latitude colorbar
if demarcation_type == 'color':
    labels = [utils.get_proper_name(s) for s in subsample_sets]
    plt.legend(handles=scatter.legend_elements()[0], labels=labels)
else:
    plt.colorbar().set_label('latitude')

    if len(subsample_sets):
        plt.legend()

# increase scaling
utils.scale_cf(scaling)

utils.save_fig(out, tight_layout=tight_layout, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import plot_utils

try:
    test_mode = False
    input = snakemake.input[0]
    out = snakemake.output[0]
    log_scale = snakemake.params.log_scale
except NameError:
    # testing
    test_mode = True
    input = 'output/default/stats/pendula/biallelic/pi.sites.pi'
    out = 'scratch/pi.svg'
    log_scale = True

plot_utils.plot_hist_basic_stat(input, colname='PI', xlabel="$\pi$", ylabel="n SNPs",
                                color=plot_utils.get_color('tab20b', 14), log_scale=log_scale)

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import cycler

import plot_utils
import polydfe_utils

plt.rcParams['axes.prop_cycle'] = cycler('color', plt.get_cmap('Set2').colors)

try:
    testing = False
    input = snakemake.input[0]
    out = snakemake.output[0]
except NameError:
    # testing
    testing = True
    input = "output/tetraploid/polydfe/pendula/type_comparison.C.txt"
    out = "scratch/probs_nested.svg"

# read data containing p-values
# columns: 'type1': complex model name, 'type2': simple model name, 'p.value': LRT p-value
data = pd.read_csv(input, sep=' ')

types = ['full', 'deleterious', 'full_anc', 'deleterious_anc']
labels = [polydfe_utils.get_short_label(t) for t in types]

plot_utils.plot_lrt_p_values(data.type1, data.type2, data['p.value'], types, labels)

plt.tight_layout()
plot_utils.scale_cf(0.75)
plot_utils.save_fig(out, show=testing, tight_layout=True)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt

import feems_utils

try:
    sample_set = snakemake.params['sample_set']
    sample_class = snakemake.params['sample_class']
    flag = snakemake.params['flag']
    out = snakemake.output[0]
except NameError:
    # testing
    sample_set = 'pendula'
    sample_class = 'default'
    flag = 'biallelic'
    out = 'scratch/feems_locations.svg'

sp_graph, samples, coords = feems_utils.get_sp_graph(sample_set, flag, sample_class=sample_class)

projection = feems_utils.get_projection(coords)

# draw figure showing which nodes the samples snapped to
v = feems_utils.create_plot(sp_graph, projection)
v.draw_samples()
v.draw_obs_nodes(use_ids=False)

plt.savefig(out, bbox_inches='tight', pad_inches=0)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import cartopy.feature as cfeature
import matplotlib.pyplot as plt
from feems import viz
from pyproj import Proj

import feems_utils
import utils

try:
    test_mode = False
    sample_set = snakemake.params['sample_set']
    sample_class = snakemake.params['sample_class']

    latitudinal_barriers = snakemake.config['latitudinal_barriers']

    # get latitudinal barrier of given sample set from the config
    if sample_set in latitudinal_barriers:
        latitudinal_barrier = latitudinal_barriers[sample_set]
    else:
        latitudinal_barrier = None

    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    sample_set = 'pendula'
    sample_class = 'default'
    latitudinal_barrier = 64
    out = 'scratch/feems_locations.svg'

# fetch coordinates
coords = utils.get_samples(sample_set, sample_class)[['longitude', 'latitude']].values

# initialize figure and axis
fig = plt.figure()
projection = feems_utils.get_projection(coords)
ax = fig.add_subplot(1, 1, 1, projection=projection)
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0)
ax.axis("off")

# add cartopy features
ax.add_feature(cfeature.LAND, facecolor="#f7f7f7", zorder=0)
ax.coastlines("50m", color="#636363", linewidth=0.5, zorder=0)

# project coordinates
project = Proj(projection.proj4_init)
coords_projected = viz.project_coords(coords, project)

# draw coordinates
ax.scatter(
    coords_projected[:, 0],
    coords_projected[:, 1],
    edgecolors="black",
    linewidth=0.5,
    s=150,
    color="#30D5C8",
    marker=".",
    zorder=2
)

# draw line indicating subpopulation boundary
if latitudinal_barrier:
    p1 = project(min(coords[:, 0]), latitudinal_barrier)
    p2 = project(max(coords[:, 0]), latitudinal_barrier)
    ax.plot([p1[0], p2[0]], [p1[1], p2[1]], linestyle='dotted', c='grey', alpha=0.5, linewidth=3)

if test_mode:
    fig.show()

# save plot
plt.savefig(out, bbox_inches='tight', pad_inches=0)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import sfs_utils
import plot_utils

try:
    test_mode = False
    files = snakemake.input.vcf
    pops = snakemake.input.pops
    n_proj = snakemake.params.n_proj
    unfolded = snakemake.params.unfolded
    log_scale = snakemake.params.log_scale
    labels = snakemake.params.get('labels', [])
    tight_layout = snakemake.params.get('tight_layout', False)
    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    files = ["output/tetraploid/snps/pendula/biallelic/snps.vcf.gz"]
    pops = "output/tetraploid/sample_sets/subpopulations/pendula/1_pops.txt"
    n_proj = 20
    unfolded = True
    log_scale = False
    labels = ['biallelic']
    tight_layout = False
    out = "scratch/pendula.pendula_one_pop.svg"

sfs_utils.plot_sfs_from_file(files, pops, n_proj, unfolded, log_scale, labels)

plot_utils.save_fig(out, tight_layout=tight_layout, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import sfs_utils
import plot_utils

try:
    test_mode = False
    vcf = snakemake.input.vcf
    pops = snakemake.input.pops
    n_proj = snakemake.params.n_proj
    sample_set = snakemake.params.get('sample_set', None)
    out = snakemake.output[0]
except NameError:
    # testing
    test_mode = True
    sample_set = 'pendula'
    vcf = f"output/default/snps/{sample_set}/4fold/snps.vcf.gz"
    pops = f"output/default/sample_sets/subpopulations/{sample_set}/2_pops.txt"
    n_proj = 20
    out = "scratch/sfs_{sample_set}.svg"

sfs_utils.plot_sfs_from_file([vcf], pops, n_proj, unfolded=True, sample_set=sample_set)

plot_utils.scale_cf(0.65)
plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import numpy as np
import pandas as pd

import demographic_scenarios
import plot_utils

try:
    test_mode = False
    vcf = snakemake.input.vcf
    pops = snakemake.input.pops
    result = snakemake.input.result
    scenario_name = snakemake.params.scenario
    sample_set = snakemake.params.sample_set
    n_pops = snakemake.params.n_pops
    out_sfs = snakemake.output.sfs
    out_residuals = snakemake.output.residuals
    scaling = [1, 0.5]

    generation_time = snakemake.config['generation_time']
    mu = snakemake.config['mu']
    N_e = snakemake.config['N_e'][sample_set]
except NameError:
    # testing
    test_mode = True
    n_pops = 1
    vcf = "output/tetraploid/snps/pendula/synonymous/snps.vcf.gz"
    pops = f"output/tetraploid/sample_sets/subpopulations/pendula/{n_pops}_pops.txt"
    scenario_name = 'one_pop_size_change_since_ice_age'
    result = f"output/tetraploid/dadi/{scenario_name}/pendula/synonymous/dadi.csv"
    out_sfs = "scratch/sfs.png"
    out_residuals = "scratch/residuals.png"
    scaling = np.array([1, 1.15]) * 0.6

    generation_time = 20
    mu = 1e-9
    N_e = 945883

# load params from file
params = pd.read_csv(result, index_col=None, header=0).iloc[0]

# determine initial value
param_names = demographic_scenarios.get_class(scenario_name).params
p0 = [params[name] for name in param_names]

scenario = demographic_scenarios.restore(scenario_name, p0, vcf, pops,
                                         generation_time, n_pops, N_e, mu)

# generate SFS plot
scenario.plot_sfs(scaling_1d=scaling)
plot_utils.save_fig(out_sfs, tight_layout=True, show=test_mode, clear=True)

# generate plot of residuals
scenario.plot_residuals_1d()
plot_utils.save_fig(out_residuals, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import pandas as pd

import demographic_scenarios
import plot_utils

try:
    test_mode = False
    vcf = snakemake.input.vcf
    pops = snakemake.input.pops
    result = snakemake.input.result
    scenario_name = snakemake.params.scenario
    sample_set = snakemake.params.sample_set
    n_pops = snakemake.params.n_pops

    out_data = snakemake.output.data
    out_model = snakemake.output.model
    out_residuals = snakemake.output.residuals

    generation_time = snakemake.config['generation_time']
    mu = snakemake.config['mu']
    N_e = snakemake.config['N_e'][sample_set]
except NameError:
    # testing
    test_mode = True
    sample_set = "pendula"
    n_pops = 2
    vcf = "output/tetraploid/snps/pendula/synonymous/snps.vcf.gz"
    pops = f"output/tetraploid/sample_sets/subpopulations/pendula/{n_pops}_pops.txt"
    scenario_name = 'split_asymmetric_migration'
    result = f"output/tetraploid/dadi_old/{scenario_name}/pendula/synonymous/dadi.csv"

    out_data = "scratch/data.svg"
    out_model = "scratch/model.svg"
    out_residuals = "scratch/residuals.svg"

    generation_time = 20
    mu = 1e-9
    N_e = 675000

# load params from file
params = pd.read_csv(result, index_col=None, header=0).iloc[0]

# determine initial value
param_names = demographic_scenarios.get_class(scenario_name).params
p0 = [params[name] for name in param_names]

scenario = demographic_scenarios.restore(scenario_name, p0, vcf, pops,
                                         generation_time, n_pops, N_e, mu,
                                         sample_set=sample_set)

scenario.plot_data_2d()
plot_utils.save_fig(out_data, tight_layout=True, show=test_mode)

scenario.plot_model_2d()
plot_utils.save_fig(out_model, tight_layout=True, show=test_mode)

scenario.plot_residuals_2d()
plot_utils.save_fig(out_residuals, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import numpy as np
from matplotlib import pyplot as plt
import sfs_utils
import dadi

try:
    input = snakemake.input[0]
    n_proj = snakemake.params.n_proj
    unfolded = snakemake.params.unfolded
    log_scale = snakemake.params.log_scale
    out = snakemake.output[0]
except NameError:
    # testing
    input = "output_old/est-sfs/sfs/1.txt"
    n_proj = 100
    unfolded = False
    log_scale = True
    out = "scratch/pendula.pendula_one_pop.svg"

fs = dadi.Spectrum([int(np.round(float(n))) for n in open(input, 'r').readline().split(',')])

if not unfolded:
    fs = fs.fold()
    n_proj *= 2
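    # projecting the folded spectrum onto 2 * n_proj chromosomes keeps roughly
    # n_proj informative entries, matching the resolution of the unfolded case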

fs = fs.project([n_proj])

sfs_utils.plot_sfs([fs], log_scale=log_scale)
plt.savefig(out)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import numpy as np
import utils

try:
    vcf_synonymous = snakemake.input.synonymous
    vcf_nonsynonymous = snakemake.input.nonsynonymous
    out = snakemake.output[0]
except NameError:
    # testing: reuse the same annotated test file for both synonymy classes
    vcf_synonymous = 'testing/snps1.ancestral.vep.annotated.vcf.gz'
    vcf_nonsynonymous = 'testing/snps1.ancestral.vep.annotated.vcf.gz'
    out = 'scratch/synonymy.svg'

# the labels
labels_degeneracy = [f"{n}-fold" for n in [2, 4] + [0, 2]]
labels_synonymy = ['synonymous', 'non-synonymous']

# the degeneracy values for each synonymy class
synonymous = [utils.get_n_degenerate(vcf_synonymous, n) for n in [2, 4]]
nonsynonymous = [utils.get_n_degenerate(vcf_nonsynonymous, n) for n in [0, 2]]

# the nested array containing the values
values = [synonymous, nonsynonymous]

# prepare colors for inner and outer disk
cmap = plt.get_cmap("tab20c")
outer_colors = cmap(np.arange(2) * 5)
inner_colors = cmap([1, 2, 6, 7])

fig, ax = plt.subplots()


# get wedge labels of outer disk
def autopict(percentage):
    n_sites = int(np.round(percentage * sum(utils.flatten(values)) / 100))

    return str(n_sites) + '\n' + str(np.round(percentage, 1)) + '%'


# plot outer disk showing the synonymy classes
ax.pie([sum(s) for s in values], startangle=90, labels=labels_synonymy, autopct=autopict,
       wedgeprops=dict(edgecolor='w', linewidth=1, antialiased=True, width=0.4),
       colors=outer_colors, radius=1, pctdistance=0.8)

# plot inner disk showing the degeneracy classes
ax.pie(utils.flatten(values), startangle=90, labels=labels_degeneracy,
       wedgeprops=dict(edgecolor='w', linewidth=1, antialiased=True, width=0.2),
       colors=inner_colors, radius=0.6, labeldistance=0.25)

ax.axis('equal')
utils.save_fig(out, tight_layout=True)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import plot_utils

try:
    test_mode = False
    input = snakemake.input[0]
    out = snakemake.output[0]
    log_scale = snakemake.params.log_scale
except NameError:
    # testing
    test_mode = True
    input = 'output/default/stats/pendula/all/D.Tajima.D'
    out = 'scratch/tajimas_d.svg'
    log_scale = False

plot_utils.plot_hist_basic_stat(input, colname='TajimaD', xlabel="Tajima's D", weights='N_SNPS',
                                ylabel="n windows", color=plot_utils.get_color('tab20b', 10),
                                log_scale=log_scale)

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import numpy as np
import pandas as pd

import demographic_scenarios
import plot_utils

try:
    test_mode = False
    vcf = snakemake.input.vcf
    pops = snakemake.input.pops
    result = snakemake.input.result
    scenario_name = snakemake.params.scenario
    sample_set = snakemake.params.sample_set
    n_pops = snakemake.params.n_pops
    out = snakemake.output[0]

    plot_opts = {}
    generation_time = snakemake.config['generation_time']
    mu = snakemake.config['mu']
    N_e = snakemake.config['N_e'][sample_set]
except NameError:
    # testing
    test_mode = True
    sample_set = 'pendula'
    vcf = f"output/tetraploid/snps/{sample_set}/synonymous/snps.vcf.gz"
    pops = f"output/tetraploid/sample_sets/subpopulations/{sample_set}/1_pops.txt"
    scenario_name = 'one_pop_size_change'
    result = f"output/tetraploid/dadi/{scenario_name}/{sample_set}/synonymous/dadi.csv"
    n_pops = 1
    out = "scratch/trajectory.png"

    plot_opts = {'scaling': np.array([1, 1 / 2]) * 0.8}
    generation_time = 20
    mu = 1e-9
    N_e = 675000

# load params from file
params = pd.read_csv(result, index_col=None, header=0).iloc[0]

# determine initial value
param_names = demographic_scenarios.get_class(scenario_name).params
p0 = [params[name] for name in param_names]

scenario = demographic_scenarios.restore(scenario_name, p0, vcf, pops,
                                         generation_time, n_pops, N_e, mu)

# plot trajectory
scenario.plot_trajectory(opts=plot_opts)

plot_utils.save_fig(out, tight_layout=True, show=test_mode)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import matplotlib.pyplot as plt
import umap

import pca_utils
import utils

try:
    sample_set = snakemake.params['sample_set']
    sample_class = snakemake.params['sample_class']
    flag = snakemake.params['flag']
    out = snakemake.output[0]
    subsample_sets = snakemake.params.get('subsample_sets', [])
    min_dist = snakemake.params.get('min_dist', 1.5)
    spread = snakemake.params.get('spread', 2)
    tight_layout = snakemake.params.get('tight_layout', False)
except NameError:
    # testing
    sample_set = 'birch'
    sample_class = 'default'
    flag = 'biallelic'
    out = "scratch/pca.svg"
    subsample_sets = ['pubescens', 'pendula']
    min_dist = 1.5
    spread = 2
    tight_layout = False

# fetch genotypes
genotypes, samples, bim, fam = utils.get_genotypes(sample_set, flag, sample_class)

# create umap object
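# n_neighbors close to the total sample count pushes UMAP towards preserving
# global structure rather than local clusters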
reducer = umap.UMAP(n_neighbors=samples.shape[0] - 1, metric='euclidean',
                    min_dist=min_dist, random_state=4, spread=spread)

# fit embedding using umap
embedding = reducer.fit_transform(genotypes)

# plot and save embedding
plt.scatter(x=embedding[:, 0], y=embedding[:, 1], c=samples.latitude, s=20)
plt.gca().set_aspect('equal', 'datalim')

# draw ellipses around subsample sets if specified
for i, subsample_set in enumerate(subsample_sets):
    pca_utils.encircle_subset(sample_set, subsample_set, sample_class, embedding, i, buf=10)

plt.xlabel('x')
plt.ylabel('y')

if len(subsample_sets):
    plt.legend()

plt.colorbar().set_label('latitude')

if tight_layout:
    plt.savefig(out, bbox_inches='tight', pad_inches=0)
else:
    plt.savefig(out)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

ref = snakemake.input.ref
vcf = snakemake.input.vcf
gff = snakemake.input.gff
out = snakemake.output[0]

shell(f"vep --gff {gff} --fasta {ref} -i {vcf} --vcf --species betula_pendula --allow_non_variant \
    -o {out} --fields 'Consequence,Codons' --compress_output bgzip")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import os

# explicit imports for names that are otherwise pulled in via the star import below
import numpy as np
import vcf

import utils
from pyvcf_utils import *

try:
    vcf_file = snakemake.input.vcf
    ingroups_file = snakemake.input.ingroups
    outgroups_file = snakemake.input.outgroups
    out_data = snakemake.output.data
    n_max_subsamples = snakemake.config['est_sfs_n_samples']
except NameError:
    # testing
    vcf_file = "testing/snps/pendula/synonymous/snps.vcf.gz"
    ingroups_file = "output/default/sample_sets/ingroups.args"
    outgroups_file = "output/default/sample_sets/outgroups.args"
    out_data = "scratch/pendula.synonymous.data.txt"
    n_max_subsamples = 50

# load ingroup and outgroup samples
ingroups = utils.get_samples_from_file(ingroups_file)
outgroups = utils.get_samples_from_file(outgroups_file)

# seed rng
np.random.seed(seed=0)

# number of subsamples to sample from haplotypes
n_subsamples = min(n_max_subsamples, len(ingroups))

# write to data file
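# each line lists the nucleotide counts of the subsampled ingroup followed by
# one count field per outgroup, the per-site format expected by EST-SFS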
with open(out_data, 'w') as f:
    i = 0
    for record in vcf.Reader(filename=vcf_file):

        # only do inference for polymorphic sites
        if not record.is_monomorphic:

            ingroup = count(subsample(haplotypes(restrict(record.samples, outgroups, True)), n_subsamples))
            outgroup1 = count(subsample(haplotypes(restrict(record.samples, [outgroups[0]])), 1))
            outgroup2 = count(subsample(haplotypes(restrict(record.samples, [outgroups[1]])), 1))

            f.write(' '.join([base_dict_to_string(r) for r in [ingroup, outgroup1, outgroup2]]) + os.linesep)

        i += 1
        if i % 1000 == 0: print(f"Processed sites: {i}", flush=True)
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import os

import sfs_utils
import utils

try:
    synonymous = snakemake.input.synonymous
    nonsynonymous = snakemake.input.nonsynonymous
    vcf = snakemake.input.vcf
    pops = snakemake.input.pops
    n_pops = snakemake.params.n_pops
    out = snakemake.output[0]
except NameError:
    # testing
    synonymous = "output/default/snps/pendula/synonymous/snps.vcf.gz"
    nonsynonymous = "output/default/snps/pendula/nonsynonymous/snps.vcf.gz"
    vcf = "output/default/snps/pendula/all/snps.vcf.gz"
    pops = "output/default/sample_sets/subpopulations/pendula/1_pops.txt"
    n_pops = 1
    out = "scratch/pendula.1_pops.txt"

# number of two fold degenerate sites
n_2fold = utils.get_n_degenerate(vcf, 2)

# polyDFE fails to converge for large sample sizes
# n = 20 was used in their own simulations
n_proj = 20
freqs_neut = sfs_utils.get_spectrum(synonymous, pops, n_proj)
freqs_neut = freqs_neut[~freqs_neut.mask].tolist()
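# at a 2-fold degenerate site one of the three possible mutations is
# synonymous, so 1/3 of 2-fold sites count as neutral and 2/3 as selected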
n_sites_neut = utils.get_n_degenerate(vcf, 4) + 1 / 3 * n_2fold

freqs_sel = sfs_utils.get_spectrum(nonsynonymous, pops, n_proj)
freqs_sel = freqs_sel[~freqs_sel.mask].tolist()
n_sites_sel = utils.get_n_degenerate(vcf, 0) + 2 / 3 * n_2fold


def write_line(file, arr, sep):
    file.write(sep.join(map(str, arr)) + os.linesep)


m_neut = 1
m_sel = 1
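# polyDFE input: a header with the number of neutral and selected fragments
# and the sample size, then one line per fragment holding its SFS counts
# followed by the number of sites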
with open(out, 'w') as f:
    write_line(f, [m_neut, m_sel, n_proj], ' ')

    write_line(f, freqs_neut + [n_sites_neut], '\t')
    write_line(f, freqs_sel + [n_sites_sel], '\t')
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

java_opts = get_java_opts(snakemake)
ref = snakemake.input.ref
vcf = snakemake.input.vcf
out = snakemake.output[0]

# the filters are based on the GATK recommendations for hard filtering
# https://gatk.broadinstitute.org/hc/en-us/articles/360035890471-Hard-filtering-germline-short-variants
shell(f"gatk --java-options '{java_opts}' VariantFiltration -R {ref} -V {vcf} -O {out} \
    --filter-name 'filter1' \
    --filter-expression 'QD < 2.0' \
    --filter-name 'filter2' \
    --filter-expression 'SOR > 3.0' \
    --filter-name 'filter3' \
    --filter-expression 'MQ < 40.0' \
    --filter-name 'filter4' \
    --filter-expression 'MQRankSum < -12.5' \
    --filter-name 'filter5' \
    --filter-expression 'ReadPosRankSum < -8.0'")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from collections import Counter

import vcf
from vcf.parser import _Info as Info

import pyvcf_utils
import utils

try:
    vcf_file = snakemake.input.vcf
    probs = snakemake.input.probs
    data = snakemake.input.data
    samples_file = snakemake.input.samples
    out = snakemake.output.vcf
except NameError:
    # testing
    vcf_file = "output/default/snps/raw/intervals/snps1.vcf.gz"
    probs = "output/default/est-sfs/probs/1.txt"
    data = "output/default/est-sfs/data/1.txt"
    samples_file = "output/default/sample_sets/ingroups.args"
    out = "scratch/1.ancestral.vcf"

samples = utils.get_samples_from_file(samples_file)

vcf_reader = vcf.Reader(filename=vcf_file)

# add AA info field to header
vcf_reader.infos['AA'] = Info('AA', 1, 'String', 'Ancestral Allele', None, None)
vcf_reader.infos['EST_SFS_probs'] = Info('EST_SFS_probs', 1, 'String', 'EST-SFS probabilities', None, None)
vcf_reader.infos['EST_SFS_input'] = Info('EST_SFS_input', 1, 'String', 'EST-SFS input', None, None)

probs_reader = open(probs, 'r')
data_reader = open(data, 'r')
writer = vcf.Writer(open(out, 'w'), vcf_reader)

# annotate each record and write it to the output VCF
i = 0
for record in vcf_reader:

    # simply assign the ancestral allele to be the reference allele
    # if the record is monomorphic
    if record.is_monomorphic:
        record.INFO['AA'] = record.REF
        record.INFO['EST_SFS_probs'] = None
        record.INFO['EST_SFS_input'] = None
    else:

        line = probs_reader.readline()

        # read est-sfs input data
        est_input = data_reader.readline()

        # get probability of major allele
        prob_major_allele = float(line.split(' ')[2])

        # restrict haplotypes to the non-outgroup birch samples
        hps = pyvcf_utils.haplotypes(pyvcf_utils.restrict(record.samples, samples))

        major_allele = '.'

        if hps:
            # determine major allele
            major_allele = Counter(hps).most_common()[0][0]

            # take the major allele to be the ancestral allele
            # if its probability is greater than or equal to 0.5
            if prob_major_allele >= 0.5:
                ancestral_allele = major_allele
            else:
                # there are exactly two alleles
                for allele in record.alleles:
                    if str(allele) != major_allele:
                        ancestral_allele = str(allele)
        else:
            ancestral_allele = '.'

        # add ancestral allele annotation to record
        record.INFO['AA'] = ancestral_allele

        # add additional information from est-sfs
        record.INFO['EST_SFS_probs'] = line.replace('\n', '').replace(' ', '|')
        record.INFO['EST_SFS_input'] = est_input.replace('\n', '').replace(' ', '|').replace(',', ':')

        if not (major_allele == ancestral_allele == record.REF):
            print(f"site: {record.CHROM}:{record.POS}, ancestral allele: {ancestral_allele}, "
                  f"major allele: {major_allele}, reference: {record.REF}, "
                  f"prob major allele: {prob_major_allele}", flush=True)

    writer.write_record(record)

    i += 1
    if i % 1000 == 0: print(f"Processed sites: {i}", flush=True)

# raise error if there are lines left from the EST-SFS output
if next(probs_reader, None) is not None:
    raise AssertionError("Number of sites don't match.")

probs_reader.close()
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

try:
    input = snakemake.input.bed
    out = snakemake.output.bed
except NameError:
    # testing
    input = "output/default/snps/pendula/biallelic/snps.bed"
    out = "output/default/snps/pendula/biallelic/snps.recoded_ids.bed"

input_prefix = input.replace('.bed', '')
out_prefix = out.replace('.bed', '')

shell(f"plink2 --bfile {input_prefix} --set-all-var-ids @_# --make-bed --out {out_prefix} --allow-extra-chr")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import os

from snakemake.shell import shell

try:
    bed = snakemake.input.bed
    out_log = snakemake.output.log
    K = snakemake.params.K
except NameError:
    # testing
    bed = "output/default/snps/pendula/biallelic/snps.admixture.bed"
    out_log = "scratch/admixture"
    K = 4

# ADMIXTURE provides no option for specifying the path of the
# output files, so we change the working directory to the output
# path of the given log file.
bed_abs_path = os.path.abspath(bed)
out_path = os.path.dirname(out_log)
log = os.path.basename(out_log)

shell(f"""
    mkdir -p {out_path}
    cd {out_path}
    admixture --cv {bed_abs_path} {K} | tee {log}
""")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

sfs = snakemake.input.sfs
init = snakemake.input.init
out = snakemake.output[0]
id = snakemake.params.id
m = snakemake.params.m

shell(f"polyDFE -d {sfs} -m {m} -i {init} {id} -v 100 > {out}")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

try:
    probs_in = snakemake.input.probs
    data_in = snakemake.input.data
    probs_out = snakemake.output.probs
except NameError:
    # testing
    probs_in = "output/default/est-sfs/probs/all.txt"
    data_in = [f"output/default/est-sfs/data/{n}.txt" for n in range(1, 11)]
    probs_out = [f"scratch/est-sfs/probs/{n}.txt" for n in range(1, 11)]

with open(probs_in, 'r') as probs_all:

    # iterate over files
    for data_file, probs_file in zip(data_in, probs_out):

        # obtain the number of lines to read and write
        with open(data_file) as f:
            line_count = sum(1 for _ in f)

        with open(probs_file, 'w') as out:

            # iterate over lines
            for i in range(line_count):

                line = probs_all.readline()

                # skip header lines
                while line.startswith('0'):
                    line = probs_all.readline()

                # write to out file
                out.write(line)

    # raise error if there are lines left from the EST-SFS output
    if next(probs_all, None) is not None:
        raise AssertionError("Unassigned sites left from EST-SFS output.")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

try:
    out = snakemake.output[0]
    tmp_dir = snakemake.resources.tmpdir
    max_sites = snakemake.config['est_sfs_max_sites']
except NameError:
    # testing
    out = "scratch/est-sfs"
    tmp_dir = "/tmp"
    max_sites = 100000

# set up est-sfs
# Note: the compilation only works on Linux
shell(f"""
    set -x
    work_dir=$(realpath .)
    cd {tmp_dir}

    wget https://sourceforge.net/projects/est-usfs/files/est-sfs-release-2.03.tar.gz/download -O est-sfs.tar.gz

    rm -rf est-sfs
    mkdir est-sfs
    tar -xvf est-sfs.tar.gz -C est-sfs --strip-components 1

    cd est-sfs
    # raise the compile-time limit on the number of site configurations
    sed -i 's/#define max_config 100000/#define max_config {max_sites}/g' est-sfs.c
    export C_INCLUDE_PATH=$CONDA_PREFIX/include
    make

    mv est-sfs "$work_dir/{out}"
""")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

import math

import numpy as np
import pandas as pd

try:
    test_mode = False
    results = snakemake.input
    labels = snakemake.params.labels
    out_pretty = snakemake.output.pretty
    out_tex = snakemake.output.tex
except NameError:
    # testing
    test_mode = True
    scenarios = [
        "split_no_migration",
        "split_unidirectional_migration_from_two_to_one",
        "split_unidirectional_migration_from_one_to_two",
        "split_symmetric_migration",
        "split_asymmetric_migration",
        "split_asymmetric_migration_one_pop_size_change",
        "split_no_migration_since_ice_age",
        "split_unidirectional_migration_from_two_to_one_since_ice_age",
        "split_unidirectional_migration_from_one_to_two_since_ice_age",
        "split_symmetric_migration_since_ice_age",
        "split_asymmetric_migration_since_ice_age",
        "split_asymmetric_migration_one_pop_size_change_since_ice_age"
    ]
    sample_set = "pendula_pubescens"
    results = {scenario: f"output/tetraploid/dadi/{scenario}/{sample_set}/synonymous/dadi_ci.csv" for scenario in scenarios}
    labels = [key.replace('_', ' ') for key in results.keys()]
    out_pretty = 'scratch/dadi_tabulated.out'
    out_tex = 'scratch/dadi_tabulated.tex'


# format the given float string, e.g. '1234.5' -> '1234', '0.00123' -> '0.0012'
def format_number(x):
    x = float(x)

    if x == 0:
        return str(0)

    magnitude = math.floor(math.log10(abs(x)))

    # drop decimals for large numbers, otherwise round to
    # one digit beyond the leading significant digit
    if magnitude > 0:
        return str(int(x))
    else:
        return str(np.round(x, abs(magnitude) + 1))


def parse_row(file, delimiter):
    data = pd.read_csv(file, index_col=0, header=0)

    mean = data.loc['mean_bs']
    std = data.loc['std_bs']

    for key in mean.index:
        if mean[key]:
            mean[key] = format_number(mean[key])

            if float(std[key]) != 0:
                mean[key] += delimiter + format_number(std[key])

    return mean


def prepare_table(delimiter):
    data = pd.DataFrame()
    for i, (key, file) in enumerate(results.items()):
        row = parse_row(file, delimiter)
        row.name = labels[i]
        # DataFrame.append was removed in pandas 2.0; concatenate instead
        data = pd.concat([data, row.to_frame().T])

    # remove unnamed columns
    data = data.loc[:, ~data.columns.str.contains('unnamed')]
    data = data.loc[:, ~(data.columns == 'theta')]
    return data.fillna('')


with open(out_pretty, 'w') as f:
    f.write(prepare_table(" ± ").to_markdown())

# use a raw string so the LaTeX escapes are not interpreted by Python
with open(out_tex, 'w') as f:
    f.write(prepare_table(r"\,$\pm$\,").to_latex(escape=False))
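A few invented inputs illustrating the rounding behaviour of format_number above:

# invented sanity checks for format_number
assert format_number("1234.5") == "1234"     # magnitude > 0: cast to int
assert format_number("0.00123") == "0.0012"  # rounded one digit past the leading digit
assert format_number("0") == "0"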
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

input = ' '.join(snakemake.input)
output = ' '.join(snakemake.output)
# adapter file shipped with the conda Trimmomatic package
adapter = "$CONDA_PREFIX/share/trimmomatic-0.36-5/adapters/TruSeq3-PE-2.fa"
threads = snakemake.threads

# clip TruSeq3 adapters, trim with a 4-bp sliding window at an
# average quality of Q15 and discard reads shorter than 36 bp
shell(f"trimmomatic PE -threads {threads} {input} {output} \
    ILLUMINACLIP:{adapter}:2:30:10 SLIDINGWINDOW:4:15 MINLEN:36")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

snakefile = snakemake.params.snakefile
out = snakemake.output[0]

shell(f"snakemake --snakefile {snakefile} --rulegraph | dot -Tsvg > {out}")
__author__ = "Janek Sendrowski"
__contact__ = "[email protected]"
__date__ = "2022-05-31"

from snakemake.shell import shell

input = snakemake.input[0]
out = snakemake.output[0]

shell(f"dot {input} -Tsvg > {out}")
207
208
script:
    "scripts/visualize_complete_rulegraph.py"
SnakeMake From line 207 of master/Snakefile
216
217
script:
    "scripts/visualize_dotfile.py"
SnakeMake From line 216 of master/Snakefile
227
228
script:
    "scripts/visualize_complete_rulegraph.py"
SnakeMake From line 227 of master/Snakefile
Maintainers: public
URL: https://github.com/Sendrowski/BirchesScandinavia
Name: birchesscandinavia
Version: 1
Copyright: Public Domain
License: None