A Snakemake workflow for the MSCi framework in BP&P

public 1yr ago 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional metadata:

Keyword(s) in categories input, output, operation, topic

You can help improve this workflow by suggesting the addition or removal of keywords, suggesting changes, reporting issues, or requesting to become a maintainer of the workflow.

A Snakemake workflow for the MSCi framework in BP&P

Description

Please note that this pipeline is not maintained, and that there is at least one known bug. Please feel free to steal the code to set up your own pipeline, but it wo

Code Snippets

# Print the unique CDS intervals of the annotation file given as $1 as
# tab-separated "seqid<TAB>start<TAB>end" lines (columns 1, 4 and 5 of the input).
# The feature test must be a comparison (==): the former `$3="CDS"` was an
# assignment, which awk evaluates as true for every line, so all feature types
# were emitted as coding.
# NOTE(review): start/end are copied unchanged; if the input is 1-based GFF and
# the consumer expects 0-based BED starts, a "$4 - 1" conversion is needed - confirm.
awk 'BEGIN { OFS = "\t" } !/^#/ && $3 == "CDS" && !seen[$1, $4, $5]++ { print $1, $4, $5 }' "$1"

Shell From line 1 of scripts/generate_coding_bed.sh

import pandas as pd
import fileinput
import random
import argparse

def generate_control_file(ctl_template, imap, n_loci, tree, theta_beta, tau_beta, mcmc_samples, seqfile, seqtype, rep):
    """Fill in a control-file template and print the result to stdout.

    Every placeholder keyword found in the template text (RUNNAME, IMAP, SEED,
    SEQFILE, SPECIES_LINE, N_SAMPLES_PER_SPECIES, TREE, N_LOCI, THETA_BETA,
    TAU_BETA, MCMC_SAMPLES, BURNIN) is replaced by its value.

    Args:
        ctl_template: Path to the template file containing the placeholders.
        imap: Path to a header-less, whitespace-separated mapping file whose
            second column holds the population of each individual.
        n_loci: Number of loci (int or numeric string).
        tree: Tree string inserted verbatim for TREE.
        theta_beta: Value inserted verbatim for THETA_BETA.
        tau_beta: Value inserted verbatim for TAU_BETA.
        mcmc_samples: Number of MCMC samples (int or numeric string); BURNIN
            is set to 10% of it, rounded.
        seqfile: Sequence file name inserted for SEQFILE.
        seqtype: Sequence class label; forms the run name with ``rep``.
        rep: Replicate label; forms the run name with ``seqtype``.

    Raises:
        ValueError: If the mapping file has no population column or no
            populations.
    """
    # header=None keeps the first record as data: with the default header
    # handling the first individual was consumed as column names, so a
    # population that only occurs on line 1 was silently dropped.
    # r"\s+" accepts tabs and runs of spaces; dtype=str keeps numeric-looking
    # population names joinable as text.
    imap_df = pd.read_csv(imap, sep=r"\s+", header=None, dtype=str)
    if imap_df.shape[1] < 2:
        raise ValueError(f"{imap}: expected at least two whitespace-separated columns")

    # unique() preserves first-appearance order, which fixes the species order.
    populations = list(imap_df.iloc[:, 1].dropna().unique())
    if not populations:
        raise ValueError(f"{imap}: no populations found in the second column")

    # NOTE(review): the per-species sample count is derived from n_loci divided
    # by the number of populations; this is kept as in the original but looks
    # suspicious - confirm the intended value.
    n_samples_per_species = str(round(int(n_loci) / len(populations)))
    n_samples = " ".join([n_samples_per_species] * len(populations))

    replacements = {
        "RUNNAME": f"{seqtype}_{rep}",
        "IMAP": imap,
        "SEED": random.randint(1, 10000),
        "SEQFILE": seqfile,
        "SPECIES_LINE": str(len(populations)) + " " + " ".join(populations),
        "N_SAMPLES_PER_SPECIES": n_samples,
        "TREE": tree,
        "N_LOCI": n_loci,
        "THETA_BETA": theta_beta,
        "TAU_BETA": tau_beta,
        "MCMC_SAMPLES": mcmc_samples,
        "BURNIN": str(round(int(mcmc_samples) * 0.1)),
    }

    with open(ctl_template, "r") as f:
        data = f.read()

    # Plain substring substitution, applied in the insertion order above.
    for key, replacement in replacements.items():
        data = data.replace(key, str(replacement))
    print(data)

# Command-line interface: every option is passed straight through to
# generate_control_file() as a string. All options are required, because the
# function cannot produce a meaningful control file when any of them is missing.
parser = argparse.ArgumentParser(
    description="Fill in a control-file template and print it to stdout.")
parser.add_argument("--imap", required=True)
parser.add_argument("--n_loci", required=True)
parser.add_argument("--tree", required=True)
parser.add_argument("--theta_beta", required=True)
parser.add_argument("--tau_beta", required=True)
parser.add_argument("--mcmc_samples", required=True)
parser.add_argument("--seqfile", required=True)
parser.add_argument("--seqtype", required=True)
parser.add_argument("--rep", required=True)
parser.add_argument("--ctl_template", required=True)


if __name__ == "__main__":
    # Parse only when run as a script, so that importing this module does not
    # consume (and choke on) the importing program's sys.argv.
    args = parser.parse_args()
    generate_control_file(
        ctl_template = args.ctl_template, imap = args.imap, n_loci = args.n_loci,
        tree = args.tree, theta_beta = args.theta_beta,
        tau_beta = args.tau_beta, mcmc_samples = args.mcmc_samples,
        seqfile = args.seqfile, seqtype = args.seqtype, rep = args.rep)

Python Pandas From line 1 of scripts/generate_control_files.py

# Split the intervals of the BED file given as $1 into windows of at most
# 1000 bases, keep only windows whose length ($3 - $2) is between 500 and 1000,
# shuffle them and print up to 1000 randomly chosen windows.
# "$1" is quoted so that paths containing spaces or glob characters survive.
bedtools makewindows -b "$1" -w 1000 | awk '{if($3-$2 <= 1000 && $3-$2 >= 500) print}' | shuf | head -n 1000

Shell BEDTools From line 1 of scripts/makewindows.sh

# Create the rule's output path as a directory; `-p` also creates missing
# parents and makes the command a no-op when the directory already exists.
shell:
        "mkdir -p {output}"

SnakeMake From line 10 of main/Snakefile

# Write the first whitespace-separated field of every line of params.imap to
# the output file. The doubled braces are Snakemake escapes for literal awk braces.
shell:
        """awk '{{print $1}}' {params.imap} > {output} """

SnakeMake From line 20 of main/Snakefile

# Rename the chromosomes of input[0] using the sequence names found in input[1]:
#   1. take every line of input[1] containing '>', strip the first '>', number
#      the lines with `nl`, and let `awk '$1=$1'` rebuild each line with single
#      spaces, giving "<line number> <name>" pairs in temp/chr_names.txt;
#   2. feed that file to `bcftools annotate --rename-chrs`, bgzip the result
#      into the output;
#   3. index the output with tabix.
# NOTE(review): presumably input[0] names its chromosomes 1..N in the same order
# as the headers of input[1] - confirm.
# NOTE(review): temp/ is not created here; it must already exist.
shell:
        """
        grep ">" {input[1]} | sed 's/>//' | nl | awk '$1=$1' > temp/chr_names.txt
        bcftools annotate --rename-chrs temp/chr_names.txt {input[0]} | bgzip > {output}
        tabix {output}
        """

SnakeMake BCFtools tabix From line 31 of main/Snakefile

# Subset input[1] to the samples listed in input[0] and keep only records with
# exactly two alleles (--min-alleles 2 --max-alleles 2); the result is written
# as compressed VCF (-Oz). --force-samples is passed so that sample names
# missing from the VCF do not abort the command.
shell:
        """
        bcftools view --samples-file {input[0]} {input[1]} --min-alleles 2 --max-alleles 2 --force-samples -Oz > {output}
        """

SnakeMake BCFtools From line 44 of main/Snakefile

# Thin input[0] with the `bcftools +prune` plugin (-m 0.5, -w 10000), write
# uncompressed VCF (-Ov) through bgzip into the output, then index it with tabix.
# NOTE(review): -m is presumably the maximum-LD threshold; the inline comment
# records that the flag spelling (-l vs -m) depends on the BCFtools version.
shell:
        """
        bcftools +prune -m 0.5 -w 10000 {input[0]} -Ov | bgzip > {output} # note use of old BCFtools; -l is now -m
        tabix {output}
        """

SnakeMake BCFtools tabix From line 56 of main/Snakefile

# Write the intervals of the genome file (input[1]) that are NOT covered by the
# intervals in input[0].
shell:
        "bedtools complement -i {input[0]} -g {input[1]} > {output}"

SnakeMake BEDTools From line 79 of main/Snakefile

# Run scripts/generate_coding_bed.sh on the input and capture its stdout as the
# output file. The first line inside the command string is a shell comment: a
# disabled inline variant of the same pipeline, kept for reference.
shell:
        """
# awk '$3="CDS"' {input} | awk '{{print $1, $4, $5}}' | awk '!visited[$0]++' | sed '/^#/d' | sed '/ /\\t/g' > {output}
bash scripts/generate_coding_bed.sh {input} > {output}
"""

SnakeMake From line 89 of main/Snakefile

# Run scripts/makewindows.sh once per input file, writing the windows produced
# from input[0] to output[0] and those from input[1] to output[1].
shell:
        """
bash scripts/makewindows.sh {input[0]} > {output[0]}
bash scripts/makewindows.sh {input[1]} > {output[1]}
        """

SnakeMake From line 106 of main/Snakefile

# Convert each three-column line of input[0] / input[1] into a region string:
# awk joins fields 1-3 with ':' ("a:b:c"), sed turns the second ':' into '-'
# ("a:b-c"), and lines starting with '#' are removed.
shell:
    """
            awk 'BEGIN{{OFS=":"}} {{print $1,$2,$3}}' {input[0]} | sed 's/:/-/2' | sed '/^#/d' > {output[0]}
            awk 'BEGIN{{OFS=":"}} {{print $1,$2,$3}}' {input[1]} | sed 's/:/-/2' | sed '/^#/d' > {output[1]}
            """

SnakeMake From line 119 of main/Snakefile

# List the sample names of the input with `bcftools query -l` into the output.
# The second line inside the command string is a shell comment (a disabled
# `comm` intersection step) and is not executed.
shell:
        """
        bcftools query -l {input} > {output}
#comm -12 <(sort temp/vcf_samples_temp.txt) <(sort {input[0]}) > temp/vcf_samples.txt
        """

SnakeMake BCFtools From line 132 of main/Snakefile

# Build per-region alignments and concatenate them into one file per region list.
# Inputs as used below: input[0] and input[1] are region lists (labelled
# "coding" and "noncoding" by the inline comments), input[2] is the file passed
# to `samtools faidx`, input[3] the file passed to `bcftools consensus`, and
# input[4] the list of samples.
# Steps:
#   1. For every region and sample, print a header of the form
#      ">" + region (each run of non-alphanumeric characters replaced by one
#      '_', including the newline that echo appends) + "^" + sample, with no
#      trailing newline, followed by that sample's consensus for the region.
#      `cut -f1,2 -d'>'` drops the original sequence header that directly
#      follows ours on the same line, and the awk filter keeps only records
#      that have a non-empty second line.
#      Result: temp/sequences/<region>.txt
#   2. Align each region file with mafft, then run trimal (-gt 0.2,
#      -phylip_paml output) to produce temp/sequences/<region>.ph
#   3. Concatenate the .ph files that exist for the input[0] regions into
#      output[0] and for the input[1] regions into output[1], printing an extra
#      newline after each file.
shell:
        """

        mkdir -p temp/sequences

        REGIONS=$(cat {input[0]} {input[1]})

        for region in ${{REGIONS}}
        do
                for sample in $(cat {input[4]})
                do
                        printf '>'$(echo ${{region}} | tr -s -c [:alnum:] _)'^'${{sample}} #header
                samtools faidx {input[2]} ${{region}} | bcftools consensus -s ${{sample}} {input[3]}
                        printf '\\n'
        done | cut -f1,2 -d'>' | awk 'BEGIN {{RS = ">" ; FS = "\\n" ; ORS = ""}} $2 {{print ">"$0}}' > temp/sequences/${{region}}.txt
        done

        for region in ${{REGIONS}}
do
    mafft --leavegappyregion --retree 2 --reorder temp/sequences/${{region}}.txt > temp/sequences/${{region}}_aligned.txt
    trimal -in temp/sequences/${{region}}_aligned.txt -out temp/sequences/${{region}}.ph -phylip_paml -gt 0.2
done

        for region in $(cat {input[0]}) # coding
        do
                FILE=temp/sequences/${{region}}.ph
                if [ -f temp/sequences/${{region}}.ph ]
                then
                        cat ${{FILE}}
                        printf '\\n'
                fi
        done > {output[0]}

        for region in $(cat {input[1]}) # noncoding
do
    FILE=temp/sequences/${{region}}.ph
    if [ -f temp/sequences/${{region}}.ph ]
    then
        cat ${{FILE}}
        printf '\\n'
    fi
done > {output[1]}
        """

SnakeMake SAMtools BCFtools Consensus MAFFT API (EBI) trimAl From line 152 of main/Snakefile

# Download the bpp v4.4.1 Linux x86_64 release tarball from GitHub, unpack it,
# delete the tarball and move the bundled binary to ./bpp in the working directory.
shell:
        """
        wget https://github.com/bpp/bpp/releases/download/v4.4.1/bpp-4.4.1-linux-x86_64.tar.gz
        tar zxvf bpp-4.4.1-linux-x86_64.tar.gz
        rm bpp-4.4.1-linux-x86_64.tar.gz
        mv bpp-4.4.1-linux-x86_64/bin/bpp .
        """

SnakeMake From line 199 of main/Snakefile

# Run `./bpp --msci` on the input and keep only the line printed directly after
# the "Newick tree:" marker, writing it to tree.txt in the working directory.
shell:
        """
        ./bpp --msci {input} | grep -A1 "Newick tree:" | grep -v "Newick tree:" > tree.txt
        """

SnakeMake From line 213 of main/Snakefile

# Generate one control file per sequence class ("coding", "noncoding") and
# replicate (1..config[number_of_repeats]): the tree string is read from
# input[4], scripts/generate_control_files.py is called with the values from
# the Snakemake config, and its stdout is saved as
# control_files/<seqtype>_<rep>.ctl.
shell:
        """
        mkdir -p control_files

        TREE=$(cat {input[4]})

        for SEQTYPE in "coding" "noncoding"
        do
                END={config[number_of_repeats]}
                for REP in $(seq 1 $END)
                do
                        python scripts/generate_control_files.py --imap {config[imap]} \
                        --n_loci {config[number_of_loci]} --tree "${{TREE}}" \
                                --theta_beta {config[theta_beta]} --tau_beta {config[tau_beta]} \
                                        --mcmc_samples {config[mcmc_samples]} --seqfile ${{SEQTYPE}}_sequences.ph \
                                                --seqtype ${{SEQTYPE}} --rep ${{REP}} --ctl_template {config[ctl_template]} \
                                                        > control_files/${{SEQTYPE}}_${{REP}}.ctl
                done
        done
        """

SnakeMake From line 229 of main/Snakefile

# For each sequence class ("coding", "noncoding") and replicate
# (1..config[number_of_repeats]): copy config_files/run_bpp.sh to
# run_<seqtype>_<rep>.sh, substitute the RUNNAME, EMAIL, ACCOUNT and CTL_FILE
# placeholders in the copy with sed, and execute it with bash. The CTL_FILE
# substitution uses '|' as the sed delimiter because the value contains '/'.
# Finally 'Done' is written to debug.txt.
shell:
        """
        # Generate run scripts + submit to cluster
        for SEQTYPE in "coding" "noncoding"
        do
                END={config[number_of_repeats]}
                for REP in $(seq 1 $END)
                do
                        RUNFILE=run_${{SEQTYPE}}_${{REP}}.sh
                        CTL_FILE=control_files/${{SEQTYPE}}_${{REP}}.ctl
                        cp config_files/run_bpp.sh ${{RUNFILE}}
                        sed -i "s/RUNNAME/${{SEQTYPE}}_${{REP}}/g" ${{RUNFILE}}
                        sed -i "s/EMAIL/{config[email]}/g" ${{RUNFILE}}
                        sed -i "s/ACCOUNT/{config[account]}/g" ${{RUNFILE}}
                        sed -i "s|CTL_FILE|${{CTL_FILE}}|g" ${{RUNFILE}}

                        bash ${{RUNFILE}}
                done
        done

        echo 'Done' > debug.txt
        """