Help improve this workflow!
This workflow has been published but could be further improved with some additional metadata: keyword(s) in the categories input, output, and operation.
You can help improve this workflow by suggesting the addition or removal of keywords, suggesting changes, reporting issues, or requesting to become a maintainer of the workflow.
Pipeline to construct species phylogenies using BUSCOs.
The pipeline automatically creates the directory structure from the config file. Default directory structure:
|-results |- busco |- ids | |- species_ids | |- common_ids | |- merged_sequences |- alignments | |- raw | |- trimal |- concat_alignments |- phylogeny | |- iqtree | |- mrbayes
Usage example
snakemake --snakefile Snakemake --profile profile/slurm/ --configfile config/default.yaml \
--config genome_dir="" busco_dataset_path="" trimal_path="" iqtree_path="" mrbayes_path="" \
--printshellcmds --latency-wait 60
Requirements
You can select the BUSCO version (v3.0.2 or v5.2.2) by setting the `busco_version` option to 3 or 5 in the config file. You can also choose MAFFT or PRANK to align sequences by specifying 'mafft' or 'prank' in the `alignment_tool` option in the config file. PRANK is only used for DNA sequences; protein sequences are processed with MAFFT in both cases.
The BUSCO v3.0.2, TrimAl, IQ-TREE, and MrBayes paths should be given in the config file or specified at startup. BUSCO v5.2.2, MAFFT, and PRANK are installed using conda.
Code Snippets
103 104 105 106 107 | shell: "mkdir -p {output.mafft_fna_dir}; for i in {input.mafft_fna_dirs}; do mv $i/*.fna {output.mafft_fna_dir}/; done; " "mkdir -p {output.mafft_faa_dir}; for i in {input.mafft_faa_dirs}; do mv $i/*.faa {output.mafft_faa_dir}/; done; " "mkdir -p {output.trimal_fna_dir}; for i in {input.trimal_fna_dirs}; do mv $i/*.fna {output.trimal_fna_dir}/; done; " "mkdir -p {output.trimal_faa_dir}; for i in {input.trimal_faa_dirs}; do mv $i/*.faa {output.trimal_faa_dir}/; done; " |
27 28 29 30 | shell: "mkdir -p {output.busco_outdir}; cd {output.busco_outdir}; {params.busco_path}/run_BUSCO.py -m {params.mode} -sp {params.species}" " -i {input} -c {threads} -l {params.busco_dataset_path} -o {params.output_prefix} 1>../../../{log.std} 2>&1;" " mv run_{params.output_prefix}/* ./; rm -r run_{params.output_prefix} tmp/" |
61 62 63 64 65 66 67 68 69 | shell: "mkdir -p {output.busco_outdir}; cd {output.busco_outdir}; " "busco -m {params.mode} -i {input} -c {threads} " "-l {params.busco_dataset_path} -o {params.output_prefix} 1>../../../{log.std} 2>&1; " "mv {params.output_prefix}/* ./; rm -r {params.output_prefix}/; " "rm -r busco_downloads/; mv run*/* ./; rm -r run*; " "mv full_table.tsv full_table_{params.output_prefix}.tsv; " "mv missing_busco_list.tsv missing_busco_list_{params.output_prefix}.tsv; " "mv short_summary.txt short_summary_{params.output_prefix}.txt; " |
93 94 95 96 97 98 | shell: "workflow/scripts/CDs_from_MetaEuk.py " "--initial_results {input.metaeuk_initial_results} " "--rerun_results {input.metaeuk_rerun_results} " "--single_copy_busco_sequences {input.single_copy_busco_sequences} " "--outdir {output.single_copy_CDs_sequences} > {log.std} 2>&1 " |
127 128 129 130 131 132 133 134 135 136 | shell: "mkdir -p {output.busco_outdir}; cd {output.busco_outdir}; " "busco --augustus --augustus_species {params.species} -m {params.mode} " "-i {input} -c {threads} -l {params.busco_dataset_path} -o {params.output_prefix} 1>../../../{log.std} 2>&1; " "mv {params.output_prefix}/* ./; rm -r {params.output_prefix}/; " "rm -r busco_downloads/; mv run*/* ./; rm -r run*; " "mv full_table.tsv full_table_{params.output_prefix}.tsv; " "mv missing_busco_list.tsv missing_busco_list_{params.output_prefix}.tsv; " "mv short_summary.txt short_summary_{params.output_prefix}.txt; " "mv busco_sequences/single_copy_busco_sequences . " |
19 20 | shell: "ls {input} | grep -P '.fna$' | sed 's/.fna//' > {output} 2> {log.std}" |
42 43 44 | shell: "mkdir -p {output}; cat {input} | sort | uniq -c | awk '{{if($1=={params.nfiles}){{print $2}}}}' | " "split -l {params.split_size} --numeric-suffixes - {output}/{params.prefix} 1> {log.std} 2>&1" |
64 65 66 67 68 | shell: "workflow/scripts/merged_sequences.py " "--common_ids {input} " "--single_copy_files {params.single_copy_files} " "--outdir {output} 1> {log.std} 2>&1" |
18 19 | shell: "workflow/scripts/concat_fasta.py -o {output} -i {input}/*.fna 1> {log.std} 2>&1" |
39 40 | shell: "workflow/scripts/concat_fasta.py -o {output} -i {input}/*.faa 1> {log.std} 2>&1" |
63 64 | shell: "workflow/scripts/fasta_to_nexus.py -i {input} -t {params.type} -b {params.block} -o {output} 1> {log.std} 2>&1" |
87 88 | shell: "workflow/scripts/fasta_to_nexus.py -i {input} -t {params.type} -b {params.block} -o {output} 1> {log.std} 2>&1" |
20 21 22 23 | shell: "mkdir -p {output}; " "{params.iqtree_path}/iqtree -s {input} -pre {params.prefix} -nt {resources.cpus} {params.options} 1> {log.std} 2>&1; " "mv {params.prefix}.* {output}" |
45 46 47 48 | shell: "mkdir -p {output}; " "{params.iqtree_path}/iqtree -s {input} -pre {params.prefix} -nt {resources.cpus} {params.options} 1> {log.std} 2>&1; " "mv {params.prefix}.* {output}" |
23 24 25 26 27 | shell: "mkdir -p {output}; " "for FILE in `ls {input}/*.fna`; do " "mafft --thread {threads} {params.options} $FILE > {output}/$(basename $FILE) 2> {log.std}; " "done; " |
51 52 53 54 55 | shell: "mkdir -p {output}; " "for FILE in `ls {input}/*.faa`; do " "mafft --thread {threads} {params.options} $FILE > {output}/$(basename $FILE) 2> {log.std}; " "done; " |
19 20 21 22 | shell: "mkdir -p {output}; " "mpirun -np {resources.cpus} {params.mrbayes_path}/mb-mpi {input} {params.options} 1> {log.std} 2>&1; " "mv {input}.* {output}/; " |
43 44 45 46 | shell: "mkdir -p {output}; " "mpirun -np {resources.cpus} {params.mrbayes_path}/mb-mpi {input} {params.options} 1> {log.std} 2>&1; " "mv {input}.* {output}/; " |
23 24 25 26 27 28 | shell: "mkdir -p {output}; " "for FILE in `ls {input}/*.fna`; do " "prank -d=$FILE -o={output}/$(basename $FILE) -codon -F > {log.std} 2>&1; " "mv {output}/$(basename $FILE).best.fas {output}/$(basename $FILE); " "done; " |
19 20 21 22 23 | shell: "mkdir -p {output}; for FILE in `ls {input}/*.fna`; do " "{params.trimal_path}/trimal -in $FILE -out {output}/$(basename $FILE) {params.options} > {log.std} 2>&1; " "{params.trimal_path}/trimal -in {output}/$(basename $FILE) -out {output}/$(basename $FILE) -nogaps > {log.std} 2>&1; " "done" |
44 45 46 47 48 | shell: "mkdir -p {output}; for FILE in `ls {input}/*`; do " "{params.trimal_path}/trimal -in $FILE -out {output}/$(basename $FILE) {params.options} > {log.std} 2>&1; " "{params.trimal_path}/trimal -in {output}/$(basename $FILE) -out {output}/$(basename $FILE) -nogaps > {log.std} 2>&1; " "done" |
from pathlib import Path
import argparse


def get_id(header):
    """Extract the BUSCO id from a FASTA header line.

    The id is the text between ">" and the first "|"; if it contains an
    underscore (e.g. "100at2759_1"), only the part before the underscore
    is kept.
    """
    seq_id = header[1:].split("|")[0]
    if "_" in seq_id:
        seq_id = seq_id.split("_")[0]
    return seq_id


def split_fasta_to_directory(fasta, outdir, single_copy_ids, ext):
    """Split a multi-record FASTA into one file per single-copy BUSCO id.

    Records whose id is not in `single_copy_ids` are skipped. Each kept
    record is written to `<outdir>/<id>.<ext>` (header line included).

    :param fasta: path to the input FASTA file
    :param outdir: destination directory (must already exist)
    :param single_copy_ids: ids whose records should be written out
    :param ext: output file extension ("fna" or "faa")
    """
    outfile = None  # handle for the per-id file currently being written
    with open(fasta, "r") as infile:
        for line in infile:
            if line.startswith(">"):
                # Finish the previous record's file, if any, before
                # deciding what to do with this one.
                if outfile is not None:
                    outfile.close()
                    outfile = None
                seq_id = get_id(line)
                if seq_id not in single_copy_ids:
                    continue  # skip this record's header and sequence lines
                outfile = open(Path(outdir) / f"{seq_id}.{ext}", "w")
            # Only write lines (header + sequence) of records we kept.
            if outfile is not None:
                outfile.write(line)
    # Guard the final close: `outfile` is None when the input is empty or
    # the last record was not single-copy (the original code could raise
    # NameError here).
    if outfile is not None:
        outfile.close()


def main():
    """Collect single-copy CDs sequences from MetaEuk initial/rerun output."""
    # MetaEuk writes one codon FASTA (*.codon.fas) and one protein FASTA
    # (*.fasta.fas or *.fa.fas) per run; take the first match of each.
    initial_codon_fasta = list(Path(args.initial_results).glob("*.codon.fas"))[0]
    initial_protein_fasta = list(Path(args.initial_results).glob("*a.fas"))[0]  # *.fasta.fas or *.fa.fas
    rerun_codon_fasta = list(Path(args.rerun_results).glob("*.codon.fas"))[0]
    rerun_protein_fasta = list(Path(args.rerun_results).glob("*a.fas"))[0]  # *.fasta.fas or *.fa.fas

    single_copy_ids = [p.stem for p in Path(args.single_copy_busco_sequences).glob("*.faa")]
    print("Single copy IDs: ", len(single_copy_ids))

    outdir = Path(args.outdir)
    outdir.mkdir()  # deliberately fails if the directory already exists

    # Write FASTA files with single copy CDs sequences to output directory.
    split_fasta_to_directory(initial_codon_fasta, outdir, single_copy_ids, "fna")
    split_fasta_to_directory(initial_protein_fasta, outdir, single_copy_ids, "faa")
    split_fasta_to_directory(rerun_codon_fasta, outdir, single_copy_ids, "fna")
    split_fasta_to_directory(rerun_protein_fasta, outdir, single_copy_ids, "faa")

    single_copy_CDs = [cd.stem for cd in outdir.glob("*.faa")]
    print("Single copy CDs: ", len(single_copy_CDs))
    # Report ids present in BUSCO output but missing from MetaEuk results.
    print("???: ", list(set(single_copy_ids) - set(single_copy_CDs)))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="script to get directory with single copy CDs sequences from Metaeuk output")
    group_required = parser.add_argument_group('Required options')
    group_required.add_argument('--initial_results', type=str, help="initial_results directory path")
    group_required.add_argument('--rerun_results', type=str, help="rerun_results directory path")
    group_required.add_argument('--single_copy_busco_sequences', type=str,
                                help="single_copy_busco_sequences directory path")
    group_required.add_argument('--outdir', type=str, help="single copy CDs directory path")
    args = parser.parse_args()
    main()
from collections import defaultdict
from Bio import SeqIO
import argparse


def main():
    """Concatenate several FASTA files into one big FASTA file.

    Sequences sharing the same identifier across the input files are
    joined end-to-end, preserving first-seen identifier order.
    """
    # defaultdict(str) lets us append to each identifier's sequence
    # without initialising the key first.
    sequence_map = defaultdict(str)
    for fasta_path in args.input:
        for record in SeqIO.parse(fasta_path, "fasta"):
            sequence_map[record.name] += str(record.seq)
    # `with` guarantees the output file is flushed and closed even if a
    # write fails part-way through (the original leaked the handle).
    with open(args.output, "w") as outfile:
        for name, sequence in sequence_map.items():
            outfile.write(f">{name}\n{sequence}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="script for concatenate FASTA files into one big FASTA file "
                                                 "by concatenating sequences with the same identifier")
    group_required = parser.add_argument_group('Required options')
    group_required.add_argument('-i', '--input', type=str, nargs="+",
                                help="input list of concat FASTA files with the same headers")
    group_required.add_argument('-o', '--output', type=str, help="output FASTA file name")
    args = parser.parse_args()
    main()
from Bio import AlignIO
from sys import stdin
import argparse


def main():
    """Convert a FASTA alignment to NEXUS and append a MrBayes block.

    The converted alignment is followed by the contents of the MrBayes
    block file, separated by blank lines.
    """
    # "w" instead of the original "a": the output must be rewritten from
    # scratch on every run — appending on a rerun would duplicate the
    # alignment and produce an invalid NEXUS file.
    with open(args.output, "w") as outfile, open(args.block, "r") as blockfile:
        # molecule_type (args.type) is required for NEXUS output.
        AlignIO.convert(args.input, "fasta", outfile, "nexus", args.type)
        outfile.write("\n")
        for line in blockfile:
            outfile.write(line)
        outfile.write("\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="script for converting FASTA format to NEXUS format")
    group_required = parser.add_argument_group('Required options')
    group_required.add_argument('-i', '--input', type=str, default=stdin,
                                help="input concat FASTA file or stdin")
    group_required.add_argument('-t', '--type', type=str, help="molecular type (DNA, RNA or protein)")
    group_required.add_argument('-b', '--block', type=str, help="MrBayes block text file")
    group_required.add_argument('-o', '--output', type=str, help="output NEXUS file name")
    args = parser.parse_args()
    main()
from pathlib import Path
import argparse


def merge_sequences_for_id(idname, outdir, directories):
    """Merge one BUSCO id's sequences from all species into two files.

    For each directory in `directories`, reads `<idname>.faa` and
    `<idname>.fna` and appends them to `<outdir>/<idname>.merged.faa` /
    `.merged.fna`, using the directory's parent-directory name as the
    species header (">species").

    NOTE(review): only the second line of each input FASTA is read, so
    this assumes single-line (unwrapped) sequences — the format BUSCO's
    single_copy_busco_sequences files use.
    """
    out_faa_path = Path(outdir) / f"{idname}.merged.faa"
    out_fna_path = Path(outdir) / f"{idname}.merged.fna"
    # "a" (append) is kept from the original so repeated calls accumulate;
    # `with` ensures both handles are closed (the original leaked them).
    with open(out_faa_path, "a") as out_faa, open(out_fna_path, "a") as out_fna:
        for directory in directories:
            dirpath = Path(directory)
            # Species name = name of the directory containing the
            # single-copy-sequences directory.
            header = f">{dirpath.parents[0].stem}"
            for ext, out in (("faa", out_faa), ("fna", out_fna)):
                with open(dirpath / f"{idname}.{ext}", "r") as f:
                    seq = f.readlines()[1].strip()
                out.write(f"{header}\n{seq}\n")


def main():
    """Create per-BUSCO-id merged FASTA files for all common ids."""
    outdir = Path(args.outdir)
    outdir.mkdir()  # deliberately fails if the directory already exists
    with open(args.common_ids, "r") as common_ids_file:
        for idname in common_ids_file:
            idname = idname.rstrip()
            if not idname:
                continue  # skip blank lines (the original crashed on them)
            merge_sequences_for_id(idname, outdir, args.single_copy_files)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="script for merging files with sequences of different species")
    group_required = parser.add_argument_group('Required options')
    group_required.add_argument('-c', '--common_ids', type=str, help="common_ids file")
    group_required.add_argument('-s', '--single_copy_files', type=str, nargs='+',
                                help="single copy busco sequences directories")
    group_required.add_argument('-o', '--outdir', type=str, help="output directory name")
    args = parser.parse_args()
    main()
Support
- Future updates