Download, Extract, and Build Phylogenetic Trees for Genes with ShotgunUnifrac
Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
A dual use program for downloading and extracting genes from NCBI and for creating phylogenetic trees for many marker genes and merging the results into one
Install
git clone git@github.com:Ulthran/ShotgunUnifrac.git
To install the
CorGE
library for downloading, extracting, and merging genes,
cd ShotgunUnifrac/
pip install CorGE/
Prereqs
-
Anaconda/miniconda (https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html#regular-installation)
-
Snakemake (https://snakemake.readthedocs.io/en/stable/getting_started/installation.html)
-
(For testing only) PyTest
-
(For containerization only) Singularity
Running tests
For the
CorGE
package,
pytest CorGE/tests
For the tree building Snakemake workflow,
pytest .tests/
Running
To download and collect genomes for tree building,
CorGE collect_genomes --ncbi_species LIST_OF_TXIDS.txt --ncbi_accessions LIST_OF_ACCS.txt --local /path/to/local/db
And then to filter out genes of interest and curate everything for tree building,
CorGE extract_genes
The default
--file_type
behavior is 'prot' so that can be left off or switched to 'nucl' if you want to build trees based on nucleotide sequences. Finally to generate the tree, make sure you're in the directory with all the output from the previous step and run,
snakemake -c --use-conda --conda-prefix .snakemake/ --configfile /path/to/project/config.yml
This should output a file called
RAxML_supermatrixRootedTree.final
which contains the final tree
A worked example is given in the docs .
Code Snippets
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | quasigenomes = {} for fp in snakemake.input: with open(fp) as f: gid = "" for l in f.readlines(): if l[0] == ">": gid = l[2:].strip(" \n\r") else: try: quasigenomes[gid] += l.strip(" \n\r") except KeyError: quasigenomes[gid] = l.strip(" \n\r") max_row_len = max(map(len, quasigenomes.values())) for k, v in quasigenomes.items(): # Fill missed genes with blanks if len(v) < max_row_len: filler = "-" * (max_row_len - len(v)) quasigenomes[k] += filler with open(snakemake.output[0], "w") as f: for k, v in quasigenomes.items(): print(f"{len(v)}") f.write(f"> {k}\n") f.write(f"{v}\n") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | import csv from io import TextIOWrapper def read_seqs(f: TextIOWrapper) -> dict: d = dict() current = "" for l in f.readlines(): if l[0] == ">": current = l[2:] d[current] = "" else: d[current] += l.strip() return d # for fp_stats, fp_reduced, fp_seq in zip(snakemake.input.stats, snakemake.output, snakemake.input.seqs): with open(str(snakemake.input.stats)) as f_stats, open( str(snakemake.output), "w" ) as f_reduced, open(str(snakemake.input.seqs)) as f_seq: r_stats = csv.reader(f_stats, delimiter="\t") stats_header = next(r_stats) keepers = list() for l in r_stats: if float(l[2]) < 0.50 and float(l[3]) > 0.0: keepers.append((int(l[0]), float(l[3]))) keepers.sort(key=lambda t: t[1]) # Get n lowest entropy columns keepers = keepers[: int(snakemake.params.bps)] keepers.sort( key=lambda t: t[0] ) # Order by column number to iterate through input fasta seqs = read_seqs(f_seq) for k, v in seqs.items(): f_reduced.write(f"> {k.strip()}\n") for col_num, entropy in keepers: f_reduced.write(f"{v[col_num-1]}") f_reduced.write("\n") |
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | from io import StringIO import collections import math import itertools def enumerate1(xs): for n, x in enumerate(xs): yield (n + 1), x def range1(stop): for n in range(stop): yield (n + 1) class MSA: def __init__(self, descs, cols): self.descs = descs self.cols = cols # Double check that length of each column is equal to number # of sequence descriptions len_descs = len(descs) for col in self.cols: assert len(col) == len_descs def filter(self, fcn): self.cols = [col for col in self.cols if fcn(col)] return self def filter_by_index(self, idxs, remove=False): idxs = set(idxs) if remove: idxs = [idx for idx in range1(len(self.cols)) if idx not in idxs] self.cols = [col for idx, col in enumerate1(self.cols) if idx in idxs] return self def map(self, fcn): return map(fcn, self.cols) column_stats_header = [ "column_position", "number_of_values", "gaps_proportion", "entropy", "consensus_value", "consensus_proportion", ] column_stats_fmt = "{0}\t{1}\t{2:1.2f}\t{3:1.4f}\t{4}\t{5:1.2f}" def column_stats(self): for col_position, col in enumerate1(self.cols): len_col = len(col) ngaps = col.count("-") nvals = len_col - ngaps if nvals == 0: yield { "column_position": col_position, "number_of_values": 0, "gaps_proportion": 1.0, "entropy": 0.0, "consensus_value": "-", "consensus_proportion": 1.0, } continue ctr = collections.Counter(col) del ctr["-"] cts = ctr.values() consensus_val, consensus_cts = ctr.most_common(1)[0] yield { "column_position": col_position, "number_of_values": nvals, "gaps_proportion": ngaps / len_col, "entropy": shannon(cts), "consensus_value": consensus_val, "consensus_proportion": consensus_cts / nvals, } @property def seqs(self): seqvals = map(strcat, zip(*self.cols)) yield from zip(self.descs, seqvals) @classmethod def from_seqs(cls, seqs): descs, seqvals = list(zip(*seqs)) cols = [ strcat(col_chars) for col_chars in itertools.zip_longest(*seqvals, fillvalue="-") ] return cls(descs, cols) def shannon(cts): cts = [c for c in cts if c > 0] # If we use the formula when h=0, python will return -0.0 if len(cts) == 1: return 0.0 total = sum(cts) props = [c / total for c in cts] h = -sum(p * math.log(p) for p in props) return h def strcat(xs): return "".join(xs) def parse_fasta(f, trim_desc=False): f = iter(f) desc = next(f).strip()[1:] if trim_desc: desc = desc.split()[0] seq = StringIO() for line in f: line = line.strip() if line.startswith(">"): yield desc, seq.getvalue() desc = line[1:] if trim_desc: desc = desc.split()[0] seq = StringIO() else: line = line.replace(" ", "").replace("U", "T").replace(".", "-") seq.write(line) yield desc, seq.getvalue() with open(str(snakemake.input)) as f_in, open(str(snakemake.output), "w") as f_out: seqs = parse_fasta(f_in) msa = MSA.from_seqs(seqs) f_out.write("\t".join(msa.column_stats_header)) f_out.write("\n") for stats_result in msa.column_stats(): stats_values = stats_result.values() f_out.write(msa.column_stats_fmt.format(*stats_values)) f_out.write("\n") |
48 49 | shell: "muscle -in {params.data}/merged-sequences/{wildcards.gene}.fasta -out {params.data}/aligned-sequences/{wildcards.gene}.fasta 2>&1 | tee {log}" |
59 60 | script: "scripts/reduce_alignments_stats.py" |
73 74 | script: "scripts/reduce_alignments.py" |
89 90 | shell: "raxmlHPC -s {input} -m {params.alg} -n {wildcards.gene} -p 392781 -w {params.out}/trees 2>&1 | tee {log}" |
98 99 100 101 102 103 104 105 106 107 108 109 | shell: """ if [ -d Astral/ ]; then echo "Astral already installed" else wget https://github.com/smirarab/ASTRAL/raw/master/Astral.5.7.8.zip unzip Astral.5.7.8.zip rm Astral.5.7.8.zip fi touch {output} """ |
123 124 125 126 127 | shell: """ cat {input.trees} > {output.merged} java -jar Astral/astral.5.7.8.jar -i {output.merged} -o {output.tree} 2>&1 | tee {log} """ |
143 144 | shell: "iqtree -s {params.out}/aligned-sequences/{params.ref}.fasta -pre {params.out}/trees/iqtree {params.alg} -g {input} 2>&1 | tee {log}" |
154 155 | script: "scripts/create_supermatrix.py" |
171 172 | shell: "raxmlHPC -s {input} -o {params.outgroup} -m {params.alg} -n supermatrix -p 392781 -w {params.out} 2>&1 | tee {log}" |
189 190 | shell: "raxmlHPC -s {params.out}/aligned-sequences/{params.ref}.fasta -o {params.outgroup} -m {params.alg} -t {input} -n outgroup -w {params.out} 2>&1 | tee {log}" |
206 207 | shell: "raxmlHPC -f I -m {params.alg} -t {input} -n midpoint -w {params.out} 2>&1 | tee {log}" |
Support
- Future updates