Sequence Post-processing Workflow for NNTC HIV env Sequences

public public 1yr ago 0 bookmarks

Overview

Sequence post-processing of NNTC env sequences from brain and lymphoid tissues. SGA consensus sequences used as input to the workflow are found at resources/sequences.fa. Output from the geno2pheno coreceptor prediction tool is also produced by the workflow.

Code Snippets

8
9
shell:
    "mafft {input} > {output}"
8
9
shell:
    "fasttree -nt {input} > {output}"
12
13
script:
   "../scripts/functional_filter.jl"
5
script: "../scripts/hypermut2.py"
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
run:
   # Build a simple per-column majority-rule consensus from an aligned FASTA
   # and prepend it to the input records. Runs inside a Snakemake rule:
   # `input`/`output` are injected by Snakemake, so this is not standalone code.
   from Bio import SeqIO, Seq, SeqRecord
   from collections import Counter
   import numpy as np

   # for simple consensus, ignoring ties
   def mode(collection):
      # Most common element; Counter.most_common(1) breaks ties by
      # first-encountered order, so ties resolve arbitrarily but deterministically.
      c = Counter(collection)
      return c.most_common(1)[0][0]

   recs = list(SeqIO.parse(str(input), "fasta"))
   # One row per sequence, one column per alignment position.
   # NOTE(review): assumes all input sequences are the same length (i.e. the
   # input is already aligned) — np.array would otherwise produce a ragged array.
   seqs = [list(r.seq.upper()) for r in recs]
   seq_array = np.array(seqs)

   cons = []
   # Column-wise majority call across the alignment.
   for i in range(seq_array.shape[1]):
      cons += list(mode(seq_array[:,i]))
   cons_record = SeqRecord.SeqRecord(Seq.Seq("".join(cons)), id = "CONSENSUS", description = "")
   # Consensus goes first so downstream tools can treat record 0 as the reference.
   SeqIO.write([cons_record] + recs, output[0], "fasta")
35
36
37
38
run:
   # Concatenate all per-sample TSV tables listed in the rule's `input`
   # into a single TSV at `output[0]`. `input`/`output` are Snakemake-injected.
   import pandas as pd
   # sep='\t' is read_table's default but is kept explicit for clarity.
   dfs = [pd.read_table(f, sep = '\t') for f in input]
   pd.concat(dfs).to_csv(output[0], sep = '\t', index = False)
8
9
shell:
    "mafft {input} > {output}"
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
run:
    # Split a combined FASTA by donor: keep only records whose ID starts with
    # the donor code matching `wildcards.donor` (pattern "NNTC_##").
    # `input`/`output`/`wildcards` are injected by Snakemake.
    from Bio import SeqIO
    import sys
    import re
    import os

    out_recs = []
    # NOTE(review): `out_dir` is computed but never used below; the `sys`
    # import also appears unused — candidates for cleanup.
    out_dir = os.path.dirname(str(output))

    with open(str(input), 'r') as io:
        for r in SeqIO.parse(io, "fasta"):
            # NOTE(review): re.match returns None for IDs not starting with
            # "NNTC_" + two digits, which would raise AttributeError on
            # .group() — assumes every record ID follows that convention.
            donor = re.match("NNTC_[0-9]{2}", r.id).group()
            if donor == wildcards.donor:
                out_recs.append(r)

    SeqIO.write(out_recs, str(output),"fasta")
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import re
import sys
from scipy.stats import fisher_exact
import pandas as pd

# Odds-ratio value reported when the control denominator (or mutation
# denominator) is zero — see the ZeroDivisionError handler in hypermut().
inf = float('inf')

# Each IUPAC ambiguity code maps to a regex character class containing the
# code itself plus every base/code it is compatible with, so an ambiguous
# reference base can match an ambiguous query base.
IUPAC_CODES = {
    'R': r'[RAG]',
    'Y': r'[YCT]',
    'B': r'[BYSKCGT]',
    'D': r'[DRWKAGT]',
    'H': r'[HYWMACT]',
    'V': r'[VRSMAGC]',
    'N': r'[NRYBDHVWSKMACGT]',
    'W': r'[WAT]',
    'S': r'[SCG]',
    'K': r'[KGT]',
    'M': r'[MAC]',
}

# Matches any single valid nucleotide character (ACGT or an IUPAC ambiguity
# code); gaps and other symbols do not match, so such columns are skipped.
VALID_NA_PATTERN = re.compile(r'[NRYBDHVWSKMACGT]')


def expand_iupac(pattern):
    """Expand IUPAC ambiguity codes in *pattern* into regex character classes.

    Each character that is an IUPAC code is replaced by its character class
    from IUPAC_CODES; every other character is passed through unchanged.
    """
    return ''.join(IUPAC_CODES.get(char, char) for char in pattern)


# Anchored context patterns for APOBEC3G-style G->A hypermutation analysis:
# 'hypermut_*' match a G (ref) / A (query) followed by the mutation-prone
# context RD; 'control_*' match the complementary control contexts (YN | RC).
# All patterns are '^'-anchored, so they are applied to seq[offset:] slices.
DEFAULT_PATTERNS = {
    'hypermut_from': re.compile(expand_iupac(r'^G(?=RD)')),
    'hypermut_to': re.compile(expand_iupac(r'^A(?=RD)')),
    'control_from': re.compile(expand_iupac(r'^G(?=YN|RC)')),
    'control_to': re.compile(expand_iupac(r'^A(?=YN|RC)')),
}


def fasta_reader(filename):
    """Yield (id, sequence) pairs from a FASTA file.

    The id is the header text after '>' up to the first space; sequences are
    uppercased and joined across wrapped lines. Lines starting with '#' are
    ignored.
    """
    header = None
    chunks = []
    with open(filename) as handle:
        for raw in handle:
            if raw.startswith('#'):
                continue
            if raw.startswith('>'):
                # Flush the previous record before starting a new one.
                if chunks:
                    yield header, ''.join(chunks).upper()
                header = raw[1:].strip().split(' ')[0]
                chunks = []
            else:
                chunks.append(raw.strip())
    # Emit the final record, if any.
    if chunks:
        yield header, ''.join(chunks).upper()


def find_sites(seq, pattern, site_range):
    """Return the offsets in *site_range* where *pattern* matches seq[offset:].

    Patterns are '^'-anchored, so each candidate offset is tested by searching
    the suffix of *seq* starting at that offset.
    """
    return [offset for offset in site_range if pattern.search(seq[offset:])]


def get_comparable_sites(refseq, naseq):
    """Return offsets where both reference and query hold a valid nucleotide.

    A position is comparable only if the characters in both sequences match
    VALID_NA_PATTERN (ACGT or IUPAC ambiguity codes); gap or other characters
    exclude that column.
    """
    return [
        pos
        for pos, (ref_na, qry_na) in enumerate(zip(refseq, naseq))
        if VALID_NA_PATTERN.match(ref_na) and VALID_NA_PATTERN.match(qry_na)
    ]


def hypermut(refseq, naseq, patterns=DEFAULT_PATTERNS):
    """Test *naseq* for G->A hypermutation relative to *refseq*.

    Counts reference positions in the mutation-prone context vs. the control
    context, how many of each were G->A substituted in the query, and runs a
    one-sided (greater) Fisher's exact test on the resulting 2x2 table.

    Returns a 6-tuple: (matched muts, potential muts, matched controls,
    potential controls, odds ratio, p-value). The odds ratio is +inf when a
    denominator is zero.
    """
    sites = get_comparable_sites(refseq, naseq)
    mut_candidates = find_sites(refseq, patterns['hypermut_from'], sites)
    ctrl_candidates = find_sites(refseq, patterns['control_from'], sites)
    # Only candidate positions can be "matched" in the query.
    mut_hits = find_sites(naseq, patterns['hypermut_to'], mut_candidates)
    ctrl_hits = find_sites(naseq, patterns['control_to'], ctrl_candidates)

    n_mut = len(mut_hits)
    n_mut_pot = len(mut_candidates)
    n_ctrl = len(ctrl_hits)
    n_ctrl_pot = len(ctrl_candidates)

    try:
        oddsratio = (n_mut / n_mut_pot) / (n_ctrl / n_ctrl_pot)
    except ZeroDivisionError:
        # Zero denominator (no candidates, or no control hits) -> report +inf.
        oddsratio = inf

    table = [
        [n_mut, n_mut_pot - n_mut],
        [n_ctrl, n_ctrl_pot - n_ctrl],
    ]
    _, p = fisher_exact(table, 'greater')

    return (n_mut, n_mut_pot, n_ctrl, n_ctrl_pot, oddsratio, p)


def main():
    """Run hypermut() on every query sequence and write a TSV of results.

    Reads the FASTA named by snakemake.input[0]; the first record is taken as
    the reference and the remaining records are each tested against it.
    The `snakemake` object is injected by Snakemake's `script:` directive, so
    this script is not runnable standalone.
    """
    fasta_filename = snakemake.input[0]
    sequences = list(fasta_reader(fasta_filename))
    # First record is the reference (e.g. the prepended CONSENSUS record).
    _, refseq = sequences.pop(0)
    # NOTE(review): "n_potential_cntrl" looks like a typo for "n_potential_ctrl",
    # but it is part of the output schema — confirm downstream consumers before renaming.
    cols = ["ID", "n_matched", "n_potential", "n_matched_ctrl", "n_potential_cntrl", "oddsratio", "p"]
    res = []
    for naid, naseq in sequences:
        res.append([naid] + list(hypermut(refseq, naseq, DEFAULT_PATTERNS)))
    df = pd.DataFrame(res, columns = cols)
    df.to_csv(snakemake.output[0], sep='\t', index = False)


if __name__ == '__main__':
    main()
ShowHide 4 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/alecpnkw/nntc_hiv_brain_lymphoid_reservoirs
Name: nntc_hiv_brain_lymphoid_reservoirs
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...