peptide matcher dataset workflow


This repository contains the workflow used to generate FASTA datasets compatible with Peptide Matcher, or with any other software able to parse the format. To download the datasets, go to

Code Snippets

# Snippet: assemble the annotated FASTA from the parsed per-residue CSVs
# and the matching UniProt records
import re
from pandas import read_csv
from os.path import basename, splitext
from Bio import SeqIO
from Bio.Seq import Seq
from sys import stderr
from itertools import groupby

# Snakemake injects inputs and outputs through the global `snakemake` object
parsed_files  = snakemake.input['parsed']
uniprot_files = snakemake.input['uniprot']
fasta_file    = str(snakemake.output)

# map record accession (file basename without extension) -> parsed CSV path
fnames = {}
for fname in parsed_files:
    record_id = splitext(basename(fname))[0]
    fnames[record_id] = fname

def compress(s):
    # run-length encode a string: 'HHHEEC' -> '3H2E1C'
    return ''.join(str(sum(1 for x in g)) + k for k, g in groupby(s))

def to_hex(decs):
    # encode a list of integers as two hex digits per value
    return ''.join('%02x' % x for x in decs)

# single-letter codes for the UniProt features annotated per residue
feature_types = {
    'transmembrane region': 'T',
    'signal peptide':       'S'
}
# key=value pairs (with optional evidence braces) in a UniProt description line
desc_re = re.compile(' ?([^=]+)=(.+?)( \{(.+?)\})?;')
with open(fasta_file, 'w') as fasta:
    for uniprot_file in uniprot_files:
        uniprot = SeqIO.parse(uniprot_file, 'swiss')
        record = next(uniprot)
        assert record.id in fnames, "Record %s not in alphafold" % record.id
        data = read_csv(fnames[record.id], index_col = 0)

        # per-residue map of transmembrane/signal-peptide state, restricted
        # to features with exact (non-fuzzy) end positions
        TMs = {}
        for feature in record.features:
            if feature.type in feature_types and type(feature.location.end).__name__ == 'ExactPosition':
                for i in range(feature.location.start - 1, feature.location.end):
                    TMs[i] = feature_types[feature.type]
        # per-residue tracks: accessibility, secondary structure,
        # model confidence and transmembrane state
        acc = []
        sst = []
        conf = []
        tm = []
        match = True
        for i in range(len(record.seq)):
            if i in data['res']:
                # positions covered by the AlphaFold model must agree with UniProt
                match = data['res'][i] == record.seq[i]
                if not match:
                    print("Sequence mismatch in %s" % record.id, file = stderr)
                    break
                acc.append(data['acc'][i])
                sst.append(data['sst'][i])
                conf.append(round(data['conf'][i]))
            else:
                # -1 / '_' mark residues absent from the model
                acc.append(-1)
                sst.append('_')
                conf.append(-1)
            tm.append(TMs[i] if i in TMs else '-')
        # keep only the primary name from the UniProt description
        descr = ''
        for key, val, *rest in desc_re.findall(record.description):
            if key == 'RecName: Full' or key == 'SubName: Full':
                descr = val
                break
        record.description = '%s confidence:%s secstruct:%s accessibility:%s' % (descr, to_hex(conf), compress(sst), to_hex(acc))
        if match:
            record.description += ' transmembrane:%s' % compress(tm)
        else:
            # on a sequence mismatch, fall back to the model's own residues
            record.seq = Seq(''.join(data['res']))
        SeqIO.write(record, fasta, 'fasta')
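
The encodings above pack every per-residue track into the FASTA description line: model confidence and accessibility as two hex digits per residue (to_hex), secondary structure and transmembrane state as run-length encoded strings (compress). A minimal, self-contained illustration with toy values (not taken from a real record):

from itertools import groupby

def compress(s):
    # run-length encode: 'HHHEEC' -> '3H2E1C'
    return ''.join(str(sum(1 for x in g)) + k for k, g in groupby(s))

def to_hex(decs):
    # two hex digits per value: [90, 85, 100] -> '5a5564'
    return ''.join('%02x' % x for x in decs)

print(compress('HHHEEC'))     # 3H2E1C
print(to_hex([90, 85, 100]))  # 5a5564
print(to_hex([-1]))           # -1

Note that the -1 sentinel for residues missing from the model still formats as two characters ('-1') under %02x, so the two-characters-per-residue alignment of the hex tracks is preserved.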
# Snippet: download the archived UniProt entry version that matches the
# sequence of the AlphaFold model
import pandas as pd
from urllib import request
import gzip
from Bio.SeqUtils import IUPACData
from Bio import SeqIO
from io import StringIO

hist_file = str(snakemake.input['history'])
pdb_files = snakemake.params['pdb_files']
output_file = str(snakemake.output)

accession = str(snakemake.wildcards['acc'])
date_max = pd.to_datetime(snakemake.config['uniprot']['date_max'])
host = str(snakemake.config['unisave']['host'])

# three-letter -> one-letter residue codes, with XAA mapped to X
residues = { k.upper(): v for k, v in IUPACData.protein_letters_3to1.items() }
residues['XAA'] = 'X'

def parse_pdb(pdb_file):
    # extract the chain A sequence and its UniProt coordinate range
    # from a gzipped PDB file
    start = stop = -1
    chain_seq = ''
    with gzip.open(pdb_file, 'rt') as pdb_fh:
        for line in pdb_fh:
            if line.startswith('ATOM'):
                atom = line[13:17].strip()
                chain = line[20:22].strip()
                if atom == 'N' and chain == 'A':  # one backbone N per residue
                    res = line[17:20]
                    chain_seq += residues[res]
            elif start < 0 and line.startswith('DBREF'):
                # DBREF gives the UniProt residue range covered by this model
                values = line.split()
                chain = values[2]
                if chain == 'A':
                    start = int(values[8]) - 1
                    stop  = int(values[9])
    return chain_seq, start, stop

def get_ver(version):
    # fetch one archived entry version from the UniSave REST service
    url = "{host}/{acc}?format={format}&versions={ver}".format(host = host, acc = accession, format = "txt", ver = version)
    with request.urlopen(url) as fp:
        txt = fp.read().decode('utf-8')
    return txt

# stitch overlapping model fragments together by absolute UniProt position
seq_dict = {}
for pdb_file in pdb_files:
    pdb_seq, start, stop = parse_pdb(pdb_file)
    for i in range(stop - start):
        seq_dict[i + start] = pdb_seq[i]
seq = ''.join(v for k, v in sorted(seq_dict.items()))

data = pd.read_csv(hist_file, sep = "\t")
data['Date'] = pd.to_datetime(data['Date'])

# entry versions up to the cutoff date, one per sequence version
versions = data[data['Date'] <= date_max].drop_duplicates('Sequence version')

# walk the versions until one matches the sequence of the AlphaFold model
found = False
for index, row in versions.iterrows():
    txt = get_ver(row['Entry version'])
    swiss = SeqIO.parse(StringIO(txt), 'swiss')
    record = next(swiss)
    if record.seq == seq:
        found = True
        break
assert found, "Matching sequence for pdb {acc} not found".format(acc = accession)
with open(output_file, 'w') as fp:
    fp.write(txt)
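
AlphaFold DB ships models of very long proteins as several overlapping fragment files, which is why parse_pdb returns a UniProt coordinate range and the loop above merges fragments position by position before comparing against archived entry versions. A toy sketch of the same merge, with hypothetical fragments:

# two overlapping fragments as (sequence, start, stop) in UniProt coordinates
fragments = [('MKT', 0, 3), ('KTLV', 1, 5)]

seq_dict = {}
for pdb_seq, start, stop in fragments:
    for i in range(stop - start):
        seq_dict[i + start] = pdb_seq[i]  # overlaps are simply overwritten

print(''.join(v for k, v in sorted(seq_dict.items())))  # MKTLV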
# Snippet: merge DSSP and PDB annotations into a per-residue CSV
import gzip
from pandas import read_fwf
from collections import defaultdict
from Bio.SeqUtils import IUPACData

pdb_files  = snakemake.params['pdb_files']
dssp_files = snakemake.input['dssp_files']
output_file = str(snakemake.output)

# fixed-width column spans of a classic DSSP output record
dssp_specs = {
    'resnumber': (0, 5),
    'resname': (5, 10),
    'chain': (10, 12),
    'aminoacid': (12, 14),
    'secstruct': (14, 17),
    'turns_helix_3': (17, 19),
    'turns_helix_4': (19, 20),
    'turns_helix_5': (20, 21),
    'geometrical_bend': (21, 22),
    'chirality': (22, 23),
    'beta_bridge_label_1': (23, 24),
    'beta_bridge_label_2': (24, 25),
    'beta_bridge_partner_resnum_1': (25, 29),
    'beta_bridge_partner_resnum_2': (29, 33),
    'beta_sheet_label': (33, 34),
    'accessibility': (34, 38),
    'N_H___O_00': (38, 45),
    'N_H___O_01': (45, 50),
    'O___N_H_00': (50, 56),
    'O___N_H_01': (57, 61),
    'N_H___O_10': (61, 67),
    'N_H___O_11': (68, 72),
    'O___N_H_10': (72, 78),
    'O___N_H_11': (79, 83),
    'tco': (83, 91),
    'kappa': (91, 97),
    'alpha': (97, 103),
    'phi': (103, 109),
    'psi': (109, 115),
    'x_ca': (115, 122),
    'y_ca': (122, 129),
    'z_ca': (129, 136)
}
# maximum ASA per residue in Å² (theoretical set of Tien et al. 2013)
maxasa_theoretical = {
    'A': 129,
    'C': 167,
    'D': 193,
    'E': 223,
    'F': 240,
    'G': 104,
    'H': 224,
    'I': 197,
    'K': 236,
    'L': 201,
    'M': 224,
    'N': 195,
    'P': 159,
    'Q': 225,
    'R': 274,
    'S': 155,
    'T': 172,
    'V': 174,
    'W': 285,
    'Y': 263
}
# corresponding empirical MaxASA set (kept for reference; not used below)
maxasa_empirical = {
    'A': 121,
    'C': 148,
    'D': 187,
    'E': 214,
    'F': 228,
    'G':  97,
    'H': 216,
    'I': 195,
    'K': 230,
    'L': 191,
    'M': 203,
    'N': 187,
    'P': 154,
    'Q': 214,
    'R': 265,
    'S': 143,
    'T': 163,
    'V': 165,
    'W': 264,
    'Y': 255
}

# three-letter -> one-letter residue codes, with XAA mapped to X
residues = { k.upper(): v for k, v in IUPACData.protein_letters_3to1.items() }
residues['XAA'] = 'X'

def parse_dssp(fname):
    colspecs = list(dssp_specs.values())
    names = list(dssp_specs.keys())
    with open(fname) as dssp:
        # skip the DSSP header: its lines end with a period,
        # the column-header line of the residue table does not
        for line in dssp:
            if not line.rstrip().endswith('.'):
                break
        data = read_fwf(dssp, colspecs = colspecs, names = names)
    secstruct = []
    dssp_seq = ''
    acc = []
    for row in data.itertuples():
        if row.chain == 'A':
            aminoacid = str(row.aminoacid)
            struct = str(row.secstruct)
            asa = int(row.accessibility)
            # relative solvent accessibility as a percentage of the theoretical maximum
            rsa = round(asa / maxasa_theoretical[aminoacid] * 100)
            secstruct.append(struct if struct != 'nan' else '-')
            dssp_seq += aminoacid
            acc.append(str(rsa))
    return dssp_seq, secstruct, acc

def parse_pdb(pdb_file):
    # like the downloader's parse_pdb, but also collects the B-factor column,
    # which in AlphaFold models holds the per-residue pLDDT confidence
    start = stop = -1
    chain_seq = ''
    confs = []
    with gzip.open(pdb_file, 'rt') as pdb_fh:
        for line in pdb_fh:
            if line.startswith('ATOM'):
                atom = line[13:17].strip()
                chain = line[20:22].strip()
                if atom == 'N' and chain == 'A':
                    res = line[17:20]
                    conf = float(line[60:66])
                    chain_seq += residues[res]
                    confs.append(conf)
            elif start < 0 and line.startswith('DBREF'):
                values = line.split()
                chain = values[2]
                if chain == 'A':
                    start = int(values[8]) - 1
                    stop  = int(values[9])
    return chain_seq, start, stop, confs

with open(output_file, 'w') as output:
    num_wins = len(dssp_files)
    accs  = {}
    ssts  = {}
    confs = {}
    seq   = {}
    count = defaultdict(int)
    for win in range(num_wins):
        dssp_seq, win_ssts, win_accs = parse_dssp(dssp_files[win])
        pdb_seq, start, stop, win_confs = parse_pdb(pdb_files[win])
        assert dssp_seq == pdb_seq, "Sequence mismatch between dssp and pdb"
        for i in range(stop - start):
            pos = i + start
            count[pos] += 1
            if pos in seq:
                assert seq[pos] == dssp_seq[i], "Sequence mismatch between windows"
            else:
                seq[pos] = dssp_seq[i]
            # where fragment windows overlap, keep the annotation from the
            # window with the highest pLDDT confidence at this position
            if pos not in confs or win_confs[i] > confs[pos]:
                confs[pos] = win_confs[i]
                accs[pos]  = win_accs[i]
                ssts[pos]  = win_ssts[i]
    output.write('pos,res,count,acc,sst,conf\n')
    for pos, num in sorted(count.items()):
        output.write('%d,%s,%d,%s,%s,%.2f\n' % (pos, seq[pos], num, accs[pos], ssts[pos], confs[pos]))
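
The acc values written here are relative solvent accessibilities: the absolute accessible surface area reported by DSSP, divided by the residue's maximum ASA from the maxasa_theoretical table and expressed as a percentage. A worked example for a single alanine (toy ASA value):

asa = 65                         # absolute ASA from DSSP, in Å²
maxasa = 129                     # theoretical MaxASA for alanine ('A')
rsa = round(asa / maxasa * 100)  # -> 50, i.e. roughly half exposed
print(rsa)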
shell:
    "wget -O {output} '{config[unisave][host]}/{wildcards.acc}?download=true&format=tsv'"

shell:
    "mkdir -p {output} && curl -sL {params.url:q} | tar xf - -C {output} --wildcards --no-anchored '*.pdb.gz'"

script:
    "scripts/dload_version.py"

shell:
    "gzip -cd {params.pdb_file} | dssp -i /dev/stdin -o {output}"

script:
    "scripts/parse.py"

script:
    "scripts/collect.py"

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/OKLAB2016/peptide-matcher-data
Name: peptide-matcher-data
Version: v4
Copyright: Public Domain
License: None