"Proteogenomic Approach for Enhanced Virus Identification in Proteomic Analysis"


Goal of this Pipeline

In proteomic analysis, viruses often yield low identification rates, especially at the strain level. This pipeline uses a proteogenomic, multi-stage search approach to overcome this problem.

It consists of several stages: host and contaminant spectra are filtered out first; a first database search against a viral reference database is used to map and score candidate taxa; genomes of the top-scoring taxIDs are fetched from NCBI and six-frame translated into a proteogenomic ORF database; and a final search against this database resolves identifications down to species and strain level, followed by strain-level scoring and summary plots.
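
The following overview is only a reading aid inferred from the snippets on this page, mapping each stage to the tools and scripts that implement it; the stage names are informal and this is not part of the workflow code.

# Informal overview (not workflow code): stage order of the multi-stage search
# and the tools/scripts from the snippets below that implement each stage.
STAGES = [
    ("host/contaminant filtering", "SearchGUI + PeptideShaker vs. host + cRAP, then scripts/host_filtering.py"),
    ("first search",               "SearchGUI + PeptideShaker vs. the viral reference database"),
    ("taxonomic mapping/scoring",  "scripts/taxIDMapping.py"),
    ("genome fetching",            "scripts/fetch_data.py (NCBI Entrez)"),
    ("six-frame translation",      "scripts/genome2proteome.py (sixpack), concat_proteomes.py, filter_duplicate_ORFs.py"),
    ("final search",               "SearchGUI + PeptideShaker vs. the proteogenomic ORF database"),
    ("strain-level reporting",     "scripts/get_species_strain.py, concat_results.py, create_plots.py"),
]

for i, (stage, tools) in enumerate(STAGES, 1):
    print(f"{i}. {stage}: {tools}")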

Code Snippets

shell: 
     "cat {input.protacc2taxids00} {input.protacc2taxids01} {input.protacc2taxids02} > {output.protacc2taxids_virus}"
shell:
   "awk '{{print $1}}' {input.protacc2taxids_virus} > {output.accessions}"
shell:
     "awk '{{print $2}}' {input.protacc2taxids_virus} > {output.taxids}"
script:
     "../scripts/hashDatabase.py"
script: 
     "../scripts/concat_results.py"
script:
    "../scripts/fetch_data.py"
script:
    "../scripts/get_species_strain.py"
script: 
    "../scripts/ref_filtering.py"
shell: 
    "java -cp {params.searchgui} eu.isas.searchgui.cmd.SearchCLI -spectrum_files {input.mgf} -fasta_file {input.proteomes_decoy_fasta} -output_folder {params.result_dir} -id_params {input.par} -output_default_name {params.refname}_searchgui_out -psm_fdr {params.psm_fdr} -peptide_fdr {params.peptide_fdr} -protein_fdr {params.protein_fdr} {params.search_engine} 1 -threads {threads} > {log.stdout_log} 2> {log.stderr_log}"
shell:
    "java {params.extra} -cp {params.peptideshaker} eu.isas.peptideshaker.cmd.PeptideShakerCLI -reference {params.refname} -fasta_file {input.proteomes_decoy_fasta} -identification_files {input.searchgui_zip} -spectrum_files {input.mgf} -out {output.peptide_shaker_psdb} -threads {threads} -log {log.peptide_shaker_log} > {log.stdout_log} 2> {log.stderr_log}" 
shell:
    "java {params.extra} -cp {params.peptideshaker} eu.isas.peptideshaker.cmd.ReportCLI -in {input.peptide_shaker_psdb} -out_reports {params.out_dir} -reports 3 > {log.stdout_log} 2> {log.stderr_log}" 
shell: 
    "java -cp {params.searchgui} eu.isas.searchgui.cmd.FastaCLI -in {input.ref} -decoy > {log.stdout_log} 2> {log.stderr_log}"
shell: 
    "java -cp {params.searchgui} eu.isas.searchgui.cmd.SearchCLI -spectrum_files {input.mgf} -fasta_file {input.ref_decoy_fasta} -output_folder {params.result_dir} -id_params {input.par} -output_default_name {params.refname}_searchgui_out -psm_fdr {params.psm_fdr} -peptide_fdr {params.peptide_fdr} -protein_fdr {params.protein_fdr} {params.search_engine} 1 -threads {threads} > {log.stdout_log} 2> {log.stderr_log}"
shell:
    "java -cp {params.peptideshaker} eu.isas.peptideshaker.cmd.PeptideShakerCLI -reference {params.refname} -fasta_file {input.ref_decoy_fasta} -identification_files {input.searchgui_zip} -spectrum_files {input.mgf} -out {output.peptide_shaker_psdb} -threads {threads} -log {log.peptide_shaker_log} > {log.stdout_log} 2> {log.stderr_log}" 
shell:
    "java -cp {params.peptideshaker} eu.isas.peptideshaker.cmd.ReportCLI -in {input.peptide_shaker_psdb} -out_reports {params.out_dir} -reports 3 > {log.stdout_log} 2> {log.stderr_log}" 
shell:
    "unzip -u {input.searchgui_zip} -d {params.out_dir} > {log.stdout_log}"
script:
    "../scripts/ms2rescore_config.py"
script:
    "../scripts/genome2proteome.py"
script:
    "../scripts/concat_proteomes.py"
shell: 
    "java -cp {params.searchgui} eu.isas.searchgui.cmd.FastaCLI -in {input.proteomes} -decoy > {log.stdout_log} 2> {log.stderr_log}"
script:
    "../scripts/filter_duplicate_ORFs.py"
shell:
    "cat {input} > {output}"
shell: 
    "java -cp {params.searchgui} eu.isas.searchgui.cmd.FastaCLI -in {input.concat_db} -decoy > {log.stdout_log} 2> {log.stderr_log}"
shell: 
    "java -cp {params.searchgui} eu.isas.searchgui.cmd.SearchCLI -spectrum_files {input.mgf} -fasta_file {input.decoy_crap_host} -output_folder {params.result_dir} -id_params {input.par} -output_default_name {params.hostname}_searchgui_out -psm_fdr {params.psm_fdr} -peptide_fdr {params.peptide_fdr} -protein_fdr {params.protein_fdr} {params.search_engine} 1 -threads {threads} > {log.stdout_log} 2> {log.stderr_log}"
shell:
    "java -cp {params.peptideshaker} eu.isas.peptideshaker.cmd.PeptideShakerCLI -reference {params.hostname} -fasta_file {input.decoy_crap_host} -identification_files {input.searchgui_zip} -spectrum_files {input.mgf} -out {output.peptide_shaker_psdb} -threads {threads} -log {log.peptide_shaker_log} > {log.stdout_log} 2> {log.stderr_log}" 
shell:
    "java -cp {params.peptideshaker} eu.isas.peptideshaker.cmd.ReportCLI -in {input.peptide_shaker_psdb} -out_reports {params.out_dir} -reports 3 > {log.stdout_log} 2> {log.stderr_log}" 
script: 
    "../scripts/host_filtering.py"
script: 
    "../scripts/taxIDMapping.py"
script:
    "../scripts/create_plots.py"

scripts/concat_proteomes.py

import os
import shutil

concat_proteomes = snakemake.output[0]
res_dir = str(snakemake.params[0])
sample_name = str(snakemake.params[1])

cwd = str(os.getcwd())

full_in_path = f"{cwd}/{res_dir}/{sample_name}/sixpack"

included_extensions = ["fasta"]
file_names = [
    fn for fn in os.listdir(full_in_path) if any(fn.endswith(ext) for ext in included_extensions)
]

with open(concat_proteomes, "w") as out_file:
    for file in file_names:
        with open(f"{full_in_path}/{file}", "r") as f:
            shutil.copyfileobj(f, out_file)

scripts/concat_results.py

import pandas as pd

firstDBSearch = snakemake.input[0]
finalDBSearch = snakemake.input[1]

concat_res = snakemake.output[0]


df1 = pd.read_csv(firstDBSearch, sep="\t", index_col=0)
df2 = pd.read_csv(finalDBSearch, sep="\t", index_col=0)
merged_df = pd.concat([df1, df2])
merged_df.reset_index(drop=True, inplace=True)

merged_df.to_csv(concat_res, sep="\t", header=True, index=True)

scripts/create_plots.py

import pandas as pd
import matplotlib.pyplot as plt


def CreateStrainCountBarPlot(in_file, out_file):
    df = pd.read_csv(in_file, delimiter="\t")

    df["strain"] = df["strain"].astype(str)
    df["isolate"] = df["isolate"].astype(str)
    df["strain_isolate"] = df["strain"].replace("nan", "") + "_" + df["isolate"].replace("nan", "")
    df["strain_isolate"] = df["strain_isolate"].str.strip("_")
    df['strain_isolate'] = df["strain_isolate"].replace("", "NaN")

    counts = df["counts"]
    strain_isolate = df["strain_isolate"]

    fig, ax = plt.subplots()
    ax.bar(strain_isolate, counts)

    # Add labels and title
    ax.set_xlabel("Strain") 
    ax.set_ylabel("Counts")
    ax.set_title("Bar Plot of Counts vs Strain/Isolate")

    plt.xticks(rotation=90)
    plt.savefig(out_file, dpi=300, bbox_inches="tight")
    plt.clf()


def CreateStrainConfidenceBarPlot(in_file, out_file):
    df = pd.read_csv(in_file, delimiter="\t")

    df["strain"] = df["strain"].astype(str)
    df["isolate"] = df["isolate"].astype(str)
    df["strain_isolate"] = df["strain"].replace("nan", "") + "_" + df["isolate"].replace("nan", "")
    df["strain_isolate"] = df["strain_isolate"].str.strip("_")
    df['strain_isolate'] = df["strain_isolate"].replace("", "NaN")

    mean_confidence = df["mean_confidence"]
    strain_isolate = df["strain_isolate"]

    fig, ax = plt.subplots()
    ax.bar(strain_isolate, mean_confidence)

    # Add labels and title
    ax.set_xlabel("Strain") 
    ax.set_ylabel("Mean Confidence")
    ax.set_title("Bar Plot of Mean Confidence vs Strain/Isolate")

    plt.xticks(rotation=90)
    plt.savefig(out_file, dpi=300, bbox_inches="tight")
    plt.clf()


def CreateTaxIdScoresBarPlot(in_file, out_file):
    df = pd.read_csv(in_file, sep="\t")

    taxids = df["taxid"].astype(str)[:20]
    weights = df["weight"][:20]

    plt.bar(taxids, weights)
    plt.xlabel("Tax ID")
    plt.ylabel("Weight")
    plt.title("Tax ID vs Weight")

    plt.xticks(rotation=90)
    plt.savefig(out_file, dpi=300, bbox_inches="tight")
    plt.clf()


def CreateProportionsPieChart(first_search, final_search, out_file):
    df1 = pd.read_csv(first_search, sep="\t", on_bad_lines="skip", usecols=["Protein(s)"])
    df1.columns = ["Peptide"]
    df1["Peptide"] = df1.Peptide.apply(lambda x: x.split(","))
    report_df_1 = df1.explode("Peptide", ignore_index=True)

    df2 = pd.read_csv(final_search, sep="\t", on_bad_lines="skip", usecols=["Protein(s)"])
    df2.columns = ["ORF"]
    df2["ORF"] = df2.ORF.apply(lambda x: x.split(","))
    report_df_2 = df2.explode("ORF", ignore_index=True)

    num_entries = [len(report_df_1), len(report_df_2)]
    labels = ["First Search", "Final Search"]

    def percent_and_amount(x):
        amount = int(round(x/100.0 * sum(num_entries), 0))
        p = "{:.1f}%  ({:d})".format(x, amount)
        return p

    plt.pie(num_entries, labels=labels, autopct=percent_and_amount)
    plt.title("Number of PSMs")
    plt.savefig(out_file, dpi=300, bbox_inches="tight")
    plt.clf()


def CreateConfidenceHistogram(in_file, out_file):
    df = pd.read_csv(in_file, sep="\t", on_bad_lines="skip")
    df.rename(columns={"Protein(s)": "Proteins"}, inplace=True)
    df["Proteins"] = df.Proteins.apply(lambda x: x.split(","))
    exploded_df = df.explode("Proteins", ignore_index=True)

    plt.hist(exploded_df["Confidence [%]"], bins=20)
    plt.xlabel("Confidence")
    plt.ylabel("Count")
    plt.title("Confidence Histogram")
    plt.savefig(out_file, dpi=300, bbox_inches="tight")
    plt.clf()


def main():
    strain_mappings = snakemake.input[0]
    taxID_scores = snakemake.input[1]
    first_search = snakemake.input[2]
    final_search = snakemake.input[3]

    strain_counts_bar_plot = snakemake.output[0]
    strain_conf_bar_plot = snakemake.output[1]
    taxIdScores_bar_plot = snakemake.output[2]
    proportions_pie_chart = snakemake.output[3]
    first_search_confidence_histogram = snakemake.output[4]
    final_search_confidence_histogram = snakemake.output[5]

    CreateStrainCountBarPlot(strain_mappings, strain_counts_bar_plot)
    CreateStrainConfidenceBarPlot(strain_mappings, strain_conf_bar_plot)
    CreateTaxIdScoresBarPlot(taxID_scores, taxIdScores_bar_plot)
    CreateProportionsPieChart(first_search, final_search, proportions_pie_chart)
    CreateConfidenceHistogram(first_search, first_search_confidence_histogram)
    CreateConfidenceHistogram(final_search, final_search_confidence_histogram)


if __name__ == "__main__":
    main()

scripts/fetch_data.py

import pandas as pd
from ete3 import NCBITaxa
from Bio import Entrez, SeqIO


def fetchGenomes(all_records, taxon_id_list, max_sequence_length, sequence_length_diff):
    used_seqs = 0
    for id in range(len(taxon_id_list)):
        handle = Entrez.efetch(db="nucleotide", id=taxon_id_list[id], rettype="gb", retmode="text")
        record = SeqIO.read(handle, "genbank")
        print(f"{id+1}/max{len(taxon_id_list)}; Sequence ID:", taxon_id_list[id], "Sequence Length:", len(record.seq))
        if len(record.seq) > max_sequence_length:
            print(f"Sequence with ID {taxon_id_list[id]} is longer than allowed. It will be skipped!")                
            continue
        elif (used_seqs > 0) and (len(all_records[used_seqs-1].seq) >= (len(record.seq) * sequence_length_diff)):
            print("Abort query here since the difference in sequence length is too big!")
            break
        else:
            all_records.append(record)
            used_seqs += 1
    return all_records


def getTaxIdsToQuery(mapped_taxids, number_taxids, weight_diff):

    taxid_df = pd.read_csv(mapped_taxids, sep="\t", header=0, index_col=False)
    taxids = taxid_df["taxid"].to_list()
    relevant_taxids = taxids[:number_taxids]
    taxids_to_query = []

    for i in range(len(relevant_taxids)):
        if i == 0:
            taxids_to_query.append(relevant_taxids[i])
        else:
            previous_taxid = taxid_df.loc[taxid_df["taxid"] == relevant_taxids[i-1]]
            previous_taxid_score = previous_taxid["weight"].values[0]
            current_taxid = taxid_df.loc[taxid_df["taxid"] == relevant_taxids[i]]
            current_taxid_score = current_taxid["weight"].values[0]

            if previous_taxid_score <= (current_taxid_score * weight_diff):
                taxids_to_query.append(relevant_taxids[i])
            else:
                break

    print("TaxIDs used: ", taxids_to_query)
    return taxids_to_query


def fetchData(taxids_to_query, max_number_accessions, max_sequence_length, sequence_length_diff):
    all_records = []
    for taxid in taxids_to_query:
        print(f"Querying for TaxId: {taxid}")
        search_query = f"txid{taxid}[Organism]"
        search_handle = Entrez.esearch(db="nucleotide", term=search_query, retmax=max_number_accessions, sort="Sequence Length")
        taxon_id_list = Entrez.read(search_handle)["IdList"]

        try:
            all_records = fetchGenomes(all_records, taxon_id_list, max_sequence_length, sequence_length_diff)
        except TypeError:
            print(f"No Sequences found for {taxid}!")
    return all_records


def fetchDataNCBI(taxids_to_query, max_number_accessions, max_sequence_length, sequence_length_diff):
    ncbi = NCBITaxa()
    all_records = []
    print("Using the NCBI Querying!")

    for taxid in taxids_to_query:
        print(f"Querying for TaxId: {taxid}")
        descendants = ncbi.get_descendant_taxa(taxid)
        descendant_names = ncbi.translate_to_names(descendants)

        for name in descendant_names:
            search_query = f"{name}"
            print(f"Querying for Taxon Name: {search_query}")
            search_handle = Entrez.esearch(db="nucleotide", term=search_query, retmax=max_number_accessions, sort="Sequence Length")
            taxon_id_list = Entrez.read(search_handle)["IdList"]

            try:
                all_records = fetchGenomes(all_records, taxon_id_list, max_sequence_length, sequence_length_diff)
            except TypeError:
                print(f"No Sequences found for {name}!")
    return all_records


def getStrainNames(all_records):
    df = pd.DataFrame(columns=["genbank_accession", "species", "strain", "isolate"])
    # get df with strain names...
    for record in all_records:
        accession = record.id
        species = record.features[0].qualifiers["organism"][0]
        try:
            strain = record.features[0].qualifiers["strain"][0]
        except KeyError:
            strain = "NaN"
        try:
            isolate = record.features[0].qualifiers["isolate"][0]
        except KeyError:
            isolate = "NaN"
        new_df = pd.DataFrame({"genbank_accession": accession, "species": species, "strain": strain, "isolate": isolate}, index=[0])
        df = pd.concat([df, new_df])

    # get unique strains...
    unique_df = df.drop_duplicates(subset=["genbank_accession", "species", "strain", "isolate"], keep="first").reset_index(drop=True)
    unique_strain_records = [record for record in all_records if (record.id in unique_df["genbank_accession"].to_list())]

    return unique_df, unique_strain_records


def writeFiles(unique_df, unique_strain_records, out_file_df, out_file_fasta):
    # filter problematic records
    record_to_filter = []
    for record in range(len(unique_strain_records)):
        try:
            unique_strain_records[record].seq[0]
        except Exception:  # e.g. records with an undefined sequence
            record_to_filter.append(record)

    record_to_filter.sort(reverse=True)
    for record in record_to_filter:
        print(f"removing problematic sequence record {unique_strain_records[record]}")
        unique_strain_records.pop(record)
    # write files...
    unique_df.to_csv(out_file_df, index=False, sep="\t")
    SeqIO.write(unique_strain_records, out_file_fasta, "fasta")


def main():
    mapped_taxids = snakemake.input[0]
    out_file_df = snakemake.output[0]
    out_file_fasta = snakemake.output[1]
    APImail = snakemake.params[0]
    APIkey = snakemake.params[1]
    number_taxids = snakemake.params[2]
    weight_diff = snakemake.params[3]
    max_sequence_length = snakemake.params[4]
    sequence_length_diff = snakemake.params[5]
    max_number_accessions = snakemake.params[6]
    use_NCBI_Taxa = snakemake.params[7]

    if APIkey and APImail:
        Entrez.email = APImail
        Entrez.api_key = APIkey

    taxids_to_query = getTaxIdsToQuery(mapped_taxids, number_taxids, weight_diff)

    if use_NCBI_Taxa:
        all_records = fetchDataNCBI(taxids_to_query, max_number_accessions, max_sequence_length, sequence_length_diff)
        if len(all_records) == 0:
            print("No Sequences found. Using default fetching!")
            all_records = fetchData(taxids_to_query, max_number_accessions, max_sequence_length, sequence_length_diff)
    else:
        all_records = fetchData(taxids_to_query, max_number_accessions, max_sequence_length, sequence_length_diff)

    unique_df, unique_strain_records = getStrainNames(all_records)
    writeFiles(unique_df, unique_strain_records, out_file_df, out_file_fasta)


if __name__ == "__main__":
    main()
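
As a reading aid for getTaxIdsToQuery above: a taxID is kept only while the weight of the previously kept taxID is at most weight_diff times its own weight. A toy example with made-up taxIDs and weights (not workflow code):

import pandas as pd

# Toy data mimicking the top-scoring taxID table consumed by getTaxIdsToQuery.
taxid_df = pd.DataFrame({
    "taxid":  [10632, 1891767, 333760, 10566],
    "weight": [120.0, 100.0, 30.0, 5.0],
})

number_taxids = 4
weight_diff = 2.0  # keep a taxID while previous_weight <= current_weight * weight_diff

relevant = taxid_df["taxid"].to_list()[:number_taxids]
selected = [relevant[0]]
for prev, curr in zip(relevant, relevant[1:]):
    prev_w = taxid_df.loc[taxid_df["taxid"] == prev, "weight"].values[0]
    curr_w = taxid_df.loc[taxid_df["taxid"] == curr, "weight"].values[0]
    if prev_w <= curr_w * weight_diff:
        selected.append(curr)
    else:
        break

print(selected)  # [10632, 1891767]: 120 <= 100*2 keeps the second, 100 > 30*2 stops the loop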

scripts/filter_duplicate_ORFs.py

from Bio import SeqIO

in_file = snakemake.input[0]
out_file = snakemake.output[0]

sequences = {}
counter = 0
for record in SeqIO.parse(in_file, "fasta"):
    counter += 1
    if record.seq not in sequences:
        sequences[record.seq] = record.id.split(" ")[0]
    else:
        r = record.id.split(" ")[0]
        sequences[record.seq] += f",{r}"

print(f"Number of sequences: {counter}")
print(f"Number of unique sequences: {len(sequences)}")


# write to fasta file
with open(out_file, "w") as f:
    for sequence in sequences:
        seq = str(sequence)
        f.write(">" + str(sequences[sequence]) + "\n")
        while seq:
            f.write(seq[:60] + "\n")
            seq = seq[60:]
        f.write("\n")
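
The deduplication above boils down to a dictionary keyed by sequence, with the IDs of identical sequences joined by commas; a toy illustration with made-up records (not workflow code):

# Identical sequences are collapsed into one entry; their IDs are concatenated.
records = [("ORF_1", "MKV"), ("ORF_2", "MKV"), ("ORF_3", "MLA")]

unique = {}
for rid, seq in records:
    unique[seq] = f"{unique[seq]},{rid}" if seq in unique else rid

print(unique)  # {'MKV': 'ORF_1,ORF_2', 'MLA': 'ORF_3'}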

scripts/genome2proteome.py

import subprocess
import time

concat_fasta = snakemake.input[0]
sixpack_temp = snakemake.output[1]

sample_path = snakemake.params[0] + "/sixpack/"
orfminsize = snakemake.params[1]
additional_sixpack_params = snakemake.params[2]


with open(concat_fasta, "r") as fasta:
    sequences = fasta.read()
    d = ">"
    sequences = [d+e for e in sequences.split(d)][1:]

with open(sixpack_temp, "w") as f:
    for i, sequence in enumerate(sequences):
        if additional_sixpack_params:
            process = subprocess.Popen(
                [
                    "sixpack",
                    "-sequence",
                    concat_fasta,
                    "-outfile",
                    f"{sample_path}{sequence[1:12]}.orfs",
                    "-outseq",
                    f"{sample_path}{sequence[1:12]}.fasta",
                    "-orfminsize",
                    f"{orfminsize}",
                    f"{additional_sixpack_params}"
                ],
                stdout=f,
                stderr=subprocess.STDOUT
            )
        else:
            process = subprocess.Popen(
                [
                    "sixpack",
                    "-sequence",
                    concat_fasta,
                    "-outfile",
                    f"{sample_path}{sequence[1:12]}.orfs",
                    "-outseq",
                    f"{sample_path}{sequence[1:12]}.fasta",
                    "-orfminsize",
                    f"{orfminsize}",
                ],
                stdout=f,
                stderr=subprocess.STDOUT
            )
        with open(concat_fasta, "w") as fasta:
            for seq in sequences[i:]:
                fasta.write(seq + "\n")
        time.sleep(1)  # because file system latency

scripts/get_species_strain.py

import pandas as pd
import numpy as np


peptide_shaker_report = snakemake.input[0]
strain_accessions = snakemake.input[1]

out_file = snakemake.output[0]

relevant_genomes = snakemake.params[0]


df = pd.read_csv(peptide_shaker_report, sep="\t", on_bad_lines="skip", usecols=["Protein(s)", "Confidence [%]"])
df.columns = ["ORF", "Confidence"]
df["weight"] = 1 / (df.ORF.str.count(",") + 1)
df["ORF"] = df.ORF.apply(lambda x: x.split(","))
df["psmid"] = df.index + 1
report_df = df.explode("ORF", ignore_index=True)
report_df["accession"] = report_df.ORF.apply(lambda x: x.split(".")[0])

genome_stats = pd.DataFrame({
    "accession": report_df["accession"],
    "counts": report_df.groupby("accession")["accession"].transform("count")
})


mean_conf_df = report_df.groupby("accession")["Confidence"].mean().reset_index()
mean_conf_df.rename(columns={"Confidence": "mean_confidence"}, inplace=True)
genome_stats = genome_stats.merge(mean_conf_df, on="accession")
genome_stats.drop_duplicates(inplace=True)
genome_stats.reset_index(drop=True, inplace=True)

accessions_df = pd.read_csv(strain_accessions, sep="\t")
accessions_df["genbank_accession"] = accessions_df.genbank_accession.apply(lambda x: x.split(".")[0])

merged_df = pd.merge(accessions_df, genome_stats, how="inner", left_on="genbank_accession", right_on="accession")
merged_df.replace(np.nan, "NaN", regex=True, inplace=True)
merged_df.sort_values("counts", ascending=False, inplace=True)
merged_df.reset_index(drop=True, inplace=True)
merged_df = merged_df.drop(columns=["accession"])
merged_df = merged_df[:relevant_genomes]
merged_df.to_csv(out_file, index=False, sep="\t")

scripts/hashDatabase.py

import mmh3
import numpy as np

in_file = snakemake.input[0]
out_file = snakemake.output[0]


def hash_database(in_file, out_file, seed=18):
    """
    Perform mmh3 hashing on input database.
    MurmurHash3 is a non-cryptographic hashing algorithm.
    For more information on mmh3 algorithm visit https://pypi.org/project/mmh3/.

    :param in_file: str, input file
    :param out_file: str, output file
    :param seed: int, hashing seed: use identical integer for identical hashing results
    :return: database: np.array, hashed database
    """
    # initialize database
    database = np.empty([sum(1 for _ in open(in_file))])
    # hash protein accessions
    with open(in_file, "r") as f:
        for line_num, line in enumerate(f, 1):
            database[line_num - 1] = mmh3.hash64(line, signed=False, seed=seed)[0]
    # save in numpy format
    np.save(out_file, database)


def main():
    hash_database(in_file, out_file)


if __name__ == "__main__":
    main()
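
The saved array can later be used for fast accession lookups by hashing queries with the same seed. The sketch below is only a hypothetical usage example (file path and accessions are placeholders), not part of the workflow:

import mmh3
import numpy as np

SEED = 18  # must match the seed used by hash_database above

hashed_db = np.load("protein_accessions.npy")  # placeholder path

def accessions_in_db(accessions, hashed_db, seed=SEED):
    # hash queries the same way the database was hashed; note that
    # hash_database hashes whole lines, i.e. including the trailing newline
    queries = np.array(
        [mmh3.hash64(acc + "\n", signed=False, seed=seed)[0] for acc in accessions],
        dtype=hashed_db.dtype,
    )
    return np.isin(queries, hashed_db)

print(accessions_in_db(["P0DTC2", "Q9YQ12"], hashed_db))  # e.g. [ True False ]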

scripts/host_filtering.py

import pandas as pd


def getSpectraNames(report):

    """
    get spectrum titles from the PeptideShaker PSM report
    input: path to the PSM report
    returns: list of spectrum titles
    """

    raw = pd.read_csv(report, sep="\t", on_bad_lines="skip", usecols=["Spectrum Title"])
    spectrumNames = raw["Spectrum Title"].to_list()
    return spectrumNames


def FilterMGF(SpectraToFilter, SpectrumFile, out_file, log_file):

    """
    remove spectra whose titles appear in the provided list from an .mgf file
    input: list of spectrum titles, .mgf file of spectra
    output: filtered .mgf file
    """

    SpectraToFilter = set(SpectraToFilter)  # remove duplicates
    with open(SpectrumFile, "r", newline="\n") as mgf:
        LinesToWrite = []
        writeLine = True

        for line in mgf:
            lineNoN = line.rstrip()
            if lineNoN.startswith("TITLE"):
                if lineNoN[6:] in SpectraToFilter:
                    writeLine = False
                else:
                    writeLine = True
            if writeLine:
                LinesToWrite.append(line)

    with open(out_file, "w") as f:
        f.writelines(LinesToWrite)

    with open(log_file, "w") as f:
        f.write("filtered " + str(len(SpectraToFilter)) + " host spectra")


def main():
    mgf = snakemake.input[0]
    psm_report = snakemake.input[1]
    out_file = snakemake.output[0]
    log_file = snakemake.log[0]

    FilterList = getSpectraNames(psm_report)
    FilterMGF(FilterList, mgf, out_file, log_file)


if __name__ == "__main__":
    main()
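
A toy illustration of the TITLE-based filtering logic in FilterMGF above, using inline, made-up MGF text (not workflow code):

mgf_text = """BEGIN IONS
TITLE=spectrum_1
100.0 1.0
END IONS
BEGIN IONS
TITLE=spectrum_2
200.0 2.0
END IONS
"""

titles_to_remove = {"spectrum_1"}  # e.g. spectra already explained by the host database

kept, write = [], True
for line in mgf_text.splitlines(keepends=True):
    if line.rstrip().startswith("TITLE"):
        write = line.rstrip()[6:] not in titles_to_remove
    if write:
        kept.append(line)

print("".join(kept))  # spectrum_1 is removed, spectrum_2 is kept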

scripts/ms2rescore_config.py

out_file = snakemake.output[0]

RescorePipeline = snakemake.params[0]
RescoreFeatures = snakemake.params[1]  # assumed: params are declared in this order in the rule
RunPercolator = snakemake.params[2]
FragModel = snakemake.params[3]

with open(out_file, "w") as f:

    lines = [' {"$schema":"./config_schema.json",', '"general":{']
    lines.append(f'"pipeline":{RescorePipeline},')
    lines.append(f'"feature_sets":{RescoreFeatures},')
    lines.append(f'"run_percolator":{RunPercolator},')
    lines.append('"id_decoy_pattern": "DECOY",')
    lines.append('"num_cpu":' + "1" + ",")
    lines.append('"config_file":null,')
    lines.append('"tmp_path":null,')
    lines.append('"mgf_path":null,')
    lines.append('"output_filename":null,')
    lines.append('"log_level":"info",')
    lines.append('"plotting":false')
    lines.append("},")
    lines.append('"ms2pip":{')
    lines.append(f'"model":{FragModel},')
    lines.append('"modifications":[')
    # if mods:
    #     for mod in mods:
    #     lines.append(InputMod(mod[0],mod[1],mod[2],mod[3],mod[4],mod[5]))
    # lines[-1] = lines[-1][:-1]

    lines.append("]")
    lines.append("},")
    lines.append('"maxquant_to_rescore":{},')
    lines.append('"percolator":{}')
    lines.append("}")

    f.writelines([line + "\n" for line in lines])
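
Assuming the pipeline, feature-set, run_percolator and model parameters already hold JSON-formatted values, the file written above should parse as JSON. A quick, hypothetical sanity check (the path is a placeholder):

import json

# Placeholder path: wherever the Snakemake rule writes the MS2Rescore config.
with open("results/sample/ms2rescore_config.json") as f:
    config = json.load(f)

# Keys written by the script above.
print(config["general"]["run_percolator"], config["ms2pip"]["model"])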

scripts/ref_filtering.py

import pandas as pd


def getSpectraNames(report):

    """
    get spectrum titles from the PeptideShaker PSM report
    input: path to the PSM report
    returns: list of spectrum titles
    """

    raw = pd.read_csv(report, sep="\t", on_bad_lines="skip", usecols=["Spectrum Title"])
    spectrumNames = raw["Spectrum Title"].to_list()
    return spectrumNames


def FilterMGF(SpectraToFilter, SpectrumFile, out_file, log_file):

    """
    remove spectra whose titles appear in the provided list from an .mgf file
    input: list of spectrum titles, .mgf file of spectra
    output: filtered .mgf file
    """

    SpectraToFilter = set(SpectraToFilter)  # remove duplicates
    with open(SpectrumFile, "r", newline="\n") as mgf:
        LinesToWrite = []
        writeLine = True

        for line in mgf:
            lineNoN = line.rstrip()
            if lineNoN.startswith("TITLE"):
                if lineNoN[6:] in SpectraToFilter:
                    writeLine = False
                else:
                    writeLine = True
            if writeLine:
                LinesToWrite.append(line)

    with open(out_file, "w") as f:
        f.writelines(LinesToWrite)

    with open(log_file, "w") as f:
        f.write("filtered " + str(len(SpectraToFilter)) + " host spectra")


def main():
    mgf = snakemake.input[0]
    psm_report = snakemake.input[1]
    out_file = snakemake.output[0]
    log_file = snakemake.log[0]
    FilterList = getSpectraNames(psm_report)
    FilterMGF(FilterList, mgf, out_file, log_file)


if __name__ == "__main__":
    main()

scripts/taxIDMapping.py

import pandas as pd


def mapTaxIDs(in_file, taxids, out_file):
    in_file = pd.read_csv(in_file, sep="\t", on_bad_lines="skip", usecols=["Protein(s)"])
    id_df = pd.read_csv(taxids, header=None, names=["protein", "taxid"], sep="\t")

    in_file.columns = ["accession"]
    in_file["weight"] = 1 / (in_file.accession.str.count(",") + 1)
    in_file["accession"] = in_file.accession.apply(lambda x: x.split(","))
    in_file["psmid"] = in_file.index + 1
    report_df = in_file.explode("accession", ignore_index=True)

    merged_df = pd.merge(report_df, id_df, how="inner", left_on="accession", right_on="protein")
    merged_df = merged_df.drop("protein", axis=1)

    merged_df.to_csv(out_file, sep="\t", header=["accession", "weight", "psmid", "taxid"])
    return merged_df


def score(df, out_file):

    df_score = df.groupby('taxid')['weight'].sum().reset_index()
    df_score = df_score.sort_values(by=['weight'], ascending=False)
    df_score.to_csv(out_file, index=False, sep="\t")


def main():
    raw_report = snakemake.input[0]
    taxids = snakemake.input[1]
    mapped_taxids = snakemake.output[0]
    top_scoring = snakemake.output[1]

    merged_df = mapTaxIDs(in_file=raw_report, taxids=taxids, out_file=mapped_taxids)
    score(df=merged_df, out_file=top_scoring)


if __name__ == "__main__":
    main()
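
To make the scoring concrete, here is a toy example (made-up accessions and taxIDs, not workflow code) of the 1/n PSM weighting and the per-taxID aggregation performed above:

import pandas as pd

# Each PSM distributes a total weight of 1 across the proteins it matches.
psms = pd.DataFrame({"Protein(s)": ["A1,A2", "A2", "A3"]})
taxmap = pd.DataFrame({"protein": ["A1", "A2", "A3"], "taxid": [111, 111, 222]})

psms.columns = ["accession"]
psms["weight"] = 1 / (psms.accession.str.count(",") + 1)
psms["accession"] = psms.accession.str.split(",")
exploded = psms.explode("accession", ignore_index=True)

merged = exploded.merge(taxmap, left_on="accession", right_on="protein")
print(merged.groupby("taxid")["weight"].sum().sort_values(ascending=False))
# taxid 111 -> 0.5 + 0.5 + 1.0 = 2.0, taxid 222 -> 1.0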
A further 35 snippets with no or duplicated tags are not shown here.


Name: multistagesearch
Version: 1
URL: https://github.com/BAMeScience/MultiStageSearch
Copyright: Public Domain
License: None
