MetaPUF


An approach to integrate metagenomics, metatranscriptomics and metaproteomics data found in public resources such as MGnify (for metagenomics/metatranscriptomics) and the PRIDE database (for metaproteomics).

Code Snippets

shell:
    """
    if [ -z "{params.study}" ]; then
        python workflow/scripts/metagenomics_db/main.py -d {params.input_dir} -v {params.ver} -o {params.output_dir} -m {input.sample_metadata} -b {params.db_size} &> {log}
    else
        python workflow/scripts/metagenomics_db/main.py -s {params.study} -v {params.ver} -o {params.output_dir} -m {input.sample_metadata} -b {params.db_size} &> {log}
    fi
    """
shell:
    "python workflow/scripts/gff_generation/main.py -i {input.metap_sample_info} -r {params.reports_dir} -m {params.metag_dir} -p {params.pride_id}"
shell:
    "unzip {input.zip} -d {params.peptideshaker_tooldir} > {log}"
shell:
    "python workflow/scripts/searchgui_search.py -p -jar {input.jar} -in {input.info} "
    "-out {params.outputdir}"
shell:
    "python workflow/scripts/post_report_generation/main.py -s {input.info} -d {params.dir} -p {params.pid} &> {log}"
shell:
    "unzip {input.exe_zip} -d {params.thermorawfileparser_tool} > {log}"
shell:
    "python workflow/scripts/coping_raw_files.py -exe {input.exe} -info {input.info} -in {params.raw} -out {params.folder}"
shell:
    "tar -xzvf {input.zip} -C {params.searchgui_tooldir} > {log}"
shell:
    "python workflow/scripts/generating_decoy.py -jar {input.jar} -info {input.info} "
    "-human {input.human_db} -crap {input.crap_db} -sgpars '{params.searchgui_params}' "
    "-adir {params.assemblies_dir}"
shell:
    "python workflow/scripts/searchgui_search.py -s -jar {input.jar} -in {input.info} "
    "-raw {input.raw} -out {params.outdir}"
workflow/scripts/coping_raw_files.py
import argparse
import os
import subprocess
import sys

import pandas as pd


def get_args():
    """Collect the inputs for the executable scripts."""
    parser = argparse.ArgumentParser(
        description="""This script maps the samples, raw files and searching databases
            between PRIDE and MGnify, then executes shell scripts. """
    )

    parser.add_argument(
        "-info",
        "--sampleinfo",
        dest="info_file",
        help="the sample information file, where stores the mapping information for samples, databases and raw files",
    )
    parser.add_argument(
        "-in",
        "--input_folder",
        dest="input_folder",
        help="the folder path where raw files are",
    )
    parser.add_argument(
        "-out",
        "--output",
        dest="output_folder",
        help="the folder path for saving files",
    )
    parser.add_argument("-exe", "--exefile", dest="exe_file", help="the exec file")

    args = parser.parse_args()

    if args.info_file is None:
        sys.exit("Error: no info file (sample information file)")
    if args.output_folder is None:
        sys.exit("Error: no input folder path provided!")
    if args.exe_file is None:
        sys.exit("Error: no exec file provided!")

    return args


# sample_info = pd.read_csv("sample_info_final.csv", sep=',')
def thermorawfileparser(exe_file, info_file, input_folder, output_folder):
    """
    Download raw files via FTP (or read them locally) and convert them into peak list files based on the sample information
    :info_file:     sample information file contains raw files, samples, mapped searching databases
    :input_folder: input folder path for raw data (if not from rawurls)
    :output_folder: output folder path for peak list data
    """
    sample_info = pd.read_csv(info_file, sep=",")
    sample_info = sample_info[["Sample", "Raw file", "Raw file URLs"]].drop_duplicates()
    RawURLs = sample_info["Raw file URLs"].dropna().to_list()

    if len(RawURLs) > 0:
        samples = {
            k: g["Raw file URLs"].tolist() for k, g in sample_info.groupby("Sample")
        }
        for sample in samples.keys():
            # generate the folder if it is not there
            folder = os.path.join(output_folder, sample)
            os.makedirs(folder, exist_ok=True)
            # download the raw files if they are not provided locally.
            commandline = ""
            for url in samples[sample]:
                commandline = "wget -P " + folder + " -i " + url + ";"

            commandline += " mono " + exe_file + " -d=" + folder + " -o=" + folder
            # -f=0 writes MGF peak lists (-f=1 would select mzML)
            commandline += " -f=0 -m=0 &> logs/" + sample + "_thermorawfileparser.log"
            subprocess.run(commandline, shell=True)

    else:
        samples = {k: g["Raw file"].tolist() for k, g in sample_info.groupby("Sample")}
        for sample in samples.keys():
            # generate the folder if it is not there
            in_folder = os.path.join(input_folder, sample)
            out_folder = os.path.join(output_folder, sample)
            os.makedirs(out_folder, exist_ok=True)
            commandline = f" mono {exe_file} -d={in_folder} -o={out_folder}"
            # -f=0 writes MGF peak lists (-f=1 would select mzML)
            commandline += " -f=0 -m=0 &> logs/" + sample + "_thermorawfileparser.log"
            subprocess.run(commandline, shell=True)


def main():
    """
    Main function
    """
    args = get_args()

    thermorawfileparser(
        args.exe_file, args.info_file, args.input_folder, args.output_folder
    )


if __name__ == "__main__":
    main()
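
For orientation, the command string that the local-file branch of thermorawfileparser() assembles can be previewed without running anything. A minimal sketch, assuming a hypothetical sample name, executable path and folder layout (all illustrative only):

import os

exe_file = "ThermoRawFileParser.exe"      # hypothetical path to the parser executable
sample = "ERS12345"                       # hypothetical sample accession
in_folder = os.path.join("input/raw", sample)
out_folder = os.path.join("output/peaks", sample)

commandline = f" mono {exe_file} -d={in_folder} -o={out_folder}"
commandline += " -f=0 -m=0 &> logs/" + sample + "_thermorawfileparser.log"
print(commandline)
# " mono ThermoRawFileParser.exe -d=input/raw/ERS12345 -o=output/peaks/ERS12345 -f=0 -m=0 &> logs/ERS12345_thermorawfileparser.log"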
workflow/scripts/generating_decoy.py
import argparse
import json
import logging
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd


def get_args():
    """Collect the inputs for the executable scripts."""
    parser = argparse.ArgumentParser(
        description="""This script is going to add the human and contanminat protein sequences
                into the generated databases and also add decoy into every database"""
    )

    parser.add_argument(
        "-info",
        "--sampleinfo",
        dest="info_file",
        help="the final sample information file, where also stores generaed databases",
    )
    parser.add_argument(
        "-human",
        "--humandb",
        dest="human_db",
        help="the human reference protein sequences",
    )
    parser.add_argument(
        "-crap", "--crapdb", dest="crap_db", help="the contanminat sequences"
    )
    parser.add_argument("-jar", "--jarfile", dest="jar_file", help="the jar file")
    parser.add_argument(
        "-sgpars",
        "--searchgui_params",
        dest="searchgui_params",
        help="json-like string of options to pass to searchgui",
    )
    parser.add_argument(
        "-adir",
        "--assemblies_dir",
        dest="assemblies_dir",
        help="the output assemblies directory path",
    )

    args = parser.parse_args()

    if args.info_file is None:
        sys.exit("Error: no info file (sample information file)")
    if args.human_db is None:
        sys.exit("Error: no human reference protein database provided!")
    if args.crap_db is None:
        sys.exit("Error: no contanminat protein database provided!")
    if args.jar_file is None:
        sys.exit("Error: no jar file provided!")

    return args


def searchgui_decoy(
    jar_file, info_file, human_db, crap_db, searchgui_params, assemblies_folder
):
    """
    Add human, contaminant and decoy sequences to the protein databases.
    :jar_file:      the searchgui jar file (whole path)
    :info_file:     final sample information file also contains mapped searching databases
    :human_db:      the human protein reference sequences
    :crap_db:       the contaminant protein sequences
    :assemblies_folder: the output/assemblies dir path
    """
    sample_info = pd.read_csv(info_file, sep=",")
    proteins = sample_info["Db_name"].dropna().drop_duplicates().to_list()

    config = json.loads(searchgui_params)
    SEARCHGUI_PAR_PARAMS = " ".join(
        [
            "-%s %s" % (k, "'%s'" % v if isinstance(v, str) else str(v))
            for k, v in config.items()
        ]
    )

    for protein in proteins:
        protein_file = f"{assemblies_folder}/databases/{protein}"
        print(f"On {protein = } with {protein_file = }")
        fasta = Path(protein_file).resolve().with_suffix(".fasta").as_posix()
        par = f"{Path(protein_file).parent.resolve()}/{Path(protein_file).stem}_searchgui.par"
        db = f"{Path(fasta).parent.resolve()}/{Path(fasta).stem}_concatenated_target_decoy.fasta"
        log = Path(protein_file).stem + "_SearchGUI_params.log"

        commandline = (
            "cat " + protein_file + " " + human_db + " " + crap_db + " > " + fasta + ";"
        )
        # generating the protein decoy
        params = " -in " + fasta
        params += " -decoy &> "
        params += "logs/" + protein.split(".")[0] + "_SearchGUI_search.log;"

        commandline += " java -cp " + jar_file + " eu.isas.searchgui.cmd.FastaCLI"
        commandline += params

        # generating the protein parameter file
        params = f" -out {par} "
        params += f" -db {db} "
        params += SEARCHGUI_PAR_PARAMS
        params += f" &> logs/{log};"
        commandline += (
            " java -cp "
            + jar_file
            + " eu.isas.searchgui.cmd.IdentificationParametersCLI"
        )
        commandline += params

        print(f"Will run {commandline}")
        subprocess.run(commandline, shell=True)
        # print(commandline)

    # commandline = "mkdir -p " + s_dir
    # subprocess.run(commandline, shell=True)
    flag_txt = (
        f"{assemblies_folder}/databases/proteins_decoy_params_generated_check.txt"
    )
    with open(flag_txt, "w") as f:
        f.write("All protein databases have been processed!")


def main():
    """
    Main function
    """
    args = get_args()

    searchgui_decoy(
        args.jar_file,
        args.info_file,
        args.human_db,
        args.crap_db,
        args.searchgui_params,
        args.assemblies_dir,
    )


if __name__ == "__main__":
    main()
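
The -sgpars value is a JSON string that searchgui_decoy() flattens into command-line flags for SearchGUI's IdentificationParametersCLI. A minimal sketch of that conversion, using hypothetical parameter names (the real keys depend on the SearchGUI options configured for the workflow):

import json

searchgui_params = '{"prec_tol": 10, "frag_tol": 0.5, "enzyme": "Trypsin"}'  # illustrative keys only

config = json.loads(searchgui_params)
flags = " ".join(
    "-%s %s" % (k, "'%s'" % v if isinstance(v, str) else str(v))
    for k, v in config.items()
)
print(flags)  # -prec_tol 10 -frag_tol 0.5 -enzyme 'Trypsin'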
workflow/scripts/gff_generation/main.py
import logging
import os
import time
from argparse import ArgumentParser

import gff_builder as gb
import pandas as pd


def dir_path(string):
    if os.path.isdir(string):
        return string
    else:
        raise NotADirectoryError(string)


def main():  # noqa: C901
    """
    Aggregate information from metaproteomics and metagenomics about the expressed proteins
    and generate GFF format files for visualisation
    """
    parser = ArgumentParser(
        description="Aggregate information from metaproteomics and metagenomics about the expressed proteins and generate GFF format files for visualisation"
    )
    parser.add_argument(
        "-i",
        "--sample_info",
        type=str,
        required=True,
        help="Absolute path of the sample metadata file",
    )
    parser.add_argument(
        "-r",
        "--reports_dir",
        type=dir_path,
        required=True,
        help="full path of the directory containing processed metaproteomics reports",
    )
    parser.add_argument(
        "-m",
        "--metag_dir",
        type=dir_path,
        required=True,
        help="full path of the directory containing contig metadata file",
    )
    parser.add_argument(
        "-p",
        "--pride_id",
        type=str,
        required=True,
        help="PRIDE id for the reanalysed study",
    )

    starttime = time.time()
    args = parser.parse_args()
    # read in sample_info file and get the sample names in a list
    sample_info = pd.read_csv(args.sample_info, sep=",")

    # check if the results folder exists or create the results folder
    results_folder = os.path.join(args.reports_dir, "results")
    os.makedirs(results_folder, exist_ok=True)

    # get the filenames as list and concatenate the expressed peptides from all the samples
    sample_file_list = [
        args.reports_dir + "/" + f
        for f in os.listdir(args.reports_dir)
        if f.endswith("_peptide_report.csv")
    ]
    csv_list = []

    for file in sorted(sample_file_list):
        csv_list.append(pd.read_csv(file))

    csv_merged = pd.concat(csv_list, ignore_index=True)
    csv_merged.to_csv(os.path.join(results_folder, "allsamples.csv"), index=False)
    proteins = list(set(csv_merged["Protein"].to_list()))
    # create a new dataframe with unique peptide digests
    unique_proteins = pd.DataFrame(proteins, columns=["digest"])
    # get the list of distinct assemblies in the study
    assemblies = list(set(sample_info["Assembly"].to_list()))

    for assembly in assemblies:
        temp_assembly = pd.read_csv(
            os.path.join(args.metag_dir, assembly + "_contig_info.txt"), sep="\t"
        )
        temp_assembly.columns = [
            "assembly",
            "contig_name",
            "digest",
            "partial_info",
            "protein_start",
            "protein_end",
            "strand",
            "caller",
        ]
        # get all the matching expressed proteins from the assembly's _contig_info.txt
        assembly_subset_of_proteins = pd.merge(
            unique_proteins, temp_assembly, on="digest", how="inner"
        )

        # get all the peptides expressed in the given assembly
        assembly_expressed_proteins = assembly_subset_of_proteins.merge(
            csv_merged, left_on="digest", right_on="Protein", how="inner"
        )

        if len(assembly_expressed_proteins) > 0:
            assembly_expressed_proteins["max_spectrum_count"] = gb.calculate_max_count(
                assembly_expressed_proteins
            )
            assembly_expressed_proteins.to_csv(
                os.path.join(results_folder, assembly + "_expressed_proteins.csv")
            )
            expressed_proteins = list(set(assembly_expressed_proteins["digest"]))
            attributes_file, max_spectrum_count = gb.protein_report_processing(
                assembly_expressed_proteins, expressed_proteins, args.pride_id
            )
            if len(attributes_file) >= 1:
                gb.gff_generation_unique(
                    attributes_file, assembly, results_folder, max_spectrum_count
                )

    logging.info("Completed")
    logging.info("Runtime is {} seconds".format(time.time() - starttime))


if __name__ == "__main__":
    log_file = "logs/gff_generate.log"
    logging.basicConfig(
        filename=log_file,
        filemode="a",
        level=logging.DEBUG,
        format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )
    main()
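
The aggregation above is essentially a two-step join: unique protein digests from the peptide reports are matched to contig metadata on the digest column, and the result is joined back to the merged peptide report on Protein. A toy illustration with made-up values:

import pandas as pd

unique_proteins = pd.DataFrame({"digest": ["abc123", "def456"]})  # made-up digests
contig_info = pd.DataFrame(
    {"assembly": ["ERZ001"], "contig_name": ["contig_1"], "digest": ["abc123"]}
)
peptide_report = pd.DataFrame({"Protein": ["abc123"], "Peptide": ["PEPTIDEK"]})

subset = pd.merge(unique_proteins, contig_info, on="digest", how="inner")
expressed = subset.merge(peptide_report, left_on="digest", right_on="Protein", how="inner")
print(expressed)  # one row linking contig_1 to the expressed peptide PEPTIDEK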
workflow/scripts/metagenomics_db/main.py
import gzip
import logging
import os
import shutil
import subprocess
import sys
import time
from argparse import ArgumentParser
from collections import defaultdict

import fetch_data as fd
import get_clusters as gc
import pandas as pd
import screed
import sourmash


def dir_path(string):
    if os.path.isdir(string):
        return string
    else:
        raise NotADirectoryError(string)


def main():  # noqa: C901
    """
    Generates protein databases containing non-redundant proteins from
    metagenomics and/or metatranscriptomics assemblies
    """
    parser = ArgumentParser(
        description="Generate protein sequence database for Metagenomics and /or Metatranscriptomics datasets"
    )
    """
    The user can enter their own data using --input_dir option
    or
    select --study to analyse study published on MGnify website
    """

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-s",
        "--study",
        type=str,
        help="Secondary study accession of the assembled study in MGnify starting with ERP/SRP/DRP ",
    )
    group.add_argument(
        "-d",
        "--input_dir",
        type=dir_path,
        help="full path of the study folder containing protein files ",
    )
    parser.add_argument(
        "-v",
        "--ver",
        type=str,
        required=True,
        help="pipeline version of MGnify analysis for the study",
    )
    parser.add_argument(
        "-o",
        "--output_dir",
        type=dir_path,
        required=True,
        help="full path of the study folder containing results",
    )
    parser.add_argument(
        "-m",
        "--metadata",
        type=str,
        required=True,
        help="full path of the sample-assembly mapping file (.csv) with filename.",
    )
    parser.add_argument(
        "-b",
        "--db_size",
        type=int,
        help="input the maximum size of the protein search database in bytes",
        default=1073741824,
    )

    starttime = time.time()
    args = parser.parse_args()
    sample_assembly_map = defaultdict(set)

    assembly_folder = os.path.join(args.output_dir, "assemblies")
    os.makedirs(assembly_folder, exist_ok=True)

    logging.info(f"Will read samples info from {args.metadata}")
    samples = pd.read_csv(args.metadata, sep=",")

    os.chdir(args.output_dir)
    if args.study:
        logging.info(f"Will download data from MGnify API for study {args.study}")
        fd.check_study_accession(args.study)
        # getting the sequence data and predicted CDS
        cmd_get_data = "  ".join(
            [
                "mg-toolkit -d bulk_download -a",
                args.study,
                "-p ",
                args.ver,
                "-g sequence_data",
            ]
        )
        subprocess.call(cmd_get_data, shell=True)
        sequence_dir = (
            args.output_dir + "/" + args.study + "/" + args.ver + "/sequence_data"
        )

    elif args.input_dir:
        logging.info(f"Will use MGnify data from input dir {args.input_dir}")
        ## Assembly input files should be gzipped and end with "_FASTA.fasta.gz"
        ## Prodigal (v.2.6.3) predicted CDS file should be gzipped and end with "_FASTA_predicted_cds.faa.gz"
        if len(os.listdir(args.input_dir)) == 0:
            sys.exit("{} is empty".format(args.input_dir))
        else:
            sequence_dir = str(args.input_dir)
        for input_file in os.listdir(args.input_dir):
            if input_file.endswith("_FASTA.fasta.gz"):
                logging.info(f"Input file {input_file} is correctly named")
                continue
            elif input_file.endswith("_FASTA_predicted_cds.faa.gz"):
                logging.info(f"Input file {input_file} is correctly named")
                continue
            else:
                logging.warning(
                    f"The input file {input_file} is not named with correct naming convention. Please check documentation for correct naming convention"
                )

    for idx, row in samples.iterrows():
        sample_assembly_map[row["Secondary Sample Accession"]].add(row["Assembly"])
    logging.info("Mapping between samples and the assemblies: ", sample_assembly_map)
    for k, v in sample_assembly_map.items():
        sample_file = os.path.join(assembly_folder, k + ".fasta.gz")
        logging.info(f"Reading assemblies from sample {sample_file}")
        with gzip.open(sample_file, "wt") as wf:
            for assembly in v:
                # renaming headers of the predicted cds in assemblies
                fd.rename_contigs(assembly, assembly_folder, sequence_dir)
                with gzip.open(
                    os.path.join(sequence_dir, assembly + "_FASTA.fasta.gz"), "rt"
                ) as infile:
                    shutil.copyfileobj(infile, wf)
    matrix_file = os.path.join(assembly_folder, "matrix_file.tsv")

    meta_genomes = [
        assembly_folder + "/" + k + ".fasta.gz" for k in sample_assembly_map.keys()
    ]
    minhashes = []
    for contigs in meta_genomes:
        mh = sourmash.MinHash(ksize=31, n=0, scaled=1000)
        for record in screed.open(contigs):
            query_seq = record.sequence
            mh.add_sequence(query_seq, True)
        minhashes.append(mh)
    with open(matrix_file, "w") as f_out:
        for i, e in enumerate(minhashes):
            basename = os.path.basename(meta_genomes[i])
            name = ".".join((basename).split(".")[:-2])
            f_out.write(name + ",")
            for j, e2 in enumerate(minhashes):
                x = e.jaccard(minhashes[j])
                f_out.write(str(round(x, 3)) + ",")
            f_out.write("\n")
    data = pd.read_csv(matrix_file, sep=",", header=None, index_col=[0])
    data = data.dropna(axis="columns", how="all")
    data.index.names = ["index"]
    col_names = data.index.to_list()
    data.columns = col_names
    logging.info(f"matrix data frame: {data}")
    # returns a dictionary containing assembly name and count of proteins in the assembly

    proteins_info = fd.count_proteins(sample_assembly_map, sequence_dir)
    logging.info(f"Proteins info: {proteins_info}")
    database_folder = os.path.join(assembly_folder, "databases")
    os.makedirs(database_folder, exist_ok=True)
    # returns a dictionary with the group number and assemblies in the group
    samples_in_cluster = gc.generate_clusters(
        data,
        args.db_size,
        proteins_info,
        args.study or args.input_dir,
        database_folder,
        sample_assembly_map,
    )
    logging.info(f"Samples in the cluster: {samples_in_cluster}")
    if args.study:
        fd.build_db(
            args.study, database_folder, assembly_folder, samples, samples_in_cluster
        )
    elif args.input_dir:
        fd.build_db(
            "study", database_folder, assembly_folder, samples, samples_in_cluster
        )
    for file in os.listdir(database_folder):
        if file.endswith(".faa") and not file.startswith("unique"):
            protein_file = os.path.join(database_folder, file)
            fd.uniq_proteins(database_folder, file)
            fd.remove_file(protein_file)
    logging.info("Completed")
    logging.info("Runtime is {} seconds".format(time.time() - starttime))


if __name__ == "__main__":
    log_file = "logs/db_generate.log"
    logging.basicConfig(
        filename=log_file,
        filemode="a",
        level=logging.DEBUG,
        format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )
    main()
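
The similarity matrix that drives the clustering is built from sourmash MinHash sketches compared with the Jaccard index. A toy comparison of two short sequences (parameters shrunk to ksize=7, scaled=1 so the short strings register at all; the script above sketches whole assemblies with ksize=31, scaled=1000):

import sourmash

seq_a = "ATGCGTACGTTAGCATGCGTACGTTAGC"
seq_b = "ATGCGTACGTTAGCTTTTGGGGCCCCAA"

mh_a = sourmash.MinHash(n=0, ksize=7, scaled=1)
mh_b = sourmash.MinHash(n=0, ksize=7, scaled=1)
mh_a.add_sequence(seq_a, True)  # True skips invalid k-mers, as in the script above
mh_b.add_sequence(seq_b, True)
print(round(mh_a.jaccard(mh_b), 3))  # estimated fraction of shared k-mers, in [0, 1]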
workflow/scripts/post_report_generation/main.py
import os
from argparse import ArgumentParser
from pathlib import Path

import pandas as pd
import track_hubs as th


def dir_path(string):
    if os.path.isdir(string):
        return string
    else:
        raise NotADirectoryError(string)


def main():
    """
    Aggregate information from metaproteomics and metagenomics about the expressed proteins
    and generate processed peptide reports for gff file generation
    """
    parser = ArgumentParser(
        description="Metadata for the post processing of peptide reports"
    )
    parser.add_argument(
        "-s",
        "--sample_info",
        type=str,
        required=True,
        help="Absolute path of the sample metadata file",
    )
    parser.add_argument(
        "-d",
        "--dir",
        type=str,
        required=True,
        help="output directory",
    )
    parser.add_argument(
        "-p",
        "--pride_id",
        type=str,
        required=True,
        help="PRIDE id for the reanalysed study",
    )

    args = parser.parse_args()
    sample_info = pd.read_csv(args.sample_info, sep=",")
    samples = list(set(sample_info["Sample"].to_list()))

    commandline = "mkdir -p " + args.dir
    subprocess.run(commandline, shell=True)

    peptideshaker_dir = Path(args.dir).parent.absolute() / "peptideshaker"

    for sample in samples:
        th.get_track_beds(
            f"{peptideshaker_dir}/peptideshaker_"
            + sample
            + "_1_Default_Peptide_Report.txt",
            f"{peptideshaker_dir}/peptideshaker_"
            + sample
            + "_1_Default_Protein_Report.txt",
            os.path.join(args.dir, "processed_" + sample + "_peptide_report.csv"),
            args.pride_id,
        )


if __name__ == "__main__":
    main()
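
Note that the PeptideShaker reports are looked up in a "peptideshaker" directory that sits next to the directory passed via -d. A small sketch of that path derivation with a hypothetical layout:

from pathlib import Path

out_dir = "results/processed_reports"                      # hypothetical -d argument
peptideshaker_dir = Path(out_dir).parent.absolute() / "peptideshaker"
sample = "ERS12345"                                        # hypothetical sample name
print(f"{peptideshaker_dir}/peptideshaker_{sample}_1_Default_Peptide_Report.txt")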
workflow/scripts/searchgui_search.py
import argparse
import subprocess
import sys

import pandas as pd


def get_args():
    """Collect the inputs for the executable scripts."""
    parser = argparse.ArgumentParser(
        description="""This script is going to map the samples, raw files and searching databases
            between PRIDE and MGnify, then execute shell scripts. """
    )

    parser.add_argument(
        "-in",
        "--input",
        dest="input_file",
        help="the sample information file, where stores the mapping information for samples, databases and raw files",
    )
    parser.add_argument(
        "-raw",
        "--raw_folder",
        dest="raw_folder",
        help="the folder path where .raw files are found",
        required=False,
    )
    parser.add_argument(
        "-out",
        "--output_folder",
        dest="output_folder",
        help="the folder path in which searchgui and peptideshaker folder will be saved/read",
    )
    parser.add_argument("-jar", "--javafile", dest="jar_file", help="the jar file")
    parser.add_argument(
        "-s",
        "--searchgui",
        dest="searchgui",
        help="seargui search process",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-p",
        "--peptideshaker",
        dest="peptideshaker",
        help="peptideshaker load process",
        action="store_true",
        default=False,
    )
    args = parser.parse_args()

    if args.input_file is None:
        sys.exit("Error: no input file (sample information file)")
    if args.output_folder is None:
        sys.exit("Error: no output path provided!")
    if args.jar_file is None:
        sys.exit("Error: no jar file provided!")
    if not (args.searchgui or args.peptideshaker):
        sys.exit("Error: no processing statues is given: searchgui or peptideshaker")
    if args.searchgui + args.peptideshaker > 1:
        sys.exit("Error: more than one processing statues is provided")

    return args


def searchgui_search(jar_file, input_file, raw_folder, output_folder):
    """
    run searchgui shell script within a loop based on the mapping sample information
    :input_file:    sample information file contains raw files, samples, mapped searching databases
    :jar_file:      the searchgui jar file
    :raw_folder:    folder containing the .raw PRIDE files
    :output_folder: output folder path for searchgui search results
    """
    sample_info = pd.read_csv(input_file, sep=",")
    samples = sample_info["Sample"].drop_duplicates().to_list()

    for sample in samples:
        params = " -spectrum_files"
        params += f" {raw_folder}/" + sample
        # params += " -fasta_file"
        fasta = sample_info.loc[sample_info["Sample"] == sample, "Db_name"].iloc[0]
        print(f"On sample {sample = } with {fasta =}")
        fasta = fasta.split(".")[0]
        params += f" -output_folder {output_folder}/searchgui"
        params += " -id_params"
        params += f" {output_folder}/assemblies/databases/" + fasta + "_searchgui.par"
        params += " -xtandem 1 -msgf 1"
        params += " -output_default_name"
        params += " " + sample + "_searchgui"
        params += " -output_option 0 -output_data 1 -output_date 0"
        params += " -log logs/SearchGUI_search &> "
        params += "logs/" + sample + "_SearchGUI_search.log"
        params += " && touch "
        params += f"{output_folder}/searchgui/{sample}_searchgui.zip"

        commandline = "java -cp " + jar_file + " eu.isas.searchgui.cmd.SearchCLI"
        commandline += params

        print(f"Will run: {commandline}")
        subprocess.run(commandline, shell=True)


def peptideshaker_load(
    jar_file,
    input_file,
    output_folder,
):
    """
    run peptideshaker shell script within a loop based on the mapping sample information
    :input_file:    sample information file contains raw files, samples, mapped searching databases
    :jar_file:      the peptideshaker jar file
    :output_folder: output folder path for peptideshaker results
    """
    sample_info = pd.read_csv(input_file, sep=",")
    samples = sample_info["Sample"].drop_duplicates().to_list()

    for sample in samples:
        print(f"Load peptideshaker for sample {sample}")
        params = " -experiment 'peptideshaker' -sample " + sample + " -replicate 1 "
        params += (
            f" -identification_files {output_folder}/searchgui/{sample}_searchgui.zip"
        )
        params += f" -out_reports {output_folder}/peptideshaker"
        params += " -reports 6,9"
        params += (
            f" -output_file {output_folder}/peptideshaker/{sample}_peptideshaker.mzid"
        )
        params += " &> logs/" + sample + "_PeptideShaker_load.log"

        commandline = (
            "java -cp " + jar_file + " eu.isas.peptideshaker.cmd.PeptideShakerCLI"
        )
        commandline += params
        print(f"Will run peptideshaker command {commandline}")

        subprocess.run(commandline, shell=True)


def main():
    """
    Main function
    """
    args = get_args()

    if args.searchgui:
        searchgui_search(
            args.jar_file, args.input_file, args.raw_folder, args.output_folder
        )
    elif args.peptideshaker:
        peptideshaker_load(
            args.jar_file,
            args.input_file,
            args.output_folder,
        )
    else:
        sys.exit("BUG! this should not happen.")


if __name__ == "__main__":
    main()
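
Each sample is searched against the database mapped to it in the sample information file; the lookup in searchgui_search() is a plain pandas selection on the Sample column. A toy illustration with made-up rows and database names:

import pandas as pd

sample_info = pd.DataFrame(
    {"Sample": ["ERS111", "ERS111", "ERS222"],
     "Db_name": ["cluster1.faa", "cluster1.faa", "cluster2.faa"]}  # illustrative values
)

for sample in sample_info["Sample"].drop_duplicates().to_list():
    fasta = sample_info.loc[sample_info["Sample"] == sample, "Db_name"].iloc[0]
    print(sample, fasta.split(".")[0] + "_searchgui.par")
# ERS111 cluster1_searchgui.par
# ERS222 cluster2_searchgui.par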

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/PRIDE-reanalysis/MetaPUF
Name: metapuf
Version: 1
Copyright: Public Domain
License: Apache License 2.0
