MetaPUF


An approach to integrate metagenomics, metatranscriptomics and metaproteomics data found in public resources such as MGnify (for metagenomics/metatranscriptomics) and the PRIDE database (for metaproteomics).

Code Snippets

shell:
    """
    if [ -z "{params.study}" ]; then
        python workflow/scripts/metagenomics_db/main.py -d {params.input_dir} -v {params.ver} -o {params.output_dir} -m {input.sample_metadata} -b {params.db_size} &> {log}
    else
        python workflow/scripts/metagenomics_db/main.py -s {params.study} -v {params.ver} -o {params.output_dir} -m {input.sample_metadata} -b {params.db_size} &> {log}
    fi
    """
shell:
    "python workflow/scripts/gff_generation/main.py -i {input.metap_sample_info} -r {params.reports_dir} -m {params.metag_dir} -p {params.pride_id}"
shell:
    "unzip {input.zip} -d {params.peptideshaker_tooldir} > {log}"
shell:
    "python workflow/scripts/searchgui_search.py -p -jar {input.jar} -in {input.info} "
    "-out {params.outputdir}"
shell:
    "python workflow/scripts/post_report_generation/main.py -s {input.info} -d {params.dir} -p {params.pid} &> {log}"
shell:
    "unzip {input.exe_zip} -d {params.thermorawfileparser_tool} > {log}"
shell:
    "python workflow/scripts/coping_raw_files.py -exe {input.exe} -info {input.info} -in {params.raw} -out {params.folder}"
shell:
    "tar -xzvf {input.zip} -C {params.searchgui_tooldir} > {log}"
shell:
    "python workflow/scripts/generating_decoy.py -jar {input.jar} -info {input.info} "
    "-human {input.human_db} -crap {input.crap_db} -sgpars '{params.searchgui_params}' "
    "-adir {params.assemblies_dir}"
shell:
    "python workflow/scripts/searchgui_search.py -s -jar {input.jar} -in {input.info} "
    "-raw {input.raw} -out {params.outdir}"
workflow/scripts/coping_raw_files.py
import argparse
import os
import subprocess
import sys

import pandas as pd


def get_args():
    """Collect the inputs for the executable scripts."""
    parser = argparse.ArgumentParser(
        description="""This script maps the samples, raw files and searching databases
            between PRIDE and MGnify, then executes shell scripts. """
    )

    parser.add_argument(
        "-info",
        "--sampleinfo",
        dest="info_file",
        help="the sample information file, where stores the mapping information for samples, databases and raw files",
    )
    parser.add_argument(
        "-in",
        "--input_folder",
        dest="input_folder",
        help="the folder path where raw files are",
    )
    parser.add_argument(
        "-out",
        "--output",
        dest="output_folder",
        help="the folder path for saving files",
    )
    parser.add_argument("-exe", "--exefile", dest="exe_file", help="the exec file")

    args = parser.parse_args()

    if args.info_file is None:
        sys.exit("Error: no info file (sample information file)")
    if args.output_folder is None:
        sys.exit("Error: no input folder path provided!")
    if args.exe_file is None:
        sys.exit("Error: no exec file provided!")

    return args


# sample_info = pd.read_csv("sample_info_final.csv", sep=',')
def thermorawfileparser(exe_file, info_file, input_folder, output_folder):
    """
    Download raw files via FTP (or read them locally) and convert them into peak list files based on the sample information
    :info_file:     sample information file contains raw files, samples, mapped searching databases
    :input_folder: input folder path for raw data (if not from rawurls)
    :output_folder: output folder path for peak list data
    """
    sample_info = pd.read_csv(info_file, sep=",")
    sample_info = sample_info[["Sample", "Raw file", "Raw file URLs"]].drop_duplicates()
    RawURLs = sample_info["Raw file URLs"].dropna().to_list()

    if len(RawURLs) > 0:
        samples = {
            k: g["Raw file URLs"].tolist() for k, g in sample_info.groupby("Sample")
        }
        for sample in samples.keys():
            # generate the folder if it is not there
            folder = os.path.join(output_folder, sample)
            os.makedirs(folder, exist_ok=True)
            # download the raw files if they are not provided locally.
            commandline = ""
            for url in samples[sample]:
                commandline = "wget -P " + folder + " -i " + url + ";"

            commandline += " mono " + exe_file + " -d=" + folder + " -o=" + folder
            # -f=0 writes MGF peak lists (-f=1 would select mzML)
            commandline += " -f=0 -m=0 &> logs/" + sample + "_thermorawfileparser.log"
            subprocess.run(commandline, shell=True)

    else:
        samples = {k: g["Raw file"].tolist() for k, g in sample_info.groupby("Sample")}
        for sample in samples.keys():
            # generate the folder if it is not there
            in_folder = os.path.join(input_folder, sample)
            out_folder = os.path.join(output_folder, sample)
            os.makedirs(out_folder, exist_ok=True)
            commandline = f" mono {exe_file} -d={in_folder} -o={out_folder}"
            # -f=0 writes MGF peak lists (-f=1 would select mzML)
            commandline += " -f=0 -m=0 &> logs/" + sample + "_thermorawfileparser.log"
            subprocess.run(commandline, shell=True)


def main():
    """
    Main function
    """
    args = get_args()

    thermorawfileparser(
        args.exe_file, args.info_file, args.input_folder, args.output_folder
    )


if __name__ == "__main__":
    main()
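
For orientation, the command string that the local-file branch of thermorawfileparser() assembles can be previewed without running anything. A minimal sketch, assuming a hypothetical sample name, executable path and folder layout (all illustrative only):

import os

exe_file = "ThermoRawFileParser.exe"      # hypothetical path to the parser executable
sample = "ERS12345"                       # hypothetical sample accession
in_folder = os.path.join("input/raw", sample)
out_folder = os.path.join("output/peaks", sample)

commandline = f" mono {exe_file} -d={in_folder} -o={out_folder}"
commandline += " -f=0 -m=0 &> logs/" + sample + "_thermorawfileparser.log"
print(commandline)
# " mono ThermoRawFileParser.exe -d=input/raw/ERS12345 -o=output/peaks/ERS12345 -f=0 -m=0 &> logs/ERS12345_thermorawfileparser.log"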
workflow/scripts/generating_decoy.py
import argparse
import json
import logging
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd


def get_args():
    """Collect the inputs for the executable scripts."""
    parser = argparse.ArgumentParser(
        description="""This script is going to add the human and contanminat protein sequences
                into the generated databases and also add decoy into every database"""
    )

    parser.add_argument(
        "-info",
        "--sampleinfo",
        dest="info_file",
        help="the final sample information file, where also stores generaed databases",
    )
    parser.add_argument(
        "-human",
        "--humandb",
        dest="human_db",
        help="the human reference protein sequences",
    )
    parser.add_argument(
        "-crap", "--crapdb", dest="crap_db", help="the contanminat sequences"
    )
    parser.add_argument("-jar", "--jarfile", dest="jar_file", help="the jar file")
    parser.add_argument(
        "-sgpars",
        "--searchgui_params",
        dest="searchgui_params",
        help="json-like string of options to pass to searchgui",
    )
    parser.add_argument(
        "-adir",
        "--assemblies_dir",
        dest="assemblies_dir",
        help="the output assemblies directory path",
    )

    args = parser.parse_args()

    if args.info_file is None:
        sys.exit("Error: no info file (sample information file)")
    if args.human_db is None:
        sys.exit("Error: no human reference protein database provided!")
    if args.crap_db is None:
        sys.exit("Error: no contanminat protein database provided!")
    if args.jar_file is None:
        sys.exit("Error: no jar file provided!")

    return args


def searchgui_decoy(
    jar_file, info_file, human_db, crap_db, searchgui_params, assemblies_folder
):
    """
    Add human, contaminant and decoy sequences to the protein databases.
    :jar_file:      the searchgui jar file (whole path)
    :info_file:     final sample information file also contains mapped searching databases
    :human_db:      the human protein reference sequences
    :crap_db:       the contaminant protein sequences
    :assemblies_folder: the output/assemblies dir path
    """
    sample_info = pd.read_csv(info_file, sep=",")
    proteins = sample_info["Db_name"].dropna().drop_duplicates().to_list()

    config = json.loads(searchgui_params)
    SEARCHGUI_PAR_PARAMS = " ".join(
        [
            "-%s %s" % (k, "'%s'" % v if isinstance(v, str) else str(v))
            for k, v in config.items()
        ]
    )

    for protein in proteins:
        protein_file = f"{assemblies_folder}/databases/{protein}"
        print(f"On {protein = } with {protein_file = }")
        fasta = Path(protein_file).resolve().with_suffix(".fasta").as_posix()
        par = f"{Path(protein_file).parent.resolve()}/{Path(protein_file).stem}_searchgui.par"
        db = f"{Path(fasta).parent.resolve()}/{Path(fasta).stem}_concatenated_target_decoy.fasta"
        log = Path(protein_file).stem + "_SearchGUI_params.log"

        commandline = (
            "cat " + protein_file + " " + human_db + " " + crap_db + " > " + fasta + ";"
        )
        # generating the protein decoy
        params = " -in " + fasta
        params += " -decoy &> "
        params += "logs/" + protein.split(".")[0] + "_SearchGUI_search.log;"

        commandline += " java -cp " + jar_file + " eu.isas.searchgui.cmd.FastaCLI"
        commandline += params

        # generating the protein parameter file
        params = f" -out {par} "
        params += f" -db {db} "
        params += SEARCHGUI_PAR_PARAMS
        params += f" &> logs/{log};"
        commandline += (
            " java -cp "
            + jar_file
            + " eu.isas.searchgui.cmd.IdentificationParametersCLI"
        )
        commandline += params

        print(f"Will run {commandline}")
        subprocess.run(commandline, shell=True)
        # print(commandline)

    # commandline = "mkdir -p " + s_dir
    # subprocess.run(commandline, shell=True)
    flag_txt = (
        f"{assemblies_folder}/databases/proteins_decoy_params_generated_check.txt"
    )
    with open(flag_txt, "w") as f:
        f.write("All protein databases have been processed!")


def main():
    """
    Main function
    """
    args = get_args()

    searchgui_decoy(
        args.jar_file,
        args.info_file,
        args.human_db,
        args.crap_db,
        args.searchgui_params,
        args.assemblies_dir,
    )


if __name__ == "__main__":
    main()
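
The -sgpars value is a JSON string that searchgui_decoy() flattens into command-line flags for SearchGUI's IdentificationParametersCLI. A minimal sketch of that conversion, using hypothetical parameter names (the real keys depend on the SearchGUI options configured for the workflow):

import json

searchgui_params = '{"prec_tol": 10, "frag_tol": 0.5, "enzyme": "Trypsin"}'  # illustrative keys only

config = json.loads(searchgui_params)
flags = " ".join(
    "-%s %s" % (k, "'%s'" % v if isinstance(v, str) else str(v))
    for k, v in config.items()
)
print(flags)  # -prec_tol 10 -frag_tol 0.5 -enzyme 'Trypsin'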
workflow/scripts/gff_generation/main.py
import logging
import os
import time
from argparse import ArgumentParser

import gff_builder as gb
import pandas as pd


def dir_path(string):
    if os.path.isdir(string):
        return string
    else:
        raise NotADirectoryError(string)


def main():  # noqa: C901
    """
    Aggregate information from metaproteomics and metagenomics about the expressed proteins
    and generate GFF format files for visualisation
    """
    parser = ArgumentParser(
        description="Aggregate information from metaproteomics and metagenomics about the expressed proteins and generate GFF format files for visualisation"
    )
    parser.add_argument(
        "-i",
        "--sample_info",
        type=str,
        required=True,
        help="Absolute path of the sample metadata file",
    )
    parser.add_argument(
        "-r",
        "--reports_dir",
        type=dir_path,
        required=True,
        help="full path of the directory containing processed metaproteomics reports",
    )
    parser.add_argument(
        "-m",
        "--metag_dir",
        type=dir_path,
        required=True,
        help="full path of the directory containing contig metadata file",
    )
    parser.add_argument(
        "-p",
        "--pride_id",
        type=str,
        required=True,
        help="PRIDE id for the reanalysed study",
    )

    starttime = time.time()
    args = parser.parse_args()
    # read in sample_info file and get the sample names in a list
    sample_info = pd.read_csv(args.sample_info, sep=",")

    # check if the results folder exists or create the results folder
    results_folder = os.path.join(args.reports_dir, "results")
    os.makedirs(results_folder, exist_ok=True)

    # get the filenames as list and concatenate the expressed peptides from all the samples
    sample_file_list = [
        args.reports_dir + "/" + f
        for f in os.listdir(args.reports_dir)
        if f.endswith("_peptide_report.csv")
    ]
    csv_list = []

    for file in sorted(sample_file_list):
        csv_list.append(pd.read_csv(file))

    csv_merged = pd.concat(csv_list, ignore_index=True)
    csv_merged.to_csv(os.path.join(results_folder, "allsamples.csv"), index=False)
    proteins = list(set(csv_merged["Protein"].to_list()))
    # create a new dataframe with unique peptide digests
    unique_proteins = pd.DataFrame(proteins, columns=["digest"])
    # get the list of distinct assemblies in the study
    assemblies = list(set(sample_info["Assembly"].to_list()))

    for assembly in assemblies:
        temp_assembly = pd.read_csv(
            os.path.join(args.metag_dir, assembly + "_contig_info.txt"), sep="\t"
        )
        temp_assembly.columns = [
            "assembly",
            "contig_name",
            "digest",
            "partial_info",
            "protein_start",
            "protein_end",
            "strand",
            "caller",
        ]
        # get all the matching expressed proteins from the assembly's _contig_info.txt
        assembly_subset_of_proteins = pd.merge(
            unique_proteins, temp_assembly, on="digest", how="inner"
        )

        # get all the peptides expressed in the given assembly
        assembly_expressed_proteins = assembly_subset_of_proteins.merge(
            csv_merged, left_on="digest", right_on="Protein", how="inner"
        )

        if len(assembly_expressed_proteins) > 0:
            assembly_expressed_proteins["max_spectrum_count"] = gb.calculate_max_count(
                assembly_expressed_proteins
            )
            assembly_expressed_proteins.to_csv(
                os.path.join(results_folder, assembly + "_expressed_proteins.csv")
            )
            expressed_proteins = list(set(assembly_expressed_proteins["digest"]))
            attributes_file, max_spectrum_count = gb.protein_report_processing(
                assembly_expressed_proteins, expressed_proteins, args.pride_id
            )
            if len(attributes_file) >= 1:
                gb.gff_generation_unique(
                    attributes_file, assembly, results_folder, max_spectrum_count
                )

    logging.info("Completed")
    logging.info("Runtime is {} seconds".format(time.time() - starttime))


if __name__ == "__main__":
    log_file = "logs/gff_generate.log"
    logging.basicConfig(
        filename=log_file,
        filemode="a",
        level=logging.DEBUG,
        format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )
    main()
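
The aggregation above is essentially a two-step join: unique protein digests from the peptide reports are matched to contig metadata on the digest column, and the result is joined back to the merged peptide report on Protein. A toy illustration with made-up values:

import pandas as pd

unique_proteins = pd.DataFrame({"digest": ["abc123", "def456"]})  # made-up digests
contig_info = pd.DataFrame(
    {"assembly": ["ERZ001"], "contig_name": ["contig_1"], "digest": ["abc123"]}
)
peptide_report = pd.DataFrame({"Protein": ["abc123"], "Peptide": ["PEPTIDEK"]})

subset = pd.merge(unique_proteins, contig_info, on="digest", how="inner")
expressed = subset.merge(peptide_report, left_on="digest", right_on="Protein", how="inner")
print(expressed)  # one row linking contig_1 to the expressed peptide PEPTIDEK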
workflow/scripts/metagenomics_db/main.py
import gzip
import logging
import os
import shutil
import subprocess
import sys
import time
from argparse import ArgumentParser
from collections import defaultdict

import fetch_data as fd
import get_clusters as gc
import pandas as pd
import screed
import sourmash


def dir_path(string):
    if os.path.isdir(string):
        return string
    else:
        raise NotADirectoryError(string)


def main():  # noqa: C901
    """
    Generates protein databases containing non-redundant proteins from
    metagenomics and/or metatranscriptomics assemblies
    """
    parser = ArgumentParser(
        description="Generate protein sequence database for Metagenomics and /or Metatranscriptomics datasets"
    )
    """
    The user can enter their own data using --input_dir option
    or
    select --study to analyse study published on MGnify website
    """

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-s",
        "--study",
        type=str,
        help="Secondary study accession of the assembled study in MGnify starting with ERP/SRP/DRP ",
    )
    group.add_argument(
        "-d",
        "--input_dir",
        type=dir_path,
        help="full path of the study folder containing protein files ",
    )
    parser.add_argument(
        "-v",
        "--ver",
        type=str,
        required=True,
        help="pipeline version of MGnify analysis for the study",
    )
    parser.add_argument(
        "-o",
        "--output_dir",
        type=dir_path,
        required=True,
        help="full path of the study folder containing results",
    )
    parser.add_argument(
        "-m",
        "--metadata",
        type=str,
        required=True,
        help="full path of the sample-assembly mapping file (.csv) with filename.",
    )
    parser.add_argument(
        "-b",
        "--db_size",
        type=int,
        help="input the maximum size of the protein search database in bytes",
        default=1073741824,
    )

    starttime = time.time()
    args = parser.parse_args()
    sample_assembly_map = defaultdict(set)

    assembly_folder = os.path.join(args.output_dir, "assemblies")
    os.makedirs(assembly_folder, exist_ok=True)

    logging.info(f"Will read samples info from {args.metadata}")
    samples = pd.read_csv(args.metadata, sep=",")

    os.chdir(args.output_dir)
    if args.study:
        logging.info(f"Will download data from MGnify API for study {args.study}")
        fd.check_study_accession(args.study)
        # getting the sequence data and predicted CDS
        cmd_get_data = "  ".join(
            [
                "mg-toolkit -d bulk_download -a",
                args.study,
                "-p ",
                args.ver,
                "-g sequence_data",
            ]
        )
        subprocess.call(cmd_get_data, shell=True)
        sequence_dir = (
            args.output_dir + "/" + args.study + "/" + args.ver + "/sequence_data"
        )

    elif args.input_dir:
        logging.info(f"Will use MGnify data from input dir {args.input_dir}")
        ## Assembly input files should be gzipped and end with "_FASTA.fasta.gz"
        ## Prodigal (v.2.6.3) predicted CDS file should be gzipped and end with "_FASTA_predicted_cds.faa.gz"
        if len(os.listdir(args.input_dir)) == 0:
            sys.exit("{} is empty".format(args.input_dir))
        else:
            sequence_dir = str(args.input_dir)
        for input_file in os.listdir(args.input_dir):
            if input_file.endswith("_FASTA.fasta.gz"):
                logging.info(f"Input file {input_file} is correctly named")
                continue
            elif input_file.endswith("_FASTA_predicted_cds.faa.gz"):
                logging.info(f"Input file {input_file} is correctly named")
                continue
            else:
                logging.warning(
                    f"The input file {input_file} is not named with correct naming convention. Please check documentation for correct naming convention"
                )

    for idx, row in samples.iterrows():
        sample_assembly_map[row["Secondary Sample Accession"]].add(row["Assembly"])
    logging.info("Mapping between samples and the assemblies: ", sample_assembly_map)
    for k, v in sample_assembly_map.items():
        sample_file = os.path.join(assembly_folder, k + ".fasta.gz")
        logging.info(f"Reading assemblies from sample {sample_file}")
        with gzip.open(sample_file, "wt") as wf:
            for assembly in v:
                # renaming headers of the predicted cds in assemblies
                fd.rename_contigs(assembly, assembly_folder, sequence_dir)
                with gzip.open(
                    os.path.join(sequence_dir, assembly + "_FASTA.fasta.gz"), "rt"
                ) as infile:
                    shutil.copyfileobj(infile, wf)
    matrix_file = os.path.join(assembly_folder, "matrix_file.tsv")

    meta_genomes = [
        assembly_folder + "/" + k + ".fasta.gz" for k in sample_assembly_map.keys()
    ]
    minhashes = []
    for contigs in meta_genomes:
        mh = sourmash.MinHash(ksize=31, n=0, scaled=1000)
        for record in screed.open(contigs):
            query_seq = record.sequence
            mh.add_sequence(query_seq, True)
        minhashes.append(mh)
    with open(matrix_file, "w") as f_out:
        for i, e in enumerate(minhashes):
            basename = os.path.basename(meta_genomes[i])
            name = ".".join((basename).split(".")[:-2])
            f_out.write(name + ",")
            for j, e2 in enumerate(minhashes):
                x = e.jaccard(minhashes[j])
                f_out.write(str(round(x, 3)) + ",")
            f_out.write("\n")
    data = pd.read_csv(matrix_file, sep=",", header=None, index_col=[0])
    data = data.dropna(axis="columns", how="all")
    data.index.names = ["index"]
    col_names = data.index.to_list()
    data.columns = col_names
    logging.info(f"matrix data frame: {data}")
    # returns a dictionary containing assembly name and count of proteins in the assembly

    proteins_info = fd.count_proteins(sample_assembly_map, sequence_dir)
    logging.info(f"Proteins info: {proteins_info}")
    database_folder = os.path.join(assembly_folder, "databases")
    os.makedirs(database_folder, exist_ok=True)
    # returns a dictionary with the group number and assemblies in the group
    samples_in_cluster = gc.generate_clusters(
        data,
        args.db_size,
        proteins_info,
        args.study or args.input_dir,
        database_folder,
        sample_assembly_map,
    )
    logging.info(f"Samples in the cluster: {samples_in_cluster}")
    if args.study:
        fd.build_db(
            args.study, database_folder, assembly_folder, samples, samples_in_cluster
        )
    elif args.input_dir:
        fd.build_db(
            "study", database_folder, assembly_folder, samples, samples_in_cluster
        )
    for file in os.listdir(database_folder):
        if file.endswith(".faa") and not file.startswith("unique"):
            protein_file = os.path.join(database_folder, file)
            fd.uniq_proteins(database_folder, file)
            fd.remove_file(protein_file)
    logging.info("Completed")
    logging.info("Runtime is {} seconds".format(time.time() - starttime))


if __name__ == "__main__":
    log_file = "logs/db_generate.log"
    logging.basicConfig(
        filename=log_file,
        filemode="a",
        level=logging.DEBUG,
        format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )
    main()
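
The similarity matrix that drives the clustering is built from sourmash MinHash sketches compared with the Jaccard index. A toy comparison of two short sequences (parameters shrunk to ksize=7, scaled=1 so the short strings register at all; the script above sketches whole assemblies with ksize=31, scaled=1000):

import sourmash

seq_a = "ATGCGTACGTTAGCATGCGTACGTTAGC"
seq_b = "ATGCGTACGTTAGCTTTTGGGGCCCCAA"

mh_a = sourmash.MinHash(n=0, ksize=7, scaled=1)
mh_b = sourmash.MinHash(n=0, ksize=7, scaled=1)
mh_a.add_sequence(seq_a, True)  # True skips invalid k-mers, as in the script above
mh_b.add_sequence(seq_b, True)
print(round(mh_a.jaccard(mh_b), 3))  # estimated fraction of shared k-mers, in [0, 1]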
workflow/scripts/post_report_generation/main.py
import os
from argparse import ArgumentParser
from pathlib import Path

import pandas as pd
import track_hubs as th


def dir_path(string):
    if os.path.isdir(string):
        return string
    else:
        raise NotADirectoryError(string)


def main():
    """
    Aggregate information from metaproteomics and metagenomics about the expressed proteins
    and generate processed peptide reports for gff file generation
    """
    parser = ArgumentParser(
        description="Metadata for the post processing of peptide reports"
    )
    parser.add_argument(
        "-s",
        "--sample_info",
        type=str,
        required=True,
        help="Absolute path of the sample metadata file",
    )
    parser.add_argument(
        "-d",
        "--dir",
        type=str,
        required=True,
        help="output directory",
    )
    parser.add_argument(
        "-p",
        "--pride_id",
        type=str,
        required=True,
        help="PRIDE id for the reanalysed study",
    )

    args = parser.parse_args()
    sample_info = pd.read_csv(args.sample_info, sep=",")
    samples = list(set(sample_info["Sample"].to_list()))

    commandline = "mkdir -p " + args.dir
    subprocess.run(commandline, shell=True)

    peptideshaker_dir = Path(args.dir).parent.absolute() / "peptideshaker"

    for sample in samples:
        th.get_track_beds(
            f"{peptideshaker_dir}/peptideshaker_"
            + sample
            + "_1_Default_Peptide_Report.txt",
            f"{peptideshaker_dir}/peptideshaker_"
            + sample
            + "_1_Default_Protein_Report.txt",
            os.path.join(args.dir, "processed_" + sample + "_peptide_report.csv"),
            args.pride_id,
        )


if __name__ == "__main__":
    main()
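
Note that the PeptideShaker reports are looked up in a "peptideshaker" directory that sits next to the directory passed via -d. A small sketch of that path derivation with a hypothetical layout:

from pathlib import Path

out_dir = "results/processed_reports"                      # hypothetical -d argument
peptideshaker_dir = Path(out_dir).parent.absolute() / "peptideshaker"
sample = "ERS12345"                                        # hypothetical sample name
print(f"{peptideshaker_dir}/peptideshaker_{sample}_1_Default_Peptide_Report.txt")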
workflow/scripts/searchgui_search.py
import argparse
import subprocess
import sys

import pandas as pd


def get_args():
    """Collect the inputs for the executable scripts."""
    parser = argparse.ArgumentParser(
        description="""This script is going to map the samples, raw files and searching databases
            between PRIDE and MGnify, then execute shell scripts. """
    )

    parser.add_argument(
        "-in",
        "--input",
        dest="input_file",
        help="the sample information file, where stores the mapping information for samples, databases and raw files",
    )
    parser.add_argument(
        "-raw",
        "--raw_folder",
        dest="raw_folder",
        help="the folder path where .raw files are found",
        required=False,
    )
    parser.add_argument(
        "-out",
        "--output_folder",
        dest="output_folder",
        help="the folder path in which searchgui and peptideshaker folder will be saved/read",
    )
    parser.add_argument("-jar", "--javafile", dest="jar_file", help="the jar file")
    parser.add_argument(
        "-s",
        "--searchgui",
        dest="searchgui",
        help="seargui search process",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-p",
        "--peptideshaker",
        dest="peptideshaker",
        help="peptideshaker load process",
        action="store_true",
        default=False,
    )
    args = parser.parse_args()

    if args.input_file is None:
        sys.exit("Error: no input file (sample information file)")
    if args.output_folder is None:
        sys.exit("Error: no output path provided!")
    if args.jar_file is None:
        sys.exit("Error: no jar file provided!")
    if not (args.searchgui or args.peptideshaker):
        sys.exit("Error: no processing statues is given: searchgui or peptideshaker")
    if args.searchgui + args.peptideshaker > 1:
        sys.exit("Error: more than one processing statues is provided")

    return args


def searchgui_search(jar_file, input_file, raw_folder, output_folder):
    """
    run searchgui shell script within a loop based on the mapping sample information
    :input_file:    sample information file contains raw files, samples, mapped searching databases
    :jar_file:      the searchgui jar file
    :raw_folder:    folder containing the .raw PRIDE files
    :output_folder: output folder path for searchgui search results
    """
    sample_info = pd.read_csv(input_file, sep=",")
    samples = sample_info["Sample"].drop_duplicates().to_list()

    for sample in samples:
        params = " -spectrum_files"
        params += f" {raw_folder}/" + sample
        # params += " -fasta_file"
        fasta = sample_info.loc[sample_info["Sample"] == sample, "Db_name"].iloc[0]
        print(f"On sample {sample = } with {fasta =}")
        fasta = fasta.split(".")[0]
        params += f" -output_folder {output_folder}/searchgui"
        params += " -id_params"
        params += f" {output_folder}/assemblies/databases/" + fasta + "_searchgui.par"
        params += " -xtandem 1 -msgf 1"
        params += " -output_default_name"
        params += " " + sample + "_searchgui"
        params += " -output_option 0 -output_data 1 -output_date 0"
        params += " -log logs/SearchGUI_search &> "
        params += "logs/" + sample + "_SearchGUI_search.log"
        params += " && touch "
        params += f"{output_folder}/searchgui/{sample}_searchgui.zip"

        commandline = "java -cp " + jar_file + " eu.isas.searchgui.cmd.SearchCLI"
        commandline += params

        print(f"Will run: {commandline}")
        subprocess.run(commandline, shell=True)


def peptideshaker_load(
    jar_file,
    input_file,
    output_folder,
):
    """
    run peptideshaker shell script within a loop based on the mapping sample information
    :input_file:    sample information file contains raw files, samples, mapped searching databases
    :jar_file:      the peptideshaker jar file
    :output_folder: output folder path for peptideshaker results
    """
    sample_info = pd.read_csv(input_file, sep=",")
    samples = sample_info["Sample"].drop_duplicates().to_list()

    for sample in samples:
        print(f"Load peptideshaker for sample {sample}")
        params = " -experiment 'peptideshaker' -sample " + sample + " -replicate 1 "
        params += (
            f" -identification_files {output_folder}/searchgui/{sample}_searchgui.zip"
        )
        params += f" -out_reports {output_folder}/peptideshaker"
        params += " -reports 6,9"
        params += (
            f" -output_file {output_folder}/peptideshaker/{sample}_peptideshaker.mzid"
        )
        params += " &> logs/" + sample + "_PeptideShaker_load.log"

        commandline = (
            "java -cp " + jar_file + " eu.isas.peptideshaker.cmd.PeptideShakerCLI"
        )
        commandline += params
        print(f"Will run peptideshaker command {commandline}")

        subprocess.run(commandline, shell=True)


def main():
    """
    Main function
    """
    args = get_args()

    if args.searchgui:
        searchgui_search(
            args.jar_file, args.input_file, args.raw_folder, args.output_folder
        )
    elif args.peptideshaker:
        peptideshaker_load(
            args.jar_file,
            args.input_file,
            args.output_folder,
        )
    else:
        sys.exit("BUG! this should not happen.")


if __name__ == "__main__":
    main()
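
Each sample is searched against the database mapped to it in the sample information file; the lookup in searchgui_search() is a plain pandas selection on the Sample column. A toy illustration with made-up rows and database names:

import pandas as pd

sample_info = pd.DataFrame(
    {"Sample": ["ERS111", "ERS111", "ERS222"],
     "Db_name": ["cluster1.faa", "cluster1.faa", "cluster2.faa"]}  # illustrative values
)

for sample in sample_info["Sample"].drop_duplicates().to_list():
    fasta = sample_info.loc[sample_info["Sample"] == sample, "Db_name"].iloc[0]
    print(sample, fasta.split(".")[0] + "_searchgui.par")
# ERS111 cluster1_searchgui.par
# ERS222 cluster2_searchgui.par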

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/PRIDE-reanalysis/MetaPUF
Name: metapuf
Version: 1
Copyright: Public Domain
License: Apache License 2.0
