RepairNatrix - An example workflow


DAG of an example workflow for RepairNatrix

Dependencies

Code Snippets

script:
    "../scripts/constraint_filtering.py"
script:
    "../scripts/constraints/check_constraints.py"
shell:
    "seqtk seq -a {input} > {output}"
shell:
    "seqtk seq {input} > {output}"
script:
    "../scripts/contig_assembly.py"
shell:
    "cp {input} {output}"
script:
    "../scripts/demultiplexing.py"
shell:
     "gunzip -c {input} > {output}"
shell:
    "vsearch --derep_fulllength {input} --output {output} --log {log} --minuniquesize {params.minsize}"
shell:
    "vsearch --cluster_size {input} --centroids {output.cent} --id {params.id} --clusters results/assembly/{wildcards.sample}_{wildcards.unit}/clust  --log {log} --threads {threads}"
script:
    "../scripts/derep_quality.py"
shell:
    "fastq-sort --mean-qual -r {input} > {output}"
shell:
    "seqtk seq -a {input} > {output}"
shell:
    "vsearch --usersort --cluster_smallmem {input} --centroid {output.cent} --id {params.id} --clusters results/assembly/{wildcards.sample}_{wildcards.unit}/clust --log {log} --threads {threads}"
script:
    "../scripts/define_primer.py"
script:
    "../scripts/prinseq.py"
script:
    "../scripts/assembly.py"
shell:
    "seqtk seq -a {input} > {output}"
import re
import yaml
import dinopy
import logging
import subprocess
import numpy as np
import pandas as pd
from glob import glob

# Script to assemble paired-end reads using PandaSeq.
# If the primers were not removed beforehand, PandaSeq will remove them.
# If the sequences are single-end, the primers will be cut off
# and the remaining length of each sequence will be compared to the
# cutoff values defined in the config.
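# For illustration only (hypothetical file names and parameter values), the
# paired-end call with primers already removed expands to roughly:
#   pandaseq -f r1.fastq -r r2.fastq -B -a -F -g pandaseq.log -w assembled.fastq \
#            -N -T 4 -t 0.6 -o 15 -l 100 -L 200 -Cmin_phred:25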

primer_table = pd.read_csv(snakemake.input.primer_t, index_col="Probe",
        na_filter=False).to_dict("index")

if snakemake.params.paired_end:
    if snakemake.params.prim_rm:
        subprocess.call(["pandaseq",
            "-f",snakemake.input[0], "-r", snakemake.input[1], "-B", "-a", "-F",
            "-g", str(snakemake.log),
            "-w", str(snakemake.output), "-N",
            "-T", str(snakemake.threads),
            "-t", str(snakemake.params.threshold),
            "-o", str(snakemake.params.minoverlap),
            "-l", str(snakemake.params.minlen),
            "-L", str(snakemake.params.maxlen),
            "-C" "min_phred:" + str(snakemake.params.minqual)])
    else:
        r1_primer = primer_table[snakemake.wildcards.sample + "_"
            + snakemake.wildcards.unit]["specific_forward_primer"]
        r2_primer = primer_table[snakemake.wildcards.sample + "_"
            + snakemake.wildcards.unit]["specific_reverse_primer"]

        subprocess.call(["pandaseq",
            "-f", snakemake.input[0], "-r", snakemake.input[1], "-B", "-a", "-F",
            "-g", str(snakemake.log),
            "-w", str(snakemake.output),"-N",
            "-p", r1_primer, "-q", r2_primer,
            "-T", str(snakemake.threads),
            "-t", str(snakemake.params.threshold),
            "-o", str(snakemake.params.minoverlap),
            "-l", str(snakemake.params.minlen),
            "-L", str(snakemake.params.maxlen),
            "-C" "min_phred:" + str(snakemake.params.minqual)])
else:
    logging.basicConfig(filename=str(snakemake.log),
            level=logging.DEBUG)
    iupac_dict_regex = {"M":"[AC]", "R":"[AG]", "W":"[AT]", "S":"[CG]",
            "Y":"[CT]","K":"[GT]", "V":"[ACG]", "H":"[ACT]",
            "D":"[AGT]", "B":"[CGT]", "X":"[ACGT]", "N":"[ACGT]"}

    # Function to remove the primer, barcode & polyN from the sequences
    # and compare the sequence length to the cutoffs defined in the config.
    def primer_len_filter(path, sample):
        sequence = dinopy.FastqReader(path)
        assembled = dinopy.FastqWriter(path.rsplit("_", 1)[0]
                + "_assembled.fastq")
        filt_out = dinopy.FastqWriter(path.rsplit("_", 1)[0]
                + "_filtered_out.fastq")
        assembled_counter = 0
        filt_out_counter = 0
        assembled.open()
        filt_out.open()
        for read in sequence.reads(quality_values=True):
            name = read.name.decode()
            seq = check_for_match(read.sequence.decode(), sample)
            if seq[0] and snakemake.params.maxlen >= len(seq[1]) >= snakemake.params.minlen:
                assembled.write(seq[1].encode(), name.split(" ")[0].encode(), read.quality)
                assembled_counter += 1
            else:
                filt_out.write(read.sequence, name.split(" ")[0].encode(), read.quality)
                filt_out_counter += 1
        logging.info("{}: {} sequences were kept, \
                {} sequences were filtered out".format(sample,
                    assembled_counter, filt_out_counter))
        assembled.close()
        filt_out.close()

    # Helper function for the IUPAC extended nucleotide base code.
    def iupac_replace(sequence, iupac_dict):
        for i, j in iupac_dict.items():
            sequence = sequence.replace(i, j)
        return sequence
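    # Example: iupac_replace("ANN", iupac_dict_regex) -> "A[ACGT][ACGT]"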

    # Function to search and remove primer and barcode sequences.
    # It will also search for matching sequences two bases shifted
    # to the left and right from the position defined by the primertable,
    # to account for potential errors in the polyN column of the primertable.
    def check_for_match(sequence, sample):
        if snakemake.params.prim_rm:
            return (True, sequence)
        else:
            poly_prim_bar = [primer_table[sample][key] for key
                    in primer_table[sample].keys() if key in ["poly_N",
                        "specific_forward_primer", "Barcode_forward"]]
            prim_bar = re.compile(poly_prim_bar[1]
                    + iupac_replace(poly_prim_bar[2], iupac_dict_regex))
            for i in [0, 1, -1, 2, -2]:
                start = np.clip(len(primer_table[sample]["poly_N"]) + i,
                        a_min=0, a_max=None)
                end = np.clip(len("".join(poly_prim_bar)) + i, a_min=0,
                        a_max=None)
                if prim_bar.match(sequence[start : end]):
                    return (True, sequence.replace(sequence[ : end], ""))
            # none of the shifted windows matched
            return (False, sequence)

    primer_len_filter(snakemake.input[0],
            snakemake.input[0].split("/")[-1].rsplit("_", 1)[0])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import json

import constraints

from multiprocessing import Pool
from itertools import zip_longest

import dinopy

fqr_1 = dinopy.FastqReader(str(snakemake.input[0]))
if snakemake.params.paired:
    fqr_2 = dinopy.FastqReader(str(snakemake.input[1]))
else:
    fqr_2 = list()

filtered_1 = dinopy.FastqWriter(str(snakemake.output[0]))
if not snakemake.params.paired:
    filtered_out_1 = dinopy.FastqWriter(str(snakemake.output[1]))
else:
    filtered_2 = dinopy.FastqWriter(str(snakemake.output[1]))
    filtered_out_1 = dinopy.FastqWriter(str(snakemake.output[2]))
    filtered_out_2 = dinopy.FastqWriter(str(snakemake.output[3]))


filtered_1.open()
filtered_out_1.open()
if snakemake.params.paired:
    filtered_2.open()
    filtered_out_2.open()


def filter(reads):
    sequences = []
    sequences.append(reads[0][0].decode())
    if snakemake.params.paired:
        sequences.append(reads[1][0].decode())
    for i,sequence in enumerate(sequences):
        if snakemake.params.primer_length > 0:
            sequence = sequence[snakemake.params.primer_length:snakemake.params.sequence_length+snakemake.params.primer_length]
        for constraint in constraints.constraints:
            if (constraint.__name__ in snakemake.params.constraints['constraints']):
                if (constraint.__name__ == 'undesired_subsequences'):
                    failed = constraint(sequence)
                else:
                    failed = constraint(sequence, **snakemake.params.constraints[constraint.__name__])
                if failed:
                    return reads, failed, constraint.__name__, i+1
    return reads, False, '', None
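# Each worker returns (reads, failed, constraint_name, read_index); e.g. a truthy
# `failed` with constraint_name 'undesired_subsequences' and read_index 1 means
# read 1 of the pair violated the undesired-subsequences constraint.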

def reads(reads):
    if isinstance(reads, list):
        return
    else:
        for read in reads.reads(quality_values=True):
            yield [read.sequence, read.name, read.quality]

pool = Pool(snakemake.threads)
results = pool.imap_unordered(filter, zip_longest(reads(fqr_1), reads(fqr_2)), 10000)
countdict = dict()
countdict[1] = { i : 0 for i in snakemake.params.constraints['constraints']}
countdict[1]['not_filtered'] = 0
countdict[1]['total'] = 0

if snakemake.params.paired:
    countdict[2] = {i: 0 for i in snakemake.params.constraints['constraints']}
    countdict[2]['not_filtered'] = 0
    countdict[2]['total'] = 0

for reads, failed, origin, r in results:
    countdict[1]['total'] += 1
    if snakemake.params.paired:
        countdict[2]['total'] += 1
    if failed:
        countdict[r][origin] += 1
        filtered_out_1.write(reads[0][0], reads[0][1], reads[0][2])
        if snakemake.params.paired:
            filtered_out_2.write(reads[1][0], reads[1][1], reads[1][2])
    else:
        countdict[1]['not_filtered'] += 1
        filtered_1.write(reads[0][0], reads[0][1], reads[0][2])
        if snakemake.params.paired:
            countdict[2]['not_filtered'] += 1
            filtered_2.write(reads[1][0], reads[1][1], reads[1][2])

with open(str(snakemake.log), 'w') as file:
    json.dump(countdict, file)
pool.close()  # stop accepting new tasks
pool.join()   # wait for the worker processes to exit

filtered_1.close()
filtered_out_1.close()
if snakemake.params.paired:
    filtered_2.close()
    filtered_out_2.close()
import functools
import glob
import os.path
from shutil import move
import json
import math
import typing
from collections import namedtuple

__FASTQ_EXT = ".fastq"

try:
    # we add these imports to make the IDE happy about "snakemake.*" not existing...
    import failed_import, snakemake
except:
    pass

import logging
import multiprocessing
from tqdm import tqdm
from functools import partial
import numpy as np

pbar: tqdm = None

# TODO: use cdnarules where possible
import cdnarules
from FastDNARules import FastDNARules

rules = FastDNARules()
overall_gc_content_error_val = lambda data, low, high: [rules.overall_gc_content(data, calc_func=lambda x: 1.0 if (
        x < low or x > high) else 0.0)] * len(data)

from homopolymers import homopolymer_error_val
from kmer import kmer_counting_error_val
from undesired_subsequences import UndesiredSubSequenceFinder

allowed_bases = {"A", "C", "G", "T"}
DEBUG = False
quality_score_format_dict = {"Illumina 1.8+": 33, "PacBio": 33, "Sanger": 33,
                             "Solexa": 64, "Illumina 1.3+": 64, "Illumina 1.5+": 64}

"""
required config entries:
    - allowed min / max gc content ( <= / >= )
    - allowed max homopolymer length ( <= )
    - inplace repair: if corrupt sequences should remain in the output or be replaced with the repaired sequence  (bool)
    - repair quality-score: phred quality score to set for a changed/inserted/removed base (int)
        a deletion will change the base infront of the deletion to the quality score...
    kmer_k: kmer length (int)
    kmer_max_count: maximum number of kmer occurrences ( <= )

"""


def quality_score_to_phred_error_prob(quality_score, quality_score_format="Illumina 1.8+"):
    phred_offset = quality_score_format_dict[quality_score_format]
    quality_score = quality_score - phred_offset
    return 10 ** -(quality_score / 10)


def phred_error_prob_to_quality_score(phred_error_prob, quality_score_format="Illumina 1.8+"):
    phred_offset = quality_score_format_dict[quality_score_format]
    return -10 * np.log10(phred_error_prob) + phred_offset
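# Example: the Illumina 1.8+ quality character 'I' (ASCII 73) encodes Q40, so
#   quality_score_to_phred_error_prob(73) == 10 ** -4.0
#   phred_error_prob_to_quality_score(1e-4) == 73.0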


def read_fasta(filename: str) -> typing.Dict:
    """

    :param filename:
    :return: iterator of tuples: (name, seqeunce)
    """
    fasta_dict = {}
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith('>'):
                i = 0
                seq_name = line.strip().split()[0][1:] + str(i)
                while seq_name in fasta_dict:
                    i += 1
                    seq_name = line.strip().split()[0][1:] + str(i)
                fasta_dict[seq_name] = ''
            else:
                fasta_dict[seq_name] += line.strip()
    return fasta_dict


def read_fastq(filename: str) -> typing.Iterator:
    """

    :param filename:
    :return: iterator of tuples: (name, sequence, phread quality score)
    """
    fastq_list = []
    last_line = -1
    with open(filename, 'r') as f:
        for j, line in enumerate(f):
            if line.startswith('@'):
                if last_line == 3:
                    fastq_list.append((seq_name, dna_seq, comment_line, phred_score))
                seq_name = line.strip()[1:]
                last_line = 0
            elif j % 4 == 1 and last_line == 0:
                dna_seq = line.strip()
                last_line = 1
            elif line.startswith('+') and last_line == 1:
                comment_line = line.strip()[1:]
                last_line = 2
            elif j % 4 == 3 and last_line == 2:
                phred_score = line.strip()
                last_line = 3
        if last_line == 3:
            fastq_list.append((seq_name, dna_seq, comment_line, phred_score))
    return iter(fastq_list)


def write_fasta(filename: str, data):
    """
    : typing.Union[
    typing.List[typing.Tuple[str, str]], typing.Iterable[typing.NamedTuple[typing.Union[str, bytes], str]]]
    :param filename:
    :param data: data in the format: List(Tuple(name, sequence))
    :return:
    """
    with open(filename, "w") as fp:
        for elem in data[:-1]:
            fp.write(f">{elem[0]}\n{elem[1]}\n")
        fp.write(f">{data[-1][0]}\n{data[-1][1]}")


def write_fastq(filename: str, data: typing.Union[
    typing.List[typing.Tuple[str, str, str]], typing.Tuple[typing.Union[str, bytes], str, str]]):
    """
    :param filename:
    :param data: records in dinopy order: (sequence, name, quality)
    :return:
    """
    with open(filename, "w") as fp:
        # FASTQ records start with "@" and carry a "+" separator line
        for sequence, name, quality in data[:-1]:
            fp.write(f"@{name}\n{sequence}\n+\n{quality}\n")
        sequence, name, quality = data[-1]
        fp.write(f"@{name}\n{sequence}\n+\n{quality}")


try:
    logging.basicConfig(filename=str(snakemake.log),
                        level=logging.DEBUG)
    inplace_repair = snakemake.params.inplace_repair
    repair_quality_score = snakemake.params.repair_quality_score
    allowed_min_gc = snakemake.params.min_gc_content
    allowed_max_gc = snakemake.params.max_gc_content
    allowed_max_homopolymer_length = snakemake.params.max_homopolymer_length
    kmer_active = snakemake.params.kmer_active
    kmer_k = snakemake.params.kmer_k
    kmer_max_count = snakemake.params.kmer_max_count
    cores = snakemake.threads
    undesired_subsequences_file = snakemake.params.subsequences_file
    length = snakemake.params.sequence_length
    MAX_ITERATIONS = snakemake.params.maximum_repair_cycles
    USE_QUALITY_MAPPING = snakemake.params.use_quality_mapping
    snakemake_input_file_zero = snakemake.input[0]
except NameError as ne:
    logging.basicConfig(level=logging.DEBUG)
    logging.warning("No snakemake detected. Using default values for parameters.")
    inplace_repair = False
    repair_quality_score = 33
    allowed_min_gc = 0.4
    allowed_max_gc = 0.7
    allowed_max_homopolymer_length = 3
    kmer_active = True
    kmer_k = 10
    kmer_max_count = 20
    cores = 8
    length = 160
    undesired_subsequences_file = "undesired_subsequences.txt"
    MAX_ITERATIONS = 50
    USE_QUALITY_MAPPING = True
    snakemake_input_file_zero = "../../results/assembly/MOSLA2_A/MOSLA2_A_cluster.fasta"

logging.info(
    f"Repairing file: {snakemake_input_file_zero} {'using in-place repair' if inplace_repair else 'adding repaired sequences.'}")
undesired_subsequences_finder = UndesiredSubSequenceFinder(undesired_subsequences_file)

# create quality mapping dict to speedup potential dereplication and repair cycles
quality_mapping_file = snakemake_input_file_zero.replace("_assembled.fastq", "derep.fastq")
quality_mapping_dict = dict()

if USE_QUALITY_MAPPING:
    if not os.path.exists(quality_mapping_file):
        # we do NOT have a dereplicated fastq file, thus we have to extract this info from the assembly file
        quality_mapping_file = snakemake_input_file_zero
        quality_mapping_reads = read_fastq(quality_mapping_file)
        for name, sequence, comment, quality in quality_mapping_reads:
            if sequence not in quality_mapping_dict.keys() \
                    or min(quality_mapping_dict[sequence]) < min(quality) \
                    or np.average(
                [int(quality_score_to_phred_error_prob(ord(x))) for x in quality_mapping_dict[sequence]]) \
                    < np.average([int(quality_score_to_phred_error_prob(ord(x))) for x in quality]):
                quality_mapping_dict[sequence] = quality
    else:
        quality_mapping_reads = read_fastq(quality_mapping_file)
        # file is already dereplicated
        for name, sequence, comment, quality in quality_mapping_reads:
            quality_mapping_dict[sequence] = quality


def repair_single_cluster(single_cluster_data, desired_length=160):
    global pbar
    pbar.update(1)
    centroid, cluster = single_cluster_data
    all_violations = calc_errors(centroid)
    res = [sum(x) for x in zip(*all_violations)]
    if sum(res) == 0 and len(res) == desired_length:
        # there was no error in the centroid. OPTIONAL: mark as correct
        return "C", centroid  # Centroid was Correct
        # continue
    else:
        # there was an error in the centroid: iterate over the cluster
        #   and find the sequence that fulfills all constraints and is closest to the centroid
        found = False
        for seq in cluster:
            seq_violations = calc_errors(seq)
            res = [sum(x) for x in zip(*seq_violations)]
            if sum(res) == 0 and len(res) == desired_length:
                return "S_C", seq  # Centroid was substituted - new centroid is correct - no repair was needed!
            # else:
            # the sequence does not fulfill all constraints
            # continue
        if not found:
            # no sequence in the cluster fulfills all constraints
            #   -> repair the centroid
            possible_results = []
            pair_with_quality = namedtuple("cluster_repair", ["name", "seq", "quality"])
            for i, elem in enumerate([centroid] + cluster):
                q_score = quality_mapping_dict.get(elem.upper(), repair_quality_score)
                repair_res = try_repair(pair_with_quality(f"cluster_repair_{i}", elem, q_score), desired_length)
                repair_violations = calc_errors(repair_res[1])
                res = [sum(x) for x in zip(*repair_violations)]
                if sum(res) == 0 and len(res) == desired_length:
                    # run repair for all elements in the cluster
                    # and take the repaired sequence with the LEAST repairs needed!
                    # centroid was substituted - substituted sequence had to be repaired
                    possible_results.append((repair_res[4], (f"S_R_C_{repair_res[4]}", repair_res[1])))
                else:
                    continue
            if len(possible_results) > 0:
                return sorted(possible_results, key=lambda x: x[0])[0][1]
        return "F", centroid


def repair_clusters(desired_length):
    """
    clusters: list of Clusters, each cluster has a centroid and a list of sequences
    (with a distance-value to the centroid)
    IMPORTANT: clusters are sorted by distance to centroid!
    desired_length: int, desired length of the sequences in the clusters
    :returns: list of centroids with a quality score indicating the performed repairs together with the initial quality
    """
    global pbar
    try:
        input_file = snakemake.input.cent  # centroid file...
    except NameError:
        logging.info("Running in non-snakemake mode - this should be used for testing only")
        input_file = snakemake_input_file_zero
    logging.info(f"Parsing cluster for input file: {input_file}")

    clusters = []

    # get folder for a file string:
    input_folder = os.path.dirname(input_file)
    for cluster_file in glob.glob(f"{input_folder}/clust*"):
        cluster = [seq for name, seq in read_fasta(cluster_file).items()]
        clusters.append((cluster[0], cluster[1:]))
    clusters = sorted(clusters, key=lambda x: len(x[1]), reverse=True)
    output_file = input_file.replace("cluster.fasta", "repaired.fasta")
    output_cluster_mapping_file = input_file.replace("cluster.fasta", "repaired_clusters.json")
    # run different repairs based on the input-file:
    # if we have a list of clusters(+centroids) we want to run a special repair for each cluster:
    # - if the centroid does fulfill all constraints: return centroid as correct
    # - if the centroid does not fulfill all constraints:
    #     - search in the cluster for the sequence that is closest to the centroid and fulfills all constraints
    #     - if no such sequence exists:
    #         - first: repair only the centroid
    #         - if this fails after CHANGE_LIMIT changes:
    #           - repair _ALL_ sequences in the cluster and choose the one with the least changes
    #             _AND_ the shortest distance to the centroid
    # res_centroids = []
    pbar = tqdm(total=len(clusters))
    p = multiprocessing.Pool(cores)
    res_centroids = [x for x in
                     p.imap_unordered(partial(repair_single_cluster, desired_length=desired_length), iterable=clusters,
                                      chunksize=math.ceil(len(clusters) / (cores * 10)))]
    if not inplace_repair:
        # read all entries of input_file (containing all initial centroids)
        # initial_centroids = dinopy.FastqReader(input_file)
        res_seqs = set([b for a, b in res_centroids])
        # add original centroid to
        print(f"{input_file}")
        for _name, _sequence in read_fasta(input_file).items():
            if _sequence not in res_seqs:
                res_centroids.append((f"{_name.replace('_', '-')}_O_F", _sequence))

    if os.path.exists(output_file):
        renamed_file = output_file.replace(__FASTQ_EXT, "old_.fastq").replace(".fasta", "_old.fasta")
        logging.warning(f"[WARNING] File already exists, renaming old file to: {renamed_file}")
        move(output_file, renamed_file)
    write_fasta(output_file, [(str(i) + "_" + a, b) for i, (a, b) in
                              enumerate(sorted(res_centroids, key=lambda tpl: sort_results(tpl[0])))])
    if os.path.exists(output_cluster_mapping_file):
        renamed_file = output_cluster_mapping_file.replace(".json", "_old.json")
        logging.warning(f"[WARNING] File already exists, renaming old file to: {renamed_file}")
        move(output_cluster_mapping_file, renamed_file)
    with open(output_cluster_mapping_file, "w") as cluster_output:
        json.dump([x for x in zip([b for a, b in res_centroids], clusters)], fp=cluster_output)
    return res_centroids


def sort_results(in_str):
    if in_str is None:
        return MAX_ITERATIONS + 5000
    try:
        # ensure we have str and not bytes
        in_str = in_str.decode()
    except:
        pass
    if ":" in in_str:
        in_str = in_str.split(":")[-1]
        in_str = in_str[in_str.find("_") + 1:]
    if in_str == "C":  # centroid was correct
        return -1000
    elif in_str == "S_C":  # centroid was substituted from cluster, new centroid correct
        return -500
    elif in_str.startswith("S_R_C_"):  # centroid was (potentially) substituted; new centroid was repaired (X repairs)
        return int(in_str.split("_")[-1])
    elif in_str == "F":  # repair failed (repair limit reached for all elements in the cluster!)
        return MAX_ITERATIONS + 1000
    elif in_str.startswith("O_F"):  # in case of INPLACE_REPAIR = False, we want to flag invalid original seqs as failed
        return MAX_ITERATIONS + 1001
    else:
        logging.error(f"sorting found not handled case: {in_str}")
        return MAX_ITERATIONS


def allmax(a, return_index=True):
    if len(a) == 0:
        return []
    all_ = [0]
    max_ = a[0]
    all_max_ = [a[0]]
    for i in range(1, len(a)):
        if a[i] > max_:
            all_ = [i]
            max_ = a[i]
            all_max_ = [a[i]]
        elif a[i] == max_:
            all_.append(i)
            all_max_.append(a[i])
    if return_index:
        return all_
    else:
        return all_max_
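# Example: allmax([1, 3, 3, 2]) -> [1, 2] (indices of the maxima);
# with return_index=False it returns the values instead: [3, 3].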


def calc_errors(seq):
    """
    Calculates the error-values for each constraint.
    :param seq: sequence to test
    :return: list of lists of error-values for each constraint:
        [undesired_subsequences, overall_gc_content, homopolymer_error, kmer_counting, list of 0 for minus optimization]
    """
    results = [undesired_subsequences_finder.undesired_subsequences_val(seq),
               overall_gc_content_error_val(seq, float(allowed_min_gc), float(allowed_max_gc)),
               homopolymer_error_val(seq, int(allowed_max_homopolymer_length), True),
               kmer_counting_error_val(seq, int(kmer_k), int(kmer_max_count), kmer_active),
               [0.0] * len(seq)]
    return results
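# Callers aggregate the per-constraint lists position-wise, e.g.:
#   res = [sum(x) for x in zip(*calc_errors(seq))]  # one combined error value per base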


def try_repair(seq_instance: typing.NamedTuple, desired_length, update_pbar=False):
    if update_pbar:
        pbar.update(1)
    if "quality" not in seq_instance or seq_instance[2] is None:
        # we might be in a FASTA-File and thus have no access to the quality scores
        phred_score = [repair_quality_score] * len(seq_instance[1])
    else:
        phred_score = seq_instance[2]

    if not isinstance(seq_instance[1], str):
        seq = seq_instance[1].decode("utf-8")
    else:
        seq = seq_instance[1]
    org_seq = seq
    iterations = 0
    no_changes = 0
    results = calc_errors(seq)
    res = [sum(x) for x in zip(*results)]
    assert len(phred_score) == len(seq)
    check_and_modified_positions = set()
    while len(seq) < desired_length and no_changes < MAX_ITERATIONS:
        # DELETION ERROR:
        # this is a bit more complicated: we cant simply pick the position with the highest error-value
        # because errors might appear _after_ a possible deletion
        # however we _can_ be sure that the deletion should be _before_

        # find the positions with the highest error-value
        max_error_pos = allmax(res)
        if len(max_error_pos) == 0 or res[max_error_pos[0]] == 0:
            max_error_pos = [len(seq) - 1]  # we want to append at the end of the sequence...
        # from this set search for the positions with the lowest phred score
        max_arg = [max_error_pos[x] for x in allmax([-phred_score[y] for y in max_error_pos])]
        # filter out all positions that were already checked
        max_arg = [x for x in max_arg if x not in check_and_modified_positions]
        if len(max_arg) == 0:
            break
        # choose the middle position from the remaining positions
        max_arg = max_arg[int(len(max_arg) / 2)]

        possible_bases = allowed_bases - {seq[max_arg]}
        min_seq = seq
        min_phred_score = phred_score
        min_sum_res = sum(res)
        min_results = results
        min_res = res
        insert = 0
        for base in possible_bases:
            seq = seq[:max_arg] + base + seq[max_arg + insert:]  # for the first run we insert instead of replacing..
            phred_score = phred_score[:max_arg] + [repair_quality_score] + phred_score[max_arg + insert:]
            insert = 1
            results = calc_errors(seq)
            res = [sum(x) for x in zip(*results)]
            # if the error value does not change, we should take the base that would more likely repair in the long run:
            #   -> we should select the base that moves the GC content closer to the allowed range
            if sum(res) < min_sum_res or len(check_and_modified_positions) == len(seq):
                min_seq = seq
                min_phred_score = phred_score
                min_sum_res = sum(res)
                min_results = results
                min_res = res
            if sum(res) == 0:
                break
        check_and_modified_positions.add(max_arg)
        if seq != min_seq:
            no_changes = 0
            seq = min_seq
            phred_score = min_phred_score
            results = min_results
            res = min_res
            # we need to shift all already checked positions that are bigger than the current base by one
            tmp = set()
            for pos in check_and_modified_positions:
                if pos <= max_arg:
                    tmp.add(pos)
                else:
                    tmp.add(pos + 1)
            check_and_modified_positions = tmp
            if DEBUG: print(f"[I] {seq}")
        iterations += 1
    # check if length of seq is too long & homopolymer exists
    while len(seq) > desired_length:
        # INSERTION ERROR:
        # extract cumulative error position for all homopolymer locations
        possible_insertions = [res[i] if results[2][i] > 0.0 else -1.0 for i in range(len(res))]
        if np.max(possible_insertions) == -1:
            # if the gc-content is off:
            # choose the _first_ subsequence of correct length that has a correct gc-content
            # OR: use a sliding window to find the best subsequence:
            # first or last window should be choosen (either there was an offset at the beginning or
            # there were additional bases at the end)

            seq = seq[:-1]
            phred_score = phred_score[:-1]
        else:
            # since we are only removing homopolymers we can take ANY position with the highest error
            seq = seq[:np.argmax(possible_insertions)] + seq[np.argmax(possible_insertions) + 1:]
            phred_score = phred_score[:np.argmax(possible_insertions)] + phred_score[
                                                                         np.argmax(possible_insertions) + 1:]
        if DEBUG: print(f"[D] {seq}")
        no_changes += 1  # easy since we always remove one base
        # re-calculate error values:
        results = calc_errors(seq)
        res = np.array([sum(x) for x in zip(*results)], dtype=np.float64)
        assert len(phred_score) == len(seq)
    # if no homopolymer exists but sequence is too long, try heuristic repair but tag as low quality

    while sum([sum(x) for x in zip(*results[:-1])]) > 0 and iterations < MAX_ITERATIONS:
        # MUTATION ERROR:
        max_arg = allmax([-phred_score[x] for x in allmax(res)])
        max_arg = max_arg[int(len(max_arg) / 2)]
        possible_bases = allowed_bases - {seq[max_arg]}
        min_seq = seq
        min_sum_res = sum(res)
        for base in possible_bases:
            seq = seq[:max_arg] + base + seq[max_arg + 1:]
            phred_score = phred_score[:max_arg] + [repair_quality_score] + phred_score[max_arg + 1:]
            results = calc_errors(seq)
            res = [sum(x) for x in zip(*results[:-1])]
            if sum(res) < min_sum_res:
                min_seq = seq
                min_sum_res = sum(res)
            if sum(res) == 0:
                break
        results[4][max_arg] = -min_sum_res  # we do not want to touch the same base again...
        res = [sum(x) for x in zip(*results)]
        no_changes += 1
        seq = min_seq
        if DEBUG: print(f"[M] {seq}")
        iterations += 1
    return org_seq, seq, phred_score, seq_instance[0], no_changes


def main(desired_length=160):
    global pbar
    repaired_tuples = []
    phred_error_prob_to_quality_score(quality_score_to_phred_error_prob(30))
    # Read in the fasta file _AFTER_ clustering
    try:
        input_file = snakemake.input[0]
    except NameError:
        logging.info("Running in non-snakemake mode - this should be used for testing only")
        input_file = snakemake_input_file_zero
    logging.info(f"Input files: {input_file}")

    if input_file.endswith(__FASTQ_EXT):
        seqs: typing.List[typing.List] = [[name, sequence, quality] for name, sequence, comment, quality in
                                          read_fastq(input_file)]
    else:
        logging.info("Fasta-File - No phred score given. Using default value.")
        seqs: typing.List[typing.List] = [[name, sequence, repair_quality_score] for name, sequence in
                                          read_fasta(input_file).items()]
    pbar = tqdm(total=len(seqs))
    p = multiprocessing.Pool(cores)
    a = [x for x in p.imap_unordered(partial(try_repair, desired_length=desired_length, update_pbar=True),
                                     iterable=seqs,
                                     chunksize=math.ceil(len(seqs) / (cores * 10)))]
    out_data = []
    if not inplace_repair:
        for seq in seqs:
            seq_name = (seq[0].decode() if isinstance(seq[0], bytes) else seq[0]) + "_org"
            if len(seq) < 3 or seq[2] is None or isinstance(seq[2], int):
                # we might be in a FASTA-File and thus have no access to the quality scores
                phred_score = [int(repair_quality_score)] * len(seq[1])
            else:
                phred_score = [ord(x) for x in seq[2]]
            out_data.append((seq[1], seq_name.encode(), [chr(int(x)) for x in phred_score]))
    for elem in sorted(a, key=lambda x: x[-1]):
        # store result as a fastq file to reflect
        bytes_quality_values = bytes(
            [int(phred_error_prob_to_quality_score(quality_score_to_phred_error_prob(x))) for x in elem[2]])
        seq_name = (elem[3].decode() if isinstance(elem[3], bytes) else elem[3]) + "_" + str(elem[4])
        # only add if the sequence was repaired
        if elem[4] > 0:
            repaired_tuples.append([elem[0], elem[1]])
        try:
            res = elem[1].decode("utf-8")
        except AttributeError:  # support for non-snakemake runs
            res = elem[1]
        out_data.append((res.encode("utf-8"), seq_name.encode(), bytes_quality_values))
    out_path = input_file.replace(__FASTQ_EXT, "_constraint_repaired.fastq").replace(".fasta",
                                                                                     "_constraint_repaired.fastq")
    if os.path.exists(out_path):
        renamed_file = out_path.replace(__FASTQ_EXT, "_old.fastq").replace(".fasta", "_old.fasta")
        logging.warning(f"[WARNING] File already exists, renaming old file to: {renamed_file}")
        move(out_path, renamed_file)
    write_fastq(out_path, out_data)
    logging.info(f"Saving result: {out_path}")

    out_path = input_file.replace(__FASTQ_EXT, "_constraint_repaired_mapping.json").replace(".fasta",
                                                                                            "_constraint_repaired_mapping.json")
    if os.path.exists(out_path):
        renamed_file = out_path.replace(".json", "_old.json")
        logging.warning(f"[WARNING] File already exists, renaming old file to: {renamed_file}")
        move(out_path, renamed_file)
    with open(out_path, "w") as f:
        json.dump(repaired_tuples, f)
    logging.info(f"Saving result-mapping for alternative centroid selection during decoding: {out_path}")


repair_clusters(length)
import os
#import sys
from tqdm import tqdm
#import dinopy
import subprocess

print(snakemake.params.blast_db)
blast_db_targets = snakemake.params.blast_db[str(snakemake.params.sample).split('-')[-1]]
os.makedirs(snakemake.params.spades_dir, exist_ok=True)

print(f"Processing sample")

sample_in_path = snakemake.input.derep

print("Filtering biological and non-biological reads")

sample_blast_hits_path = snakemake.output.blast_hits
sample_data_hits_path = snakemake.output.data_hits

blast_process = subprocess.run(["blastn", "-db", blast_db_targets, "-query", sample_in_path,
                                "-outfmt", "7 sseqid", "-out", sample_blast_hits_path])
if blast_process.returncode != 0:
    print("Something went wrong with blast.")
    exit(1)

data_hits = set()
with tqdm(total=os.path.getsize(sample_blast_hits_path)) as progress:
    with open(sample_blast_hits_path) as file:
        current_query = ""
        for line in file:
            if line.startswith("# "):
                information = line.split(" ")
                if information[1] == "Query:":
                    current_query = information[2].rstrip()
                elif information[1] in ["BLASTN", "Database:","Fields:","BLAST"]:
                    pass
                else:
                    # a line like "# 0 hits found" marks a query with no database hits
                    if int(information[1]) == 0:
                        data_hits.add(current_query)
            progress.update(len(line.encode("utf-8")))

'''
far = dinopy.FastaReader(sample_in_path)
sample_data_hits_path = snakemake.output.data_hits
faw = dinopy.FastaWriter(sample_data_hits_path)
faw.open()

with tqdm(total=os.path.getsize(sample_in_path)) as progress:
    for seq, name in far.reads(read_names=True):
        if name.decode() in data_hits:
            faw.write_entry((seq, name))
        progress.update(len(seq)+len(name))

faw.close()
'''
with tqdm(total=os.path.getsize(sample_in_path)) as progress:
    with open(sample_in_path, "r") as inp, open(sample_data_hits_path, "w") as outp:
        while True:
            name = inp.readline().rstrip()
            seq = inp.readline().rstrip()
            if not seq:
                break
            assert name[0] == ">"
            if name[1:] in data_hits:
                outp.write(name + "\n")
                outp.write(seq + "\n")
            progress.update((len(seq.encode("utf-8")) + len(name.encode("utf-8"))))


print("Assembling data reads.")
spades_out_folder = snakemake.params.spades_dir
spades_process = subprocess.run(["spades.py", "-o", spades_out_folder,"-s", sample_data_hits_path, "--only-assembler",
                                 "-k", "21,33,55,77,99"])
if spades_process.returncode != 0:
    print("Something went wrong with spades.")
    exit(1)
#
#print("Extracting largest sequence.")
#sample_contigs_path = snakemake.output.contigs
#data_assembly_path = snakemake.output.assembly
#largest_contig = []
#with open(sample_contigs_path, 'r') as contigs:
#    first = True
#    for line in contigs:
#        if line.startswith('>'):
#            if first:
#                largest_contig.append(line)
#                first = False
#            else:
#                break
#        else:
#            largest_contig.append(line)
#with open(data_assembly_path, 'w') as assembly_outfile:
#    assembly_outfile.writelines(largest_contig)

print("Done.")
import pandas as pd

# Script to define the primer used by pandaseq. The defined
# primer depends on the existence of the barcode, on whether the
# reads are single- or paired-end, and on whether pandaseq should
# use an offset of the primer length instead of the sequence.
# Using an offset can be useful if the sequence has many uncalled
# bases in the primer region, preventing a nucleotide primer from
# matching.
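# A hypothetical primertable row (columns as referenced below):
#   Probe,poly_N,Barcode_forward,specific_forward_primer,poly_N_rev,Barcode_reverse,specific_reverse_primer
#   sample_A,NNNN,ACGTACGT,GTGYCAGCMGCCGCGGTAA,NNNN,TGCATGCA,GGACTACNVGGGTWTCTAAT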

primertable = pd.read_csv(str(snakemake.input), index_col="Probe")

if snakemake.params.all_removed:
    pass
else:
    if snakemake.params.paired_end:
        if snakemake.params.offset:
            if snakemake.params.bar_removed:
                primertable["f_primer"] = (
                    primertable[["poly_N",
                    "specific_forward_primer"]].sum(axis=1).str.len())
                primertable["r_primer"] = (
                    primertable[["poly_N_rev",
                    "specific_reverse_primer"]].sum(axis=1).str.len())
            else:
                primertable["f_primer"] = (
                    primertable[["poly_N", "Barcode_forward",
                    "specific_forward_primer"]].sum(axis=1).str.len())
                primertable["r_primer"] = (
                    primertable[["poly_N_rev","Barcode_reverse",
                    "specific_reverse_primer"]].sum(axis=1).str.len())
        else:
            if snakemake.params.bar_removed:
                primertable["f_primer"] = (
                    primertable["specific_forward_primer"])
                primertable["r_primer"] = (
                    primertable["specific_reverse_primer"])
            else:
                primertable["f_primer"] = (
                    primertable[["Barcode_forward",
                    "specific_forward_primer"]].sum(axis=1))
                primertable["r_primer"] = (
                    primertable[["Barcode_reverse",
                    "specific_reverse_primer"]].sum(axis=1))
    else:
        if snakemake.params.offset:
            if snakemake.params.bar_removed:
                primertable["f_primer"] = (
                    primertable[["poly_N",
                    "specific_forward_primer"]].sum(axis=1).str.len())
            else:
                primertable["f_primer"] = (
                    primertable[["poly_N", "Barcode_forward",
                    "specific_forward_primer"]].sum(axis=1).str.len())
        else:
            if snakemake.params.bar_removed:
                primertable["f_primer"] = (
                    primertable["specific_forward_primer"])
            else:
                primertable["f_primer"] = (
                    primertable[["Barcode_forward",
                    "specific_forward_primer"]].sum(axis=1))

primertable.to_csv(str(snakemake.output))
import pandas as pd
import numpy as np
import dinopy
import os
import re
import yaml
import glob
import sys
import shutil
import subprocess
import pathlib
# This script is used for moving already assembled fastq files to the
# correct folder for further processing, for demultiplexing samples that
# were pooled in-lab (instead of at the sequencing company), and for
# manually sorting reads depending on their primer sequence.

p_table = pd.read_csv(snakemake.params.primertable, index_col='Probe')
primertable = p_table.to_dict('index')
data_folder = str(snakemake.params.filename)
file_path_list = sorted(glob.glob(data_folder + "/*.fast*"))

# Regex substitution dict.
iupac_dict_regex = {'M':'[AC]', 'R':'[AG]', 'W':'[AT]', 'S':'[CG]', 'Y':'[CT]',
                    'K':'[GT]', 'V':'[ACG]', 'H':'[ACT]', 'D':'[AGT]',
                    'B':'[CGT]', 'X':'[ACGT]', 'N':'[ACGT]'}


# Substitution helper function.
def iupac_replace(sequence, iupac_dict):
    for i, j in iupac_dict.items():
        sequence = sequence.replace(i, j)
    return sequence
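# Example: iupac_replace('GWC', iupac_dict_regex) -> 'G[AT]C'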


def define_direction_demulti(polyN, prim, barcode):
    def check_for_match_demulti(sequence, sample):
        poly_prim_bar = [primertable[sample][key] for key
                        in primertable[sample].keys() if key
                        in [polyN, prim, barcode]]
        prim_bar = re.compile(poly_prim_bar[1] + iupac_replace(poly_prim_bar[2],
                                iupac_dict_regex))
        for i in [0, 1, -1, 2, -2]:
            start = np.clip(len(primertable[sample][polyN]) + i, a_min=0,
                                a_max=None)
            end = np.clip(len(''.join(poly_prim_bar)) + i, a_min=0,
                                a_max=None)
            if prim_bar.match(sequence[start : end]):
                return True
        else:
            # for-else: runs only if no shifted window matched
            return False
    return check_for_match_demulti


# These are the variations of the check_for_match closure for the
# forward and reverse primer, the arguments are the column indices
# of the corresponding polyN col, the col after the primer
# (as the slice beginning is inclusive, end is exclusive) and the
# col index of the barcode.
check_for_match_fwd_demulti = define_direction_demulti('poly_N', 'specific_forward_primer',
                                        'Barcode_forward')
check_for_match_rev_demulti = define_direction_demulti('poly_N_rev', 'specific_reverse_primer',
                                       'Barcode_reverse')


def define_direction_sort(prim):
    def check_for_match_sort(sequence, sample):
        prim_regex = re.compile(iupac_replace(primertable[sample][prim],
                                              iupac_dict_regex))
        if prim_regex.match(sequence[:len(primertable[sample][prim])]):
            return True
        else:
            return False
    return check_for_match_sort


check_for_match_sort_fwd = define_direction_sort('specific_forward_primer')
check_for_match_sort_rev = define_direction_sort('specific_reverse_primer')


# Create a dict of Dinopy writer instances and write the sequences
# according to their barcode and primer sequence in the corresponding
# files defined in the primertable.
def demultiplexer(file_path_list):
    samples = []
    output_filepaths = []
    for sample in primertable.keys():
        samples.append(sample + '_R1')
        samples.append(sample + '_R2')
        output_filepaths.append('demultiplexed/' + sample + '_R1.fastq.gz')
        output_filepaths.append('demultiplexed/' + sample + '_R2.fastq.gz')

    # Create a dict of writers.
    writers = {name: dinopy.FastqWriter(path) for name, path in
               zip(samples, output_filepaths)}

    # Open all writers.
    for writer in writers.values():
        writer.open()

    # Start writing.
    for file_path in file_path_list:
        sequence = dinopy.FastqReader(file_path)
        for read in sequence.reads(quality_values=True):
            for sample in primertable.keys():
                if check_for_match_fwd_demulti(read.sequence.decode(), sample):
                    writers[sample + '_R1'].write(read.sequence, read.name,
                            read.quality)
                elif check_for_match_rev_demulti(read.sequence.decode(), sample):
                    writers[sample + '_R2'].write(read.sequence, read.name,
                            read.quality)
                else:
                    pass

    # Close all writers.
    for writer in writers.values():
        writer.close()

def read_sorter(primertable):
    if not os.path.exists('demultiplexed/not_sorted'):
        os.mkdir('demultiplexed/not_sorted')
    samples = []
    output_filepaths = []
    for sample in primertable.keys():
        samples.append(sample + snakemake.params.name_ext[:-1] + '1')
        samples.append(sample + snakemake.params.name_ext[:-1] + '2')
        samples.append(sample + '_not_sorted')
        output_filepaths.append('demultiplexed/' + sample + '_R1.fastq.gz')
        output_filepaths.append('demultiplexed/' + sample + '_R2.fastq.gz')
        output_filepaths.append('demultiplexed/not_sorted/' + sample + '_not_sorted.fastq.gz')

    # Create a dict of writers.
    writers = {name: dinopy.FastqWriter(path) for name, path in
               zip(samples, output_filepaths)}

    # Open all writers.
    for writer in writers.values():
        writer.open()

    # Start writing.
    for sample in primertable.keys():
        fwd = dinopy.FastqReader('../' + data_folder + '/' + sample + str(snakemake.params.name_ext)[:-1] + '1.fastq.gz')
        rev = dinopy.FastqReader('../' + data_folder + '/' + sample + str(snakemake.params.name_ext)[:-1] + '2.fastq.gz')
        for read_f, read_r in zip(fwd.reads(quality_values=True), rev.reads(quality_values=True)):
            if check_for_match_sort_fwd(read_f.sequence.decode(),
                sample.split('/')[-1]) and check_for_match_sort_rev(read_r.sequence.decode(),
                    sample.split('/')[-1]):
                writers[sample + '_R1'].write(read_f.sequence, read_f.name,
                                read_f.quality)
                writers[sample + '_R2'].write(read_r.sequence, read_r.name,
                                read_r.quality)
            elif check_for_match_sort_rev(read_f.sequence.decode(),
                sample.split('/')[-1]) and check_for_match_sort_fwd(read_r.sequence.decode(),
                    sample.split('/')[-1]):
                writers[sample + '_R2'].write(read_f.sequence, read_f.name,
                                read_f.quality)
                writers[sample + '_R1'].write(read_r.sequence, read_r.name,
                                read_r.quality)
            else:
                writers[sample + '_not_sorted'].write(read_f.sequence, read_f.name,
                                read_f.quality)
                writers[sample + '_not_sorted'].write(read_r.sequence, read_r.name,
                                read_r.quality)

    # Close all writers.
    for writer in writers.values():
        writer.close()


def already_assembled(primertable, file_path_list):
    for f_ in file_path_list:
        if '.gz' in f_:
            subprocess.run(['gunzip', f_])
            f_ = f_.split('.gz')[0]
        for sample in primertable.keys():
            pathlib.Path('../results/assembly/' + sample).mkdir(parents=True, exist_ok=True)
            if sample in f_:
                shutil.copy(f_, '../results/assembly/' + sample + '/' + sample + '_assembled.fastq')
                shutil.copy(f_, '../demultiplexed/')


# Run the demultiplexing / read sorting script.
if snakemake.params.demultiplexing:
    print('Demultiplexing reads')
    demultiplexer(file_path_list)
elif snakemake.params.read_sorting:
    read_sorter(primertable)
    print('Finished read sorting')
elif snakemake.params.assembled:
    already_assembled(primertable, file_path_list)
    print('Copied already assembled files')
else:
    print('No demultiplexing or sorting needed, copying files')
    # If the files do not need demultiplexing / sorting, just copy them to
    # the demultiplexed folder. Leave original files in input folder
    for file in file_path_list:
        print(file)
        shutil.copy(file, 'demultiplexed/')
from Bio import SeqIO as sio
from Bio.SeqRecord import SeqRecord


def derep(inp):
    seqs = {}
    for record in sio.parse(inp, "fastq"):
        if record.seq not in seqs.keys():
            seqs[record.seq] = {}
            seqs[record.seq]["quality"] = record.letter_annotations["phred_quality"]
            seqs[record.seq]["size"] = 1
        else:
            new_phred = sum(record.letter_annotations["phred_quality"])/len(record.letter_annotations["phred_quality"])
            old_phred = sum(seqs[record.seq]["quality"])/len(seqs[record.seq]["quality"])
            if new_phred > old_phred:
                seqs[record.seq]["quality"] = record.letter_annotations["phred_quality"]
            seqs[record.seq]["size"] += 1
    return seqs
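# Illustrative state after dereplicating five identical reads (hypothetical values):
#   seqs[record.seq] == {"quality": [30, 31, 32, 33], "size": 5}
# where the stored quality list is the one with the highest mean phred score.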


def get_records(seq_entries):
    for seq, vals in seq_entries.items():
        if vals["size"] >= snakemake.params.minsize:
            yield SeqRecord(seq, id='0', description=';size={};'.format(vals["size"]),
                            letter_annotations={"phred_quality": vals["quality"]})


derep_seqs = derep(str(snakemake.input))
with open(str(snakemake.output), "w") as outp:
    sio.write(get_records(derep_seqs), outp, "fastq")
from subprocess import call

# If the input is paired-end, the output name still carries the read
# identifier (e.g. _R1), so the string slicing removes the last eight
# symbols (identifier plus ".fastq"); for single-end reads only the
# last six symbols (the extension) are removed.

if len(snakemake.input) == 2:
    output_edit = str(snakemake.output[0])[:-8]
    output_bad = str(snakemake.output[0])[:-8] + "_low_qual"
else:
    output_edit = str(snakemake.output[0])[:-6]
    output_bad = str(snakemake.output[0])[:-6] + "_low_qual"

call_list = [
    "prinseq-lite.pl",
    "-verbose",
    "-fastq", snakemake.input[0],
    "-ns_max_n", "0",
    "-min_qual_mean", str(snakemake.params.mq),
    "-out_good", output_edit,
    "-out_bad", output_bad,
    "-log", str(snakemake.log)
]
if len(snakemake.input) == 2:
    call_list.extend(["-fastq2", snakemake.input[1]])

if snakemake.params.trim_to_length > 0:
    call_list.extend([
        "-trim_to_len", str(snakemake.params.trim_to_length)
    ])

call(call_list)
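# With hypothetical paired-end inputs and parameters, the resulting call resembles:
#   prinseq-lite.pl -verbose -fastq sample_R1.fastq -ns_max_n 0 -min_qual_mean 25 \
#       -out_good sample -out_bad sample_low_qual -log prinseq.log \
#       -fastq2 sample_R2.fastq -trim_to_len 160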

URL: https://github.com/umr-ds/RepairNatrix
License: MIT License