Reproducible workflow for SARS-CoV-2 recombinant sequence detection.

Version: v0.7.0

❗❗❗
Note: ncov-recombinant will be deprecated soon, as SARS-CoV-2 recombination has evolved beyond the scope of this pipeline's design. The new tool will be rebar, which is under active development.
❗❗❗

  1. Align sequences and perform clade/lineage assignments with Nextclade.

  2. Identify parental clades and plot recombination breakpoints with sc2rf.

  3. Create tables, plots, and PowerPoint slides for reporting.

Please refer to the documentation for detailed instructions on installing, running, developing, and much more!

Credits

ncov-recombinant is built and maintained by Katherine Eaton at the National Microbiology Laboratory (NML) of the Public Health Agency of Canada (PHAC).


Katherine Eaton

💻 📖 🎨 🤔 🚇 🚧

Thanks goes to these wonderful people (emoji key):


Nextstrain (Nextclade)

🔣 🔌

Lena Schimmel (sc2rf)

🔌

Yatish Turakhia (UShER)

🔣 🔌

Angie Hinrichs (UShER)

🔣 🔌

Benjamin Delisle

🐛 ⚠️

Vani Priyadarsini Ikkurthi

🐛 ⚠️

Mark Horsman

🤔 🎨

Jesse Bloom Lab

🔣 🔌

Dan Fornika

🤔 ⚠️

Tara Newman

🤔 ⚠️

This project follows the all-contributors specification. Contributions of any kind welcome!

Code Snippets

import pandas as pd
import click
import os
import logging
import requests
import sys
import time
from Bio import Phylo

NO_DATA_CHAR = "NA"

LAPIS_LINEAGE_COL = "pangoLineage"
LAPIS_OPEN_BASE = (
    "https://lapis.cov-spectrum.org/open/v1/sample/aggregated"
    + "?fields={lineage_col}".format(lineage_col=LAPIS_LINEAGE_COL)
    + "&nucMutations={mutations}"
)

LAPIS_GISAID_BASE = (
    "https://lapis.cov-spectrum.org/gisaid/v1/sample/aggregated"
    + "?accessKey={access_key}"
    + "&fields={lineage_col}".format(lineage_col=LAPIS_LINEAGE_COL)
    + "&nucMutations={mutations}"
)
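
# Example of a fully resolved open-endpoint query URL (mutations are illustrative):
# https://lapis.cov-spectrum.org/open/v1/sample/aggregated?fields=pangoLineage&nucMutations=C241T,C3037T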

LINEAGE_PROP_THRESHOLD = 0.01
LAPIS_SLEEP_TIME = 0
# Consider a breakpoint match if within 50 base pairs
BREAKPOINT_APPROX_BP = 50
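# e.g. a curated breakpoint 3500:4000 matches an observed 3472:4028
# (start and end each differ by 28 <= 50); coordinates are illustrative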

DATAFRAME_COLS = [
    "sc2rf_status",
    "sc2rf_details",
    "sc2rf_lineage",
    "sc2rf_clades_filter",
    "sc2rf_regions_filter",
    "sc2rf_regions_length",
    "sc2rf_breakpoints_filter",
    "sc2rf_num_breakpoints_filter",
    "sc2rf_breakpoints_motif",
    "sc2rf_unique_subs_filter",
    "sc2rf_alleles_filter",
    "sc2rf_intermission_allele_ratio",
    "cov-spectrum_parents",
    "cov-spectrum_parents_confidence",
    "cov-spectrum_parents_subs",
    "sc2rf_parents_conflict",
]


def create_logger(logfile=None):
    # create logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # create file handler which logs even debug messages
    if logfile:
        handler = logging.FileHandler(logfile)
    else:
        handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)
    return logger


def reverse_iter_collapse(
    regions,
    min_len,
    max_breakpoint_len,
    start_coord,
    end_coord,
    clade,
):
    """Collapse adjacent regions from the same parent into one region."""

    coord_list = list(regions.keys())
    coord_list.reverse()

    for coord in coord_list:
        prev_start_coord = coord
        prev_end_coord = regions[prev_start_coord]["end"]
        prev_region_len = (prev_end_coord - prev_start_coord) + 1
        prev_clade = regions[coord]["clade"]
        breakpoint_len = start_coord - prev_end_coord

        # If the previous region was too short AND from a different clade
        # Delete that previous region, it's an intermission
        if prev_region_len < min_len and clade != prev_clade:
            del regions[prev_start_coord]

        # If the previous breakpoint was too long AND from a different clade
        # Don't add the current region
        elif (
            start_coord != prev_start_coord
            and clade != prev_clade
            and (max_breakpoint_len != -1 and breakpoint_len > max_breakpoint_len)
        ):
            break

        # Collapse the current region into the previous one
        elif clade == prev_clade:
            regions[prev_start_coord]["end"] = end_coord
            break

        # Otherwise, clades differ and this is the start of a new region
        else:
            regions[start_coord] = {"clade": clade, "end": end_coord}
            break

    # Check if the reverse iteration wound up deleting all the regions
    if len(regions) == 0:
        regions[start_coord] = {"clade": clade, "end": end_coord}


@click.command()
@click.option(
    "--csv",
    help="CSV output from sc2rf, multiple files separate by commas.",
    required=True,
)
@click.option(
    "--ansi",
    help="ANSI output from sc2rf, multiple files separate by commas.",
    required=False,
)
@click.option("--motifs", help="TSV of breakpoint motifs", required=False)
@click.option(
    "--prefix",
    help="Prefix for output files.",
    required=False,
    default="sc2rf.recombinants",
)
@click.option(
    "--min-len",
    help="Minimum region length (-1 to disable filter).",
    required=False,
    default=-1,
)
@click.option(
    "--min-consec-allele",
    help="Minimum number of consecutive alleles in a region (-1 to disable filter).",
    required=False,
    default=-1,
)
@click.option(
    "--max-breakpoint-len",
    help="Maximum breakpoint length (-1 to disable filter).",
    required=False,
    default=-1,
)
@click.option(
    "--max-parents",
    help="Maximum number of parents (-1 to disable filter).",
    required=False,
    default=-1,
)
@click.option("--outdir", help="Output directory", required=False, default=".")
@click.option(
    "--aligned",
    help="Extract recombinants from this alignment (Note: requires seqkit)",
    required=False,
)
@click.option(
    "--nextclade",
    help="Nextclade TSV output from the sars-cov-2.",
    required=False,
)
@click.option(
    "--nextclade-no-recomb",
    help="Nextclade TSV output from the sars-cov-2-no-recomb dataset.",
    required=False,
)
@click.option(
    "--nextclade-auto-pass",
    help="CSV list of lineage assignments that will be called positive",
    required=False,
)
@click.option(
    "--max-breakpoints",
    help="The maximum number of breakpoints (-1 to disable breakpoint filtering)",
    required=False,
    default=-1,
)
@click.option(
    "--lapis",
    help="Enable the LAPIS API to identify parental lineages from covSPECTRUM.",
    is_flag=True,
    required=False,
    default=False,
)
@click.option(
    "--gisaid-access-key",
    help="The accessKey to query GISAID data with LAPIS",
    required=False,
)
@click.option(
    "--issues",
    help="Issues TSV metadata from pango-designation",
    required=False,
)
@click.option(
    "--lineage-tree",
    help="Newick tree of pangolin lineage hierarchies",
    required=False,
)
@click.option(
    "--metadata",
    help="Sample metadata TSV, used to include negative samples in the final output.",
    required=False,
)
@click.option("--log", help="Path to a log file", required=False)
@click.option(
    "--dup-method",
    help=(
        "Method for resolving duplicate results:\n"
        + "\nfirst: Use first positive result found.\n"
        + "\nlast: Use last positive result found.\n"
        + "\nmin_bp: Use fewest breakpoints.\n"
        + "\nmin_uncertainty: Use smallest breakpoint intervals."
    ),
    type=click.Choice(
        ["first", "last", "min_bp", "min_uncertainty"],
        case_sensitive=False,
    ),
    required=False,
    default="lineage",
)
def main(
    csv,
    ansi,
    prefix,
    min_len,
    min_consec_allele,
    max_breakpoint_len,
    outdir,
    aligned,
    nextclade,
    nextclade_auto_pass,
    nextclade_no_recomb,
    max_parents,
    issues,
    max_breakpoints,
    motifs,
    log,
    lineage_tree,
    metadata,
    dup_method,
    lapis,
    gisaid_access_key,
):
    """Detect recombinant seqences from sc2rf. Dependencies: pandas, click"""

    # Check for directory
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # create logger
    logger = create_logger(logfile=log)

    # -----------------------------------------------------------------------------
    # Import Optional Data Files

    # (Optional) issues.tsv of pango-designation issues
    #            lineage assignment by parent+breakpoint matching
    if issues:

        logger.info("Parsing issues: {}".format(issues))

        breakpoint_col = "breakpoints_curated"
        parents_col = "parents_curated"
        breakpoint_df = pd.read_csv(issues, sep="\t")
        breakpoint_df.fillna(NO_DATA_CHAR, inplace=True)
        drop_rows = breakpoint_df[breakpoint_df[breakpoint_col] == NO_DATA_CHAR].index
        breakpoint_df.drop(drop_rows, inplace=True)

        # Convert CSV to lists
        breakpoint_df[breakpoint_col] = [
            bp.split(",") for bp in breakpoint_df[breakpoint_col]
        ]
        breakpoint_df[parents_col] = [p.split(",") for p in breakpoint_df[parents_col]]

    # (Optional) motifs dataframe
    if motifs:
        logger.info("Parsing motifs: {}".format(motifs))
        motifs_df = pd.read_csv(motifs, sep="\t")

    # (Optional) nextclade tsv dataframe
    if nextclade:
        logger.info("Parsing nextclade: {}".format(nextclade))
        nextclade_df = pd.read_csv(nextclade, sep="\t")
        nextclade_df["seqName"] = [str(s) for s in nextclade_df["seqName"]]
        nextclade_df.set_index("seqName", inplace=True)
        nextclade_df.fillna(NO_DATA_CHAR, inplace=True)

        if nextclade_auto_pass:
            nextclade_auto_pass_lineages = nextclade_auto_pass.split(",")
            # Identify samples to autopass
            auto_pass_df = nextclade_df[
                (nextclade_df["Nextclade_pango"] != NO_DATA_CHAR)
                & (nextclade_df["Nextclade_pango"].isin(nextclade_auto_pass_lineages))
            ]

    # (Optional) nextclade tsv dataframe no-recomb dataset
    if nextclade_no_recomb:
        logger.info(
            "Parsing nextclade no-recomb output: {}".format(nextclade_no_recomb)
        )
        nextclade_no_recomb_df = pd.read_csv(nextclade_no_recomb, sep="\t")
        nextclade_no_recomb_df["seqName"] = [
            str(s) for s in nextclade_no_recomb_df["seqName"]
        ]
        nextclade_no_recomb_df.set_index("seqName", inplace=True)
        nextclade_no_recomb_df.fillna(NO_DATA_CHAR, inplace=True)

    # (Optional) metadata tsv dataframe to find negatives missing from sc2rf
    if metadata:
        logger.info("Parsing metadata tsv: {}".format(metadata))
        metadata_df = pd.read_csv(metadata, sep="\t")
        metadata_df["strain"] = [str(s) for s in metadata_df["strain"]]
        metadata_df.fillna(NO_DATA_CHAR, inplace=True)

    # (Optional) phylogenetic tree of pangolin lineages
    if lineage_tree:
        logger.info("Parsing lineage tree: {}".format(lineage_tree))
        tree = Phylo.read(lineage_tree, "newick")

    # -----------------------------------------------------------------------------
    # Import Dataframes of Potential Positive Recombinants

    # note: At the end of this section, only positives and false positives will
    #       remain in the dataframe.

    # sc2rf csv output (required)
    df = pd.DataFrame()
    csv_split = csv.split(",")
    # Store a dict of duplicate strains
    duplicate_strains = {}

    for csv_file in csv_split:
        logger.info("Parsing csv: {}".format(csv_file))

        try:
            temp_df = pd.read_csv(csv_file, sep=",")
        except pd.errors.EmptyDataError:
            logger.warning("No records in csv: {}".format(csv_file))
            # Skip this file; an empty DataFrame has no "sample" column to process
            continue

        # Add column to indicate which csv file results come from (debugging)
        temp_df.insert(
            loc=len(temp_df.columns),
            column="csv_file",
            value=os.path.basename(csv_file),
        )

        # Convert to str type
        temp_df["sample"] = [str(s) for s in temp_df["sample"]]
        temp_df.set_index("sample", inplace=True)

        # Add column to hold original strain name before marking duplicates
        temp_df.insert(
            loc=len(temp_df.columns),
            column="strain",
            value=temp_df.index,
        )

        # If the df has no records, this is the first csv
        if len(df) == 0:
            df = temp_df
        else:
            # Keep all dups for now, only retain best results at end
            for strain in temp_df.index:
                if strain in list(df["strain"]):
                    if strain not in duplicate_strains:
                        duplicate_strains[strain] = 1

                    # add suffix "_dup<i>"; removed at the end of the script
                    dup_1 = strain + "_dup{}".format(duplicate_strains[strain])
                    df.rename(index={strain: dup_1}, inplace=True)

                    duplicate_strains[strain] += 1

                    dup_2 = strain + "_dup{}".format(duplicate_strains[strain])
                    temp_df.rename(index={strain: dup_2}, inplace=True)

            # Combine primary and secondary data frames
            df = pd.concat([df, temp_df])

    df.fillna("", inplace=True)

    # -------------------------------------------------------------------------
    # Initialize new stat columns to NA

    # note: At the end of this section, only positives and false positives will
    #       be in the sc2rf_details_dict

    for col in DATAFRAME_COLS:
        df[col] = [NO_DATA_CHAR] * len(df)

    # Initialize a dictionary of sc2rf details and false positives
    # key: strain, value: list of reasons
    false_positives_dict = {}
    sc2rf_details_dict = {strain: [] for strain in df.index}

    # -------------------------------------------------------------------------
    # Add in the Negatives (if metadata was specified)
    # Avoiding the Bio module, since we just need names, not sequences

    if metadata:

        logger.info("Reporting non-recombinants in metadata as negatives")
        for strain in list(metadata_df["strain"]):
            # Ignore this strain if it's already in dataframe (it's a recombinant)
            if strain in list(df["strain"]):
                continue
            # Otherwise add it, with no data as default
            df.loc[strain] = NO_DATA_CHAR
            df.at[strain, "strain"] = strain
            df.at[strain, "sc2rf_status"] = "negative"
            sc2rf_details_dict[strain] = []

    # ---------------------------------------------------------------------
    # Auto-pass lineages from nextclade assignment, that were also detected by sc2rf

    if nextclade and nextclade_auto_pass:

        logger.info(
            "Auto-passing lineages: {}".format(",".join(nextclade_auto_pass_lineages))
        )

        for rec in auto_pass_df.iterrows():

            # Note: the strain is the true strain name, without the _dup suffix
            strain = rec[0]

            lineage = auto_pass_df.loc[strain]["Nextclade_pango"]
            details = "nextclade-auto-pass {}".format(lineage)

            # Case 1 : In df with NO DUPS, set the status to positive, update details
            if strain in list(df.index):

                sc2rf_details_dict[strain].append(details)
                df.at[strain, "sc2rf_status"] = "positive"
                df.at[strain, "sc2rf_details"] = ";".join(sc2rf_details_dict[strain])

            # Case 2: In df WITH DUPS, set the dups status to positive, update details
            elif strain in list(df["strain"]):

                dup_strains = list(df[df["strain"] == strain].index)
                for dup in dup_strains:

                    sc2rf_details_dict[dup].append(details)
                    df.at[dup, "sc2rf_status"] = "positive"
                    df.at[dup, "sc2rf_details"] = ";".join(sc2rf_details_dict[dup])

            # Case 3: If it's not in the dataframe add it
            else:
                sc2rf_details_dict[strain] = [details]

                # new row
                row_dict = {col: [NO_DATA_CHAR] for col in df.columns}
                row_dict["strain"] = strain
                row_dict["sc2rf_status"] = "positive"
                row_dict["sc2rf_details"] = ";".join(sc2rf_details_dict[strain])

                row_df = pd.DataFrame(row_dict).set_index("strain")
                df = pd.concat([df, row_df], ignore_index=False)

    # -------------------------------------------------------------------------
    # Begin Post-Processing
    logger.info("Post-processing table")

    # Iterate through all positive and negative records
    for rec in df.iterrows():

        # Skip over negative recombinants
        if rec[1]["sc2rf_status"] == "negative":
            continue

        strain = rec[0]

        # Check if this strain was auto-passed
        strain_auto_pass = False
        for detail in sc2rf_details_dict[strain]:
            if "auto-pass" in detail:
                strain_auto_pass = True

        regions_str = rec[1]["regions"]
        regions_split = regions_str.split(",")

        alleles_str = rec[1]["alleles"]
        alleles_split = alleles_str.split(",")

        unique_subs_str = rec[1]["unique_subs"]
        unique_subs_split = unique_subs_str.split(",")

        # Keys are going to be the start coord of the region
        regions_filter = {}
        unique_subs_filter = []
        alleles_filter = []
        breakpoints_filter = []

        # Store alleles for filtering
        intermission_alleles = []
        alleles_by_parent = {}

        prev_clade = None
        prev_start_coord = 0
        prev_end_coord = 0

        # ---------------------------------------------------------------------
        # FIRST PASS

        # If the region is NA, this is an auto-passed recombinant
        # and sc2rf couldn't find any breakpoints; skip the rest of processing
        # and continue on to the next strain

        if regions_split == [NO_DATA_CHAR]:
            continue

        for region in regions_split:
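            # Each region is encoded as "start:end|clade",
            # e.g. "8394:12879|Delta/21J" (values are illustrative)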

            coords = region.split("|")[0]
            clade = region.split("|")[1]
            start_coord = int(coords.split(":")[0])
            end_coord = int(coords.split(":")[1])
            region_len = (end_coord - start_coord) + 1
            coord_list = list(regions_filter)
            coord_list.reverse()

            # Just ignore singletons, no calculation necessary
            if region_len == 1:
                continue

            # Is this the first region?
            if not prev_clade:
                regions_filter[start_coord] = {"clade": clade, "end": end_coord}
                prev_clade = clade
                prev_start_coord = start_coord

            # Moving 3' to 5', collapse adjacent regions from the same parent
            # Modifies regions in place
            reverse_iter_collapse(
                regions=regions_filter,
                min_len=min_len,
                max_breakpoint_len=max_breakpoint_len,
                start_coord=start_coord,
                end_coord=end_coord,
                clade=clade,
            )

            # These get updated regardless of condition
            prev_clade = clade
            prev_end_coord = end_coord

        # Check the last region for length
        if len(regions_filter) > 1:
            start_coord = list(regions_filter)[-1]
            end_coord = regions_filter[start_coord]["end"]
            region_len = (end_coord - start_coord) + 1
            if region_len < min_len:
                del regions_filter[start_coord]

        # -----------------------------------------------------------------
        # SECOND PASS: UNIQUE SUBSTITUTIONS

        regions_filter_collapse = {}

        for start_coord in list(regions_filter):
            clade = regions_filter[start_coord]["clade"]
            end_coord = regions_filter[start_coord]["end"]

            region_contains_unique_sub = False

            for sub in unique_subs_split:

                sub_coord = int(sub.split("|")[0])
                sub_parent = sub.split("|")[1]

                if (
                    sub_coord >= start_coord
                    and sub_coord <= end_coord
                    and sub_parent == clade
                ):
                    region_contains_unique_sub = True
                    unique_subs_filter.append(sub)

            # If it contains a unique sub, check if we should
            # collapse into previous parental region
            if region_contains_unique_sub:
                reverse_iter_collapse(
                    regions=regions_filter_collapse,
                    min_len=min_len,
                    max_breakpoint_len=max_breakpoint_len,
                    start_coord=start_coord,
                    end_coord=end_coord,
                    clade=clade,
                )

        regions_filter = regions_filter_collapse

        # -----------------------------------------------------------------
        # THIRD PASS: CONSECUTIVE ALLELES

        regions_filter_collapse = {}

        for start_coord in list(regions_filter):
            clade = regions_filter[start_coord]["clade"]
            end_coord = regions_filter[start_coord]["end"]

            num_consec_allele = 0

            for allele in alleles_split:

                allele_coord = int(allele.split("|")[0])
                allele_parent = allele.split("|")[1]

                if (
                    allele_coord >= start_coord
                    and allele_coord <= end_coord
                    and allele_parent == clade
                ):
                    num_consec_allele += 1
                    alleles_filter.append(allele)

            # If there are sufficient consecutive alleles, check if we should
            # collapse into previous parental region
            if num_consec_allele >= min_consec_allele:
                reverse_iter_collapse(
                    regions=regions_filter_collapse,
                    min_len=min_len,
                    max_breakpoint_len=max_breakpoint_len,
                    start_coord=start_coord,
                    end_coord=end_coord,
                    clade=clade,
                )

        regions_filter = regions_filter_collapse

        # -----------------------------------------------------------------
        # Check if all the regions were collapsed
        if len(regions_filter) < 2:
            sc2rf_details_dict[strain].append("single parent")
            # if this is an auto-pass lineage, don't add to false positives
            if not strain_auto_pass:
                false_positives_dict[strain] = ""

        # -----------------------------------------------------------------
        # FOURTH PASS: BREAKPOINT DETECTION
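        # A breakpoint spans the gap between adjacent parental regions, e.g.
        # regions 100:200 and 300:400 yield breakpoint "201:299" (illustrative)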

        prev_start_coord = None
        for start_coord in regions_filter:

            end_coord = regions_filter[start_coord]["end"]

            # Skip the first record for breakpoints
            if prev_start_coord:
                breakpoint_start = prev_end_coord + 1
                breakpoint_end = start_coord - 1
                breakpoint = "{}:{}".format(breakpoint_start, breakpoint_end)
                breakpoints_filter.append(breakpoint)

            prev_start_coord = start_coord
            prev_end_coord = end_coord

        # check if the number of breakpoints changed
        # the filtered breakpoints should only ever be equal or less
        # 2022-06-17: Why? Under what conditions do the filtered breakpoints increase?
        # Except! If the breakpoints were initially 0
        # num_breakpoints = df["breakpoints"][strain]
        num_breakpoints_filter = len(breakpoints_filter)

        # Check for too many breakpoints
        if max_breakpoints != -1:
            if num_breakpoints_filter > max_breakpoints:
                details = "{} breakpoints > {} max breakpoints".format(
                    num_breakpoints_filter,
                    max_breakpoints,
                )
                sc2rf_details_dict[strain].append(details)
                # if this is an auto-pass lineage, don't add to false positives
                if not strain_auto_pass:
                    false_positives_dict[strain] = ""

        # Identify the new filtered clades
        clades_filter = [regions_filter[s]["clade"] for s in regions_filter]
        # clades_filter_csv = ",".join(clades_filter)
        num_parents = len(set(clades_filter))
        if max_parents != -1:
            if num_parents > max_parents:
                details = "{} parents > {}".format(num_parents, max_parents)
                sc2rf_details_dict[strain].append(details)
                # if this is an auto-pass lineage, don't add to false positives
                if not strain_auto_pass:
                    false_positives_dict[strain] = ""

        # ---------------------------------------------------------------------
        # Intermission/Minor Parent Allele Ratio
        # ie. alleles that conflict with the parental region
        #   be lenient if there were more parents initially reported by sc2rf (ex. >2)
        #   because this will lead to large numbers of unresolved intermissions
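        #   e.g. 2 intermissions / 4 minor-parent alleles = 0.5 (< 1, acceptable);
        #   values are illustrative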

        original_parents = rec[1]["examples"]
        num_original_parents = len(original_parents.split(","))

        if num_original_parents > num_parents:
            intermission_allele_ratio = "NA"
        else:
            for start_coord in regions_filter:
                end_coord = regions_filter[start_coord]["end"]
                clade = regions_filter[start_coord]["clade"]

                for allele in alleles_split:

                    allele_coord = int(allele.split("|")[0])
                    allele_clade = allele.split("|")[1]
                    allele_nuc = allele.split("|")[2]

                    # Skip if this allele is not found in the current region
                    if allele_coord < start_coord or allele_coord > end_coord:
                        continue

                    # Skip missing data
                    if allele_nuc == "N":
                        continue

                    # Check if this allele's origins conflicts with the parental region
                    if allele_clade != clade:
                        intermission_alleles.append(allele)
                    else:
                        if clade not in alleles_by_parent:
                            alleles_by_parent[clade] = []
                        alleles_by_parent[clade].append(allele)

            # Add in alleles that were not assigned to any region
            for allele in alleles_split:
                allele_nuc = allele.split("|")[2]
                # Skip missing data
                if allele_nuc == "N":
                    continue

                allele_coord = int(allele.split("|")[0])
                allele_in_region = False
                for start_coord in regions_filter:
                    end_coord = regions_filter[start_coord]["end"]
                    if allele_coord >= start_coord and allele_coord <= end_coord:
                        allele_in_region = True

                # Alleles not assigned to any region are counted as intermissions
                if not allele_in_region:
                    intermission_alleles.append(allele)

            # Identify the "minor" parent (least number of alleles)
            # minor_parent = None
            minor_num_alleles = len(alleles_split)

            for parent in alleles_by_parent:
                num_alleles = len(alleles_by_parent[parent])
                if num_alleles <= minor_num_alleles:
                    # minor_parent = parent
                    minor_num_alleles = num_alleles

            intermission_allele_ratio = len(intermission_alleles) / minor_num_alleles

            # When the ratio is 1 or above, there are more intermissions than
            # minor-parent alleles. But don't override the false_positive status
            # if this strain was already flagged as a false_positive previously
            if intermission_allele_ratio >= 1:
                sc2rf_details_dict[strain].append("intermission_allele_ratio >= 1")
                # if this is an auto-pass lineage, don't add to false positives
                if not strain_auto_pass:
                    false_positives_dict[strain] = ""

        # --------------------------------------------------------------------------
        # Extract the lengths of each region
        regions_length = [str(regions_filter[s]["end"] - s) for s in regions_filter]

        # Construct the new filtered regions
        regions_filter = [
            "{}:{}|{}".format(s, regions_filter[s]["end"], regions_filter[s]["clade"])
            for s in regions_filter
        ]

        # --------------------------------------------------------------------------
        # Identify lineage based on breakpoint and parents!
        # But only if we've supplied the issues.tsv for pango-designation
        if issues:
            sc2rf_lineage = ""
            sc2rf_lineages = {bp_s: [] for bp_s in breakpoints_filter}

            for bp_s in breakpoints_filter:
                start_s = int(bp_s.split(":")[0])
                end_s = int(bp_s.split(":")[1])

                match_found = False

                for bp_rec in breakpoint_df.iterrows():

                    # Skip over this potential lineage if parents are wrong
                    bp_parents = bp_rec[1][parents_col]
                    if bp_parents != clades_filter:
                        continue

                    for bp_i in bp_rec[1][breakpoint_col]:

                        start_i = int(bp_i.split(":")[0])
                        end_i = int(bp_i.split(":")[1])
                        start_diff = abs(start_s - start_i)
                        end_diff = abs(end_s - end_i)

                        if (
                            start_diff <= BREAKPOINT_APPROX_BP
                            and end_diff <= BREAKPOINT_APPROX_BP
                        ):

                            sc2rf_lineages[bp_s].append(bp_rec[1]["lineage"])
                            match_found = True

                if not match_found:
                    sc2rf_lineages[bp_s].append(NO_DATA_CHAR)

            # if len(sc2rf_lineages) == num_breakpoints_filter:
            collapse_lineages = []
            for bp in sc2rf_lineages.values():
                for lineage in bp:
                    collapse_lineages.append(lineage)

            collapse_lineages = list(set(collapse_lineages))

            # When there are multiple breakpoints, a match must be the same for all!
            collapse_lineages_filter = []
            for lin in collapse_lineages:

                if lin == NO_DATA_CHAR:
                    continue
                # By default, assume they all match
                matches_all_bp = True
                for bp_s in sc2rf_lineages:
                    # If the lineage is missing, it's not in all bp
                    if lin not in sc2rf_lineages[bp_s]:
                        matches_all_bp = False
                        break

                # Check if we should drop it
                if matches_all_bp:
                    collapse_lineages_filter.append(lin)

            if len(collapse_lineages_filter) == 0:
                collapse_lineages_filter = [NO_DATA_CHAR]

            sc2rf_lineage = ",".join(collapse_lineages_filter)
            df.at[strain, "sc2rf_lineage"] = sc2rf_lineage

        # check for breakpoint motifs, to override lineage call
        # all breakpoints must include a motif!
        # ---------------------------------------------------------------------
        if motifs:
            breakpoints_motifs = []
            for bp in breakpoints_filter:
                bp_motif = False
                bp_start = int(bp.split(":")[0])
                bp_end = int(bp.split(":")[1])

                # Add buffers
                bp_start = bp_start - BREAKPOINT_APPROX_BP
                bp_end = bp_end + BREAKPOINT_APPROX_BP

                for motif_rec in motifs_df.iterrows():
                    motif_start = motif_rec[1]["start"]
                    motif_end = motif_rec[1]["end"]

                    # Is motif contained within the breakpoint
                    # Allow fuzzy matching
                    if motif_start >= bp_start and motif_end <= bp_end:
                        # print("\t\t", motif_start, motif_end)
                        bp_motif = True

                breakpoints_motifs.append(bp_motif)

            # If there's a lone "False" value, it gets re-coded to an
            # empty string on export. To prevent that, force it to be
            # the NO_DATA_CHAR (ex. 'NA')
            if len(breakpoints_motifs) == 0:
                breakpoints_motifs_str = [NO_DATA_CHAR]
            else:
                breakpoints_motifs_str = [str(m) for m in breakpoints_motifs]

            df.at[strain, "sc2rf_breakpoints_motif"] = ",".join(breakpoints_motifs_str)

            # Override the lineage call if one breakpoint had no motif
            if False in breakpoints_motifs:
                sc2rf_details_dict[strain].append("missing breakpoint motif")
                # if this is an auto-pass lineage, don't add to false positives
                if not strain_auto_pass:
                    false_positives_dict[strain] = ""

        df.at[strain, "sc2rf_clades_filter"] = ",".join(clades_filter)
        df.at[strain, "sc2rf_regions_filter"] = ",".join(regions_filter)
        df.at[strain, "sc2rf_regions_length"] = ",".join(regions_length)
        df.at[strain, "sc2rf_breakpoints_filter"] = ",".join(breakpoints_filter)
        df.at[strain, "sc2rf_num_breakpoints_filter"] = num_breakpoints_filter
        df.at[strain, "sc2rf_unique_subs_filter"] = ",".join(unique_subs_filter)
        df.at[strain, "sc2rf_alleles_filter"] = ",".join(alleles_filter)
        df.at[strain, "sc2rf_intermission_allele_ratio"] = intermission_allele_ratio

        if strain not in false_positives_dict:
            df.at[strain, "sc2rf_status"] = "positive"
            # A sample can be positive with no breakpoints, if auto-pass
            if len(breakpoints_filter) != 0:
                sc2rf_details_dict[strain] = ["recombination detected"]
            # For auto-pass negatives, update the csv_file
            else:
                df.at[strain, "csv_file"] = NO_DATA_CHAR
        else:
            df.at[strain, "sc2rf_status"] = "false_positive"

        df.at[strain, "sc2rf_details"] = ";".join(sc2rf_details_dict[strain])

    # ---------------------------------------------------------------------
    # Resolve strains with duplicate results

    logger.info("Reconciling duplicate results with method: {}".format(dup_method))
    for strain in duplicate_strains:

        # Check if this strain was auto-passed
        # Guard: auto_pass_df only exists when --nextclade and --nextclade-auto-pass were given
        strain_auto_pass = (
            bool(nextclade and nextclade_auto_pass) and strain in auto_pass_df.index
        )
        strain_df = df[df["strain"] == strain]
        strain_bp = list(set(strain_df["sc2rf_breakpoints_filter"]))

        num_dups = duplicate_strains[strain]
        # Which duplicates should we keep (1), which should we remove (many)
        keep_dups = []
        remove_dups = []

        for i in range(1, num_dups + 1):
            dup_strain = strain + "_dup{}".format(i)
            dup_status = df["sc2rf_status"][dup_strain]

            if dup_status == "positive":
                keep_dups.append(dup_strain)
            else:
                remove_dups.append(dup_strain)

        # Case 1. No keep dups were found, retain first removal dup, remove all else
        if len(keep_dups) == 0:
            keep_strain = remove_dups[0]
            keep_dups.append(keep_strain)
            remove_dups.remove(keep_strain)

        # Case 2. Multiple keeps found, but it's an auto-pass with no breakpoints.
        # Just take the first csv
        elif strain_auto_pass and strain_bp == [""]:
            remove_dups += keep_dups[1:]
            keep_dups = [keep_dups[0]]

        # Case 3. Multiple keeps found, use dup_method
        elif len(keep_dups) > 1:

            # First try to match to a published lineage
            # Key = dup_strain, value = csv of lineages
            lineage_strains = {}

            for dup_strain in keep_dups:
                dup_lineage = df["sc2rf_lineage"][dup_strain]

                if dup_lineage != NO_DATA_CHAR:
                    lineage_strains[dup_strain] = dup_lineage

            lineage_strains_matches = list(set(lineage_strains.values()))

            # Check if a match was found, but not more than 1 candidate lineage
            if len(lineage_strains) > 0 and len(lineage_strains_matches) < 2:
                keep_strain = list(lineage_strains)[0]
                # Remove all strains except the min one
                remove_dups = keep_dups + remove_dups
                remove_dups.remove(keep_strain)
                keep_dups = [keep_strain]

            # Otherwise, use a duplicate resolving method
            elif dup_method == "first":
                remove_dups += keep_dups[1:]
                keep_dups = [keep_dups[0]]

            elif dup_method == "last":
                remove_dups += keep_dups[0:-1]
                keep_dups = [keep_dups[-1]]

            elif dup_method == "min_bp":
                min_bp = None
                min_bp_strain = None
                for dup_strain in keep_dups:
                    num_bp = df["sc2rf_num_breakpoints_filter"][dup_strain]
                    if not min_bp:
                        min_bp = num_bp
                        min_bp_strain = dup_strain
                    elif num_bp < min_bp:
                        min_bp = num_bp
                        min_bp_strain = dup_strain
                # Remove all strains except the min one
                remove_dups = keep_dups + remove_dups
                remove_dups.remove(min_bp_strain)
                keep_dups = [min_bp_strain]

            elif dup_method == "min_uncertainty":

                min_uncertainty = None
                min_uncertainty_strain = None

                for dup_strain in keep_dups:

                    # '8394:12879,13758:22000'
                    bp = df["sc2rf_breakpoints_filter"][dup_strain]
                    # ['8394:12879','13758:22000']
                    # Skip if this is an auto-pass lineage with no breakpoints
                    if bp == "":
                        continue
                    bp = bp.split(",")
                    # [4485, 8242]
                    bp = [int(c.split(":")[1]) - int(c.split(":")[0]) for c in bp]
                    bp_uncertainty = sum(bp)

                    # Set to the first strain by default
                    if not min_uncertainty:
                        min_uncertainty_strain = dup_strain
                        min_uncertainty = bp_uncertainty
                    elif bp_uncertainty < min_uncertainty:
                        min_uncertainty_strain = dup_strain
                        min_uncertainty = bp_uncertainty

                # Remove all strains except the min one
                remove_dups = keep_dups + remove_dups
                remove_dups.remove(min_uncertainty_strain)
                keep_dups = [min_uncertainty_strain]

        # If this is an auto-pass negative, indicate no csv file was used
        if strain_bp == [""]:
            keep_csv_file = NO_DATA_CHAR
        else:
            keep_csv_file = df["csv_file"][keep_dups[0]]
        logger.info(
            "Reconciling duplicate results for {}, retaining output from: {}".format(
                strain, keep_csv_file
            )
        )
        # Rename the accepted duplicate
        df.rename(index={keep_dups[0]: strain}, inplace=True)
        # Drop the rejected duplicates
        df.drop(labels=remove_dups, inplace=True)

    # ---------------------------------------------------------------------
    # Fix up duplicate strain names in false_positives

    false_positives_filter = {}
    for strain in false_positives_dict:

        strain_orig = strain.split("_dup")[0]
        status = df["sc2rf_status"][strain_orig]
        # If the original strain isn't positive
        # All duplicates were false positives and should be removed
        if status != "positive":
            false_positives_filter[strain_orig] = false_positives_dict[strain]
            sc2rf_details_dict[strain_orig] = sc2rf_details_dict[strain]

    false_positives_dict = false_positives_filter

    # ---------------------------------------------------------------------
    # Identify parent lineages by querying cov-spectrum mutations

    # We can only do this if:
    # 1. A nextclade no-recomb tsv file was specified with mutations
    # 2. Multiple regions were detected (not collapsed down to one parent region)
    # 3. The lapis API was enabled (--lapis)

    if nextclade_no_recomb and lapis:

        logger.info(
            "Identifying parent lineages based on nextclade no-recomb substitutions"
        )

        # Check if we're using open data or GISAID
        if gisaid_access_key:
            lapis_url_base = LAPIS_GISAID_BASE.format(
                access_key=gisaid_access_key, mutations="{mutations}"
            )
        else:
            lapis_url_base = LAPIS_OPEN_BASE

        positive_df = df[df["sc2rf_status"] == "positive"]
        total_positives = len(positive_df)
        progress_i = 0

        # keys = query, value = json
        query_subs_dict = {}

        for rec in positive_df.iterrows():

            strain = rec[0]
            strain_bp = rec[1]["sc2rf_breakpoints_filter"]

            # If this was an auto-pass negative strain, no regions for us to query
            if strain_bp == NO_DATA_CHAR:
                continue

            parent_clades = rec[1]["sc2rf_clades_filter"].split(",")

            progress_i += 1
            logger.info("{} / {}: {}".format(progress_i, total_positives, strain))

            regions_filter = positive_df["sc2rf_regions_filter"][strain].split(",")

            parent_lineages = []
            parent_lineages_confidence = []
            parent_lineages_subs = []

            substitutions = nextclade_no_recomb_df["substitutions"][strain].split(",")
            unlabeled_privates = nextclade_no_recomb_df[
                "privateNucMutations.unlabeledSubstitutions"
            ][strain].split(",")

            # Remove NA char
            if NO_DATA_CHAR in substitutions:
                substitutions.remove(NO_DATA_CHAR)
            if NO_DATA_CHAR in unlabeled_privates:
                unlabeled_privates.remove(NO_DATA_CHAR)

            # Exclude privates from mutations to query
            for private in unlabeled_privates:
                # Might not be in there if it's an indel
                if private in substitutions:
                    substitutions.remove(private)

            # Split mutations by region
            for region, parent_clade in zip(regions_filter, parent_clades):

                # If the parental clade is a recombinant, we'll allow the
                # cov-spectrum lineages to be recombinants
                parent_clade_is_recombinant = False
                for label in parent_clade.split("/"):
                    if label.startswith("X"):
                        parent_clade_is_recombinant = True

                region_coords = region.split("|")[0]
                region_start = int(region_coords.split(":")[0])
                region_end = int(region_coords.split(":")[1])
                region_subs = []

                for sub in substitutions:
                    sub_coord = int(sub[1:-1])
                    if sub_coord >= region_start and sub_coord <= region_end:
                        region_subs.append(sub)

                region_subs_csv = ",".join(region_subs)

                # Check if we already fetched this subs combo
                if region_subs_csv in query_subs_dict:
                    logger.info("\tUsing cache for region {}".format(region_coords))
                    lineage_data = query_subs_dict[region_subs_csv]
                # Otherwise, query cov-spectrum for these subs
                else:
                    query_subs_dict[region_subs_csv] = ""
                    url = lapis_url_base.format(mutations=region_subs_csv)
                    logger.info(
                        "Querying cov-spectrum for region {}".format(region_coords)
                    )
                    r = requests.get(url)
                    # Sleep after fetching
                    time.sleep(LAPIS_SLEEP_TIME)
                    result = r.json()
                    lineage_data = result["data"]
                    query_subs_dict[region_subs_csv] = lineage_data

                # Have keys be counts
                lineage_dict = {}

                for lapis_rec in lineage_data:
                    lineage = lapis_rec[LAPIS_LINEAGE_COL]

                    # Ignore None lineages
                    if not lineage:
                        continue

                    # If parent clade is not a recombinant, don't include
                    # recombinant lineages in final dict and count
                    if lineage.startswith("X") and not parent_clade_is_recombinant:
                        continue

                    count = lapis_rec["count"]
                    lineage_dict[count] = lineage

                # Sort in order
                lineage_dict = {
                    k: lineage_dict[k] for k in sorted(lineage_dict, reverse=True)
                }

                # If no matches were found, report NA for lineage
                if len(lineage_dict) == 0:
                    max_lineage = NO_DATA_CHAR
                    max_prop = NO_DATA_CHAR
                else:
                    # Tally the total matches and identify the top lineage
                    total_count = sum(lineage_dict.keys())
                    max_count = max(lineage_dict.keys())
                    max_prop = max_count / total_count
                    max_lineage = lineage_dict[max_count]

                    # Don't want to report recombinants as parents
                    # UNLESS, the parental clade is a recombinant (ex. XBB)
                    while (
                        max_lineage is None
                        or max_lineage.startswith("X")
                        or max_lineage == "Unassigned"
                    ):

                        # Allow the max_lineage to be recombinant if clade is also
                        if max_lineage.startswith("X") and parent_clade_is_recombinant:
                            break
                        lineage_dict = {
                            count: lineage
                            for count, lineage in lineage_dict.items()
                            if lineage != max_lineage
                        }
                        # If there are no other options, set to NA
                        if len(lineage_dict) == 0:
                            max_lineage = NO_DATA_CHAR
                            break
                        # Otherwise try again!
                        else:
                            # For now, deliberately don't update total_count
                            max_count = max(lineage_dict.keys())
                            max_prop = max_count / total_count
                            max_lineage = lineage_dict[max_count]

                    # If we ended with an empty dictionary,
                    # there were no usable lineages
                    if len(lineage_dict) == 0:
                        max_lineage = NO_DATA_CHAR
                        max_prop = NO_DATA_CHAR

                    # Combine counts of sublineages into the max lineage total
                    # This requires the pangolin lineage tree!
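                    # e.g. counts for BA.2.9 and BA.2.12 would be added to BA.2,
                    # which is then reported as "BA.2*" (lineages are illustrative)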
                    if lineage_tree:
                        max_lineage_tree = [c for c in tree.find_clades(max_lineage)]
                        # Make sure we found this lineage in the tree
                        if len(max_lineage_tree) == 1:
                            max_lineage_tree = max_lineage_tree[0]
                            max_lineage_children = [
                                c.name for c in max_lineage_tree.find_clades()
                            ]

                            # Search for counts in the lapis data that
                            # descend from the max lineage
                            for count, lineage in lineage_dict.items():
                                if (
                                    lineage in max_lineage_children
                                    and lineage != max_lineage
                                ):
                                    max_count += count
                                    max_prop = max_count / total_count

                            # Add a "*" suffix to the max lineage, to indicate
                            # this includes descendant counts
                            max_lineage = max_lineage + "*"

                parent_lineages_sub_str = "{}|{}".format(region_subs_csv, max_lineage)

                parent_lineages.append(max_lineage)
                parent_lineages_confidence.append(max_prop)
                parent_lineages_subs.append(parent_lineages_sub_str)

            # Update the dataframe columns
            df.at[strain, "cov-spectrum_parents"] = ",".join(parent_lineages)
            df.at[strain, "cov-spectrum_parents_confidence"] = ",".join(
                str(round(c, 3)) if type(c) == float else NO_DATA_CHAR
                for c in parent_lineages_confidence
            )
            df.at[strain, "cov-spectrum_parents_subs"] = ";".join(parent_lineages_subs)

    # ---------------------------------------------------------------------
    # Identify parent conflict

    if nextclade_no_recomb and lapis and lineage_tree:

        logger.info("Identifying parental conflict between lineage and clade.")

        positive_df = df[df["sc2rf_status"] == "positive"]

        for rec in positive_df.iterrows():

            strain = rec[0]
            parent_clades = rec[1]["sc2rf_clades_filter"].split(",")
            parent_lineages = rec[1]["cov-spectrum_parents"].split(",")
            conflict = False
            lineage_is_descendant = False
            lineage_is_ancestor = False

            # Clade format can be:
            #   Lineage (BA.2.3.20)
            #   WHO_LABEL/Nextstrain clade (Delta/21J)
            #   WHO_LABEL/Lineage/Nextstrain clade (Omicron/BA.1/21K)
            #   WHO_LABEL/Lineage/Nextstrain clade (Omicron/XBB/22F)
            #   Lineage/Nextstrain clade (B.1.177/20E)

            for i, labels in enumerate(parent_clades):
                parent = NO_DATA_CHAR
                labels_split = labels.split("/")
                for label in labels_split:
                    # Need to find the lineage which contains "."
                    if "." in label:
                        parent = label
                    # Or a recombinant, which doesn't have a "."
                    elif label.startswith("X"):
                        parent = label
                parent_clades[i] = parent

            for c, l in zip(parent_clades, parent_lineages):

                # We can't compare to NA
                if c == NO_DATA_CHAR or l == NO_DATA_CHAR:
                    continue
                # Remove the asterisk indicating all descendants
                l = l.replace("*", "")
                # If they are the same, there is no conflict, continue
                if c == l:
                    continue

                # Check if parents_lineage is a descendant of parents_clade
                # (find_clades returns exactly one match if l is in c's subtree)
                c_node = [node for node in tree.find_clades(c)][0]
                l_node = [node for node in c_node.find_clades(l)]
                if len(l_node) == 1:
                    lineage_is_descendant = True

                # Check if parents_lineage is an ancestor of parents_clade
                l_node = [node for node in tree.find_clades(l)][0]
                c_node = [node for node in l_node.find_clades(c)]
                if len(c_node) == 1:
                    lineage_is_ancestor = True

                if not lineage_is_descendant and not lineage_is_ancestor:
                    conflict = True

            df.at[strain, "sc2rf_parents_conflict"] = conflict

    # ---------------------------------------------------------------------
    # Write exclude strains (false positives)

    outpath_exclude = os.path.join(outdir, prefix + ".exclude.tsv")
    logger.info("Writing strains to exclude: {}".format(outpath_exclude))
    if len(false_positives_dict) > 0:
        with open(outpath_exclude, "w") as outfile:
            for strain in false_positives_dict:
                details = ";".join(sc2rf_details_dict[strain])
                outfile.write(strain + "\t" + details + "\n")
    else:
        cmd = "touch {outpath}".format(outpath=outpath_exclude)
        os.system(cmd)

    # -------------------------------------------------------------------------
    # write output table

    # Drop old columns; if there were only negative samples, these columns don't exist
    logger.info("Formatting output columns.")
    if set(df["sc2rf_status"]) != set(["negative"]):
        df.drop(
            [
                "examples",
                "intermissions",
                "breakpoints",
                "regions",
                "unique_subs",
                "alleles",
            ],
            axis="columns",
            inplace=True,
        )

    # Move the strain column to the beginning
    df.drop(columns="strain", inplace=True)
    df.insert(loc=0, column="strain", value=df.index)
    df.rename(
        {
            "sc2rf_clades_filter": "sc2rf_parents",
            "sc2rf_regions_filter": "sc2rf_regions",
            "sc2rf_breakpoints_filter": "sc2rf_breakpoints",
            "sc2rf_num_breakpoints_filter": "sc2rf_num_breakpoints",
            "sc2rf_unique_subs_filter": "sc2rf_unique_subs",
            "sc2rf_alleles_filter": "sc2rf_alleles",
        },
        axis="columns",
        inplace=True,
    )
    # Sort by status
    df.sort_values(by=["sc2rf_status", "sc2rf_lineage"], inplace=True, ascending=False)
    outpath_rec = os.path.join(outdir, prefix + ".tsv")

    logger.info("Writing the output table: {}".format(outpath_rec))
    df.to_csv(outpath_rec, sep="\t", index=False)

    # -------------------------------------------------------------------------
    # write output strains
    outpath_strains = os.path.join(outdir, prefix + ".txt")
    strains_df = df[
        (df["sc2rf_status"] != "negative") & (df["sc2rf_status"] != "false_positive")
    ]
    strains = list(strains_df.index)
    strains_txt = "\n".join(strains)
    with open(outpath_strains, "w") as outfile:
        outfile.write(strains_txt)

    # -------------------------------------------------------------------------
    # filter the ansi output
    if ansi:

        ansi_split = ansi.split(",")
        outpath_ansi = os.path.join(outdir, prefix + ".ansi.txt")

        for i, ansi_file in enumerate(ansi_split):
            logger.info("Parsing ansi: {}".format(ansi_file))
            if len(false_positives_dict) > 0:
                cmd = (
                    "cut -f 1 "
                    + "{exclude} | grep -v -f - {inpath} {operator} {outpath}".format(
                        exclude=outpath_exclude,
                        inpath=ansi_file,
                        operator=">" if i == 0 else ">>",
                        outpath=outpath_ansi,
                    )
                )
            else:
                cmd = "cat {inpath} {operator} {outpath}".format(
                    inpath=ansi_file,
                    operator=">" if i == 0 else ">>",
                    outpath=outpath_ansi,
                )
            logger.info("Writing filtered ansi: {}".format(outpath_ansi))
            # logger.info(cmd)
            os.system(cmd)

    # -------------------------------------------------------------------------
    # write alignment
    if aligned:
        outpath_fasta = os.path.join(outdir, prefix + ".fasta")
        logger.info("Writing filtered alignment: {}".format(outpath_fasta))

        cmd = "seqkit grep -f {outpath_strains} {aligned} > {outpath_fasta};".format(
            outpath_strains=outpath_strains,
            aligned=aligned,
            outpath_fasta=outpath_fasta,
        )
        os.system(cmd)


if __name__ == "__main__":
    main()
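The parent-conflict check above leans on a property of Bio.Phylo's find_clades: searching a clade's subtree by name returns exactly one match when, and only when, the query lineage descends from that clade. A minimal sketch of this ancestry test, using toy lineage names and assuming unique clade names as produced by the lineage tree script:

from io import StringIO
from Bio import Phylo

# Toy nomenclature tree: B.1 -> BA.2 -> BA.2.75 (names are illustrative)
tree = Phylo.read(StringIO("((BA.2.75)BA.2)B.1;"), "newick")

ba2 = [node for node in tree.find_clades("BA.2")][0]
# BA.2.75 is found within the BA.2 subtree, so it is a descendant
print(len([node for node in ba2.find_clades("BA.2.75")]) == 1)  # True
# B.1 is not within the BA.2 subtree, so it is not a descendant
print(len([node for node in ba2.find_clades("B.1")]) == 1)      # False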
import requests
import click
import pandas as pd
import numpy as np
import sys
from datetime import datetime

NO_DATA_CHAR = "NA"
rate_limit_url = "https://api.github.com/rate_limit"
base_url = "https://api.github.com/repos/cov-lineages/pango-designation/issues"
params = "?state=all&per_page=100&page={page_num}"
query_url = base_url + params

# These issues have problems, and are manually curated in the breakpoints file
# Typically, they describe multiple lineages in a single issue
EXCLUDE_ISSUES = [
    808,  # Chronic B.1.438.1 recombinant with BA.1.1.16
    844,  # XAY and XBA
]


@click.command()
@click.option("--token", help="Github API Token", required=False)
@click.option(
    "--breakpoints",
    help=(
        "TSV of curated breakpoints with columns 'lineage', 'issue', and 'breakpoints'"
    ),
    required=False,
)
def main(token, breakpoints):
    """Fetch pango-designation issues"""

    breakpoints_curated = breakpoints

    # Construct the header
    header_cols = [
        "issue",
        "lineage",
        "status",
        "status_description",
        "countries",
        "breakpoints",
        "title",
        "date_created",
        "date_updated",
        "date_closed",
    ]
    df = pd.DataFrame(columns=header_cols)

    # Iterate through the pages of issues
    pages = range(1, 100)
    for page_num in pages:

        # If the user supplied an API token, use it
        headers = {}
        if token:
            headers = {"Authorization": "token {}".format(token)}

        # Check the current rate limit
        r = requests.get(rate_limit_url, headers=headers)
        api_stats = r.json()
        requests_limit = api_stats["resources"]["core"]["limit"]
        requests_remaining = api_stats["resources"]["core"]["remaining"]

        reset_time = api_stats["resources"]["core"]["reset"]
        reset_date = datetime.fromtimestamp(reset_time)

        if requests_remaining == 0:
            msg = "ERROR: Hourly API limit of {} requests exceeded,".format(
                requests_limit
            )
            msg += " rate limit will reset after {}.".format(reset_date)
            print(msg, file=sys.stderr)
            sys.exit(1)

        # We're under the rate limit, and can proceed
        r = requests.get(query_url.format(page_num=page_num), headers=headers)
        issues_json = r.json()

        # The issues json will be empty if we ran out of pages
        if len(issues_json) == 0:
            break

        # Iterate through issues
        for issue in issues_json:

            # Assume not a recombinant
            recombinant = False
            number = issue["number"]

            # Check for excluded issues
            if number in EXCLUDE_ISSUES:
                continue

            # sanitize quotes out of title
            title = issue["title"].replace('"', "")
            # sanitize markdown formatters
            title = title.replace("*", "")

            # If the title includes recombinant or recombination
            if "recombinant" in title.lower() or "recombination" in title.lower():
                recombinant = True

            # Designated lineages are stored under the milestone title
            lineage = ""
            milestone = issue["milestone"]
            if milestone:
                lineage = milestone["title"]

            # Detect recombinant by lineage nomenclature (X)
            if lineage:
                if lineage.startswith("X"):
                    recombinant = True
                else:
                    continue

            # Parse labels
            labels = issue["labels"]

            # Labels are our last chance, so if the issue has none, skip it
            if not recombinant and len(labels) == 0:
                continue

            status = ""
            status_description = ""
            if len(labels) > 0:
                status = labels[0]["name"]
                status_description = labels[0]["description"]

            # Check if the label (status) includes recombinant
            if "recombinant" in status.lower():
                recombinant = True

            # Skip to the next record if this is not a recombinant issue
            if not recombinant:
                continue

            # If a lineage hasn't been assigned,
            # use the proposed# nomenclature from UShER
            if not lineage:
                lineage = "proposed{}".format(number)

            # Try to extract info from the body
            body = issue["body"]
            breakpoints = []
            countries = []

            # Skip issues without a body
            if not body:
                continue

            for line in body.split("\n"):

                line = line.strip().replace("*", "")

                # Breakpoints
                if "breakpoint" in line.lower():
                    breakpoints.append(line)

                # Countries (nicely structured)
                if "countries circulating" in line.lower():
                    line = line.replace("Countries circulating:", "")
                    countries.append(line)

            breakpoints = ";".join(breakpoints)
            countries = ";".join(countries)

            # Dates
            date_created = issue["created_at"]
            date_updated = issue["updated_at"]
            date_closed = issue["closed_at"]

            if not date_closed:
                date_closed = NO_DATA_CHAR

            # Create the output data
            data = {col: "" for col in header_cols}
            data["issue"] = number
            data["lineage"] = lineage
            data["status"] = status
            data["status_description"] = status_description
            data["title"] = title
            data["countries"] = countries
            data["breakpoints"] = breakpoints
            data["date_created"] = date_created
            data["date_updated"] = date_updated
            data["date_closed"] = date_closed

            # Change data vals to lists for pandas
            data = {k: [v] for k, v in data.items()}
            # Convert dict to dataframe
            df_data = pd.DataFrame(data)
            # Add data to the main dataframe
            df = pd.concat([df, df_data], ignore_index=True)

    # -------------------------------------------------------------------------
    # Curate breakpoints

    if breakpoints_curated:
        df_breakpoints = pd.read_csv(breakpoints_curated, sep="\t")

        if (
            "breakpoints" in df_breakpoints.columns
            and "breakpoints_curated" not in df.columns
        ):
            df.insert(5, "breakpoints_curated", [""] * len(df))

        if "parents" in df_breakpoints.columns and "parents_curated" not in df.columns:
            df.insert(5, "parents_curated", [""] * len(df))

        for rec in df.iterrows():
            issue = rec[1]["issue"]
            lineage = rec[1]["lineage"]

            if issue not in list(df_breakpoints["issue"]):
                continue

            match = df_breakpoints[df_breakpoints["issue"] == issue]
            bp = list(match["breakpoints"])[0]
            parents = list(match["parents"])[0]

            # Note: "bp != np.nan" is always True, so use pd.isna instead
            if not pd.isna(bp):
                df.at[rec[0], "breakpoints_curated"] = bp
            if not pd.isna(parents):
                df.at[rec[0], "parents_curated"] = parents

        # Check for lineages with curated breakpoints that are missing
        # Ex. "XAY" and "XBA" were grouped into one issue
        for rec in df_breakpoints.iterrows():
            lineage = rec[1]["lineage"]

            if lineage in list(df["lineage"]):
                continue

            issue = rec[1]["issue"]
            breakpoints = rec[1]["breakpoints"]
            parents = rec[1]["parents"]

            row_data = {col: [""] for col in df.columns}
            row_data["lineage"][0] = lineage
            row_data["issue"][0] = issue
            row_data["breakpoints_curated"][0] = breakpoints
            row_data["parents_curated"][0] = parents

            row_df = pd.DataFrame(row_data)
            df = pd.concat([df, row_df])

    # -------------------------------------------------------------------------
    # Sort the final dataframe
    status_order = {"designated": 0, "recombinant": 1, "monitor": 2, "not accepted": 3}

    # Change empty cells to NaN so they'll be sorted to the end
    df = df.replace({"lineage": "", "status": ""}, np.nan)
    df.sort_values(
        [
            "status",
            "lineage",
        ],
        inplace=True,
        key=lambda x: x.map(status_order),
    )
    df.to_csv(sys.stdout, sep="\t", index=False)


if __name__ == "__main__":
    main()
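The fetch loop above follows a general pattern for GitHub's REST API: check the /rate_limit endpoint before each request, ask for 100 issues per page, and stop when a page comes back empty. A condensed sketch of that pattern; fetch_all_pages is a hypothetical helper, not part of the pipeline:

import sys
import requests

def fetch_all_pages(base_url, token=None, per_page=100):
    """Yield items from a paginated GitHub endpoint, stopping on an empty page."""
    headers = {"Authorization": "token {}".format(token)} if token else {}
    page = 1
    while True:
        # Check the remaining quota before each request
        core = requests.get(
            "https://api.github.com/rate_limit", headers=headers
        ).json()["resources"]["core"]
        if core["remaining"] == 0:
            sys.exit("Rate limit exceeded; resets at {}".format(core["reset"]))
        r = requests.get(
            base_url,
            params={"state": "all", "per_page": per_page, "page": page},
            headers=headers,
        )
        items = r.json()
        if not items:
            break
        yield from items
        page += 1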
import click
import os
import pandas as pd
from datetime import datetime, date
import statistics

# Hard-coded constants

NO_DATA_CHAR = "NA"


# Select and rename columns from linelist
LINEAGE_COLS = [
    "cluster_id",
    "status",
    "lineage",
    "recombinant_lineage_curated",
    "parents_clade",
    "parents_lineage",
    "parents_lineage_confidence",
    "breakpoints",
    "issue",
    "sequences",
    "growth_score",
    "earliest_date",
    "latest_date",
    "rbd_level",
    "immune_escape",
    "ace2_binding",
    "cluster_privates",
    "cov-spectrum_query",
]


@click.command()
@click.option(
    "--input", help="Input file of recombinant sequences (tsv).", required=True
)
@click.option(
    "--output", help="Output file of recombinant lineages (tsv)", required=True
)
@click.option("--geo", help="Geography column", required=False, default="country")
def main(
    input,
    output,
    geo,
):
    """Create a table of recombinant lineages"""

    # -------------------------------------------------------------------------
    # Setup
    # -------------------------------------------------------------------------

    # Misc variables
    outdir = os.path.dirname(output)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    df = pd.read_csv(input, sep="\t")
    df.fillna(NO_DATA_CHAR, inplace=True)

    issues_format = []

    for issue in df["issue"]:
        if type(issue) == int:
            issues_format.append(int(issue))
        elif type(issue) == float:
            issue = str(int(issue))
            issues_format.append(issue)
        elif type(issue) == str:
            issues_format.append(issue)

    df["issue"] = issues_format

    # Issue #168 NULL dates are allowed
    # Set to today instead
    # https://github.com/ktmeaton/ncov-recombinant/issues/168
    seq_date = []
    for d in list(df["date"]):
        try:
            d = datetime.strptime(d, "%Y-%m-%d").date()
        except ValueError:
            # Set NA dates to today
            d = date.today()

        seq_date.append(d)
    df["datetime"] = seq_date

    # -------------------------------------------------------------------------
    # Create the recombinants table (recombinants.tsv)
    # -------------------------------------------------------------------------

    recombinants_data = {col: [] for col in LINEAGE_COLS}
    recombinants_data[geo] = []

    for cluster_id in set(df["cluster_id"]):

        match_df = df[df["cluster_id"] == cluster_id]

        earliest_date = min(match_df["datetime"])
        latest_date = max(match_df["datetime"])
        sequences = len(match_df)

        # Summarize rbd_level by the mode
        rbd_level = statistics.mode(match_df["rbd_level"])
        # Summarize immune_escape by the mean; immune_escape can be NA
        immune_escape = [
            val for val in match_df["immune_escape"].values if val != NO_DATA_CHAR
        ]
        if len(immune_escape) > 0:
            # Average only the non-NA values
            immune_escape = statistics.mean(immune_escape)
        else:
            immune_escape = NO_DATA_CHAR

        # Summarize ace2_binding by the mean; ace2_binding can be NA
        ace2_binding = [
            val for val in match_df["ace2_binding"].values if val != NO_DATA_CHAR
        ]
        if len(ace2_binding) > 0:
            # Average only the non-NA values
            ace2_binding = statistics.mean(ace2_binding)
        else:
            ace2_binding = NO_DATA_CHAR

        recombinants_data["cluster_id"].append(cluster_id)
        recombinants_data["status"].append(match_df["status"].values[0])
        recombinants_data["lineage"].append(match_df["lineage"].values[0])
        recombinants_data["recombinant_lineage_curated"].append(
            match_df["recombinant_lineage_curated"].values[0]
        )
        recombinants_data["parents_clade"].append(match_df["parents_clade"].values[0])
        recombinants_data["parents_lineage"].append(
            match_df["parents_lineage"].values[0]
        )
        recombinants_data["parents_lineage_confidence"].append(
            match_df["parents_lineage_confidence"].values[0]
        )
        recombinants_data["breakpoints"].append(match_df["breakpoints"].values[0])
        recombinants_data["issue"].append(match_df["issue"].values[0])
        recombinants_data["sequences"].append(sequences)
        recombinants_data["earliest_date"].append(earliest_date)
        recombinants_data["latest_date"].append(latest_date)
        recombinants_data["cluster_privates"].append(
            match_df["cluster_privates"].values[0]
        )
        recombinants_data["cov-spectrum_query"].append(
            match_df["cov-spectrum_query"].values[0]
        )
        # Immune stats
        recombinants_data["rbd_level"].append(rbd_level)
        recombinants_data["immune_escape"].append(immune_escape)
        recombinants_data["ace2_binding"].append(ace2_binding)

        geo_list = list(set(match_df[geo]))
        geo_list.sort()
        geo_counts = []
        for loc in geo_list:
            loc_df = match_df[match_df[geo] == loc]
            num_sequences = len(loc_df)
            geo_counts.append("{} ({})".format(loc, num_sequences))

        recombinants_data[geo].append(", ".join(geo_counts))

        # Growth calculation: sequences per day of circulation (inclusive)
        duration = (latest_date - earliest_date).days + 1
        growth_score = round(sequences / duration, 2)
        recombinants_data["growth_score"].append(growth_score)

    recombinants_df = pd.DataFrame(recombinants_data)
    recombinants_df.sort_values(by="sequences", ascending=False, inplace=True)
    recombinants_df.to_csv(output, index=False, sep="\t")


if __name__ == "__main__":
    main()
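The growth score computed above is simply sequences per day of circulation, with the duration made inclusive of both endpoint dates. A worked example with hypothetical values:

from datetime import date

# Hypothetical cluster: 12 sequences collected between March 1 and March 15
earliest_date = date(2022, 3, 1)
latest_date = date(2022, 3, 15)
sequences = 12

duration = (latest_date - earliest_date).days + 1  # 15 days, inclusive
growth_score = round(sequences / duration, 2)      # 0.8 sequences per day
print(growth_score)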
import click
import os
from Bio import Phylo
from Bio.Phylo.BaseTree import Clade
import requests
from pango_aliasor.aliasor import Aliasor

LINEAGES_URL = (
    "https://raw.githubusercontent.com/cov-lineages/pango-designation/master/"
    + "lineage_notes.txt"
)


@click.command()
@click.option("--output", help="Output newick phylogeny.", required=True)
def main(output):
    """Create a nomenclature tree of pango lineages."""

    # Create output directory if it doesn't exist
    outdir = os.path.dirname(output)
    if not os.path.exists(outdir) and outdir != "":
        os.mkdir(outdir)

    # -------------------------------------------------------------------------
    # Download Latest designated lineages from pango-designation

    print("Downloading list of lineages: {}".format(LINEAGES_URL))
    r = requests.get(LINEAGES_URL)
    lineage_text = r.text

    # Convert the text table to list
    lineages = []
    for line in lineage_text.split("\n"):
        if "Withdrawn" in line or line.startswith("Lineage"):
            continue

        lineage = line.split("\t")[0]
        if lineage == "":
            continue
        lineages.append(lineage)

    # Initialize the aliasor, which will download the latest aliases
    aliasor = Aliasor()

    # -------------------------------------------------------------------------
    # Construct Tree

    print("Constructing tree.")

    # Create a tree with a root node "MRCA"
    tree = Clade(name="MRCA", clades=[], branch_length=1)
    # Add an "X" parent for recombinants
    clade = Clade(name="X", clades=[], branch_length=1)
    tree.clades.append(clade)

    for lineage in lineages:

        # Identify the parent
        lineage_uncompress = aliasor.uncompress(lineage)
        parent_uncompress = ".".join(lineage_uncompress.split(".")[0:-1])
        parent = aliasor.compress(parent_uncompress)

        # Manual parents setting for A and B
        if lineage == "A":
            parent = "MRCA"

        elif lineage == "B":
            parent = "A"

        # Special handling for recombinants
        elif lineage.startswith("X") and parent == "":
            parent = "X"

        parent_clade = [c for c in tree.find_clades(parent)]
        # As long as the input list is formatted correctly,
        # we should always find exactly one parent clade
        if len(parent_clade) == 1:
            parent_clade = parent_clade[0]
            clade = Clade(name=lineage, clades=[], branch_length=1)
            parent_clade.clades.append(clade)

    # -------------------------------------------------------------------------
    # Export

    tree_outpath = os.path.join(output)
    print("Exporting tree: {}".format(tree_outpath))
    Phylo.write(tree, tree_outpath, "newick")


if __name__ == "__main__":
    main()
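The parent lookup above works because pango_aliasor translates between compressed aliases and full dotted names; a lineage's parent is just its uncompressed name with the final field trimmed. A sketch of that round trip (lineage names are illustrative; Aliasor() with no arguments downloads the latest alias key):

from pango_aliasor.aliasor import Aliasor

aliasor = Aliasor()

lineage = "BA.2.75"
uncompressed = aliasor.uncompress(lineage)  # "B.1.1.529.2.75"
parent_uncompressed = ".".join(uncompressed.split(".")[:-1])  # "B.1.1.529.2"
parent = aliasor.compress(parent_uncompressed)  # "BA.2"
print(parent)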
import click
import os
import pandas as pd
import copy
from datetime import datetime
import numpy as np
from functions import create_logger
from Bio import Phylo

# Hard-coded constants

NO_DATA_CHAR = "NA"

PIPELINE = "ncov-recombinant"

# There are alternative lineage names for sequencing dropouts
SC2RF_LINEAGE_ALT = {
    "XBB_ARTICv4.1": "XBB",
    "XAY_dropout": "XAY",
}

# Select and rename columns from summary
LINELIST_COLS = {
    "strain": "strain",
    "sc2rf_lineage": "lineage_sc2rf",
    "sc2rf_status": "status_sc2rf",
    "Nextclade_pango": "lineage_nextclade",
    "Nextclade_clade": "clade_nextclade",
    "sc2rf_parents": "parents_clade",
    "cov-spectrum_parents": "parents_lineage",
    "sc2rf_parents_conflict": "parents_conflict",
    "cov-spectrum_parents_confidence": "parents_lineage_confidence",
    "cov-spectrum_parents_subs": "parents_subs",
    "sc2rf_breakpoints": "breakpoints",
    "sc2rf_regions": "regions",
    "date": "date",
    "country": "country",
    "privateNucMutations.reversionSubstitutions": "subs_reversion",
    "privateNucMutations.unlabeledSubstitutions": "subs_unlabeled",
    "privateNucMutations.labeledSubstitutions": "subs_labeled",
    "substitutions": "subs",
    "rbd_level": "rbd_level",
    "rbd_substitutions": "rbd_substitutions",
    "immune_escape": "immune_escape",
    "ace2_binding": "ace2_binding",
    "ncov-recombinant_version": "ncov-recombinant_version",
    "nextclade_version": "nextclade_version",
    "nextclade_dataset": "nextclade_dataset",
}


@click.command()
@click.option("--input", help="Summary (tsv).", required=True)
@click.option(
    "--issues",
    help="pango-designation issues table",
    required=True,
    default="resources/issues.tsv",
)
@click.option(
    "--extra-cols",
    help="Extra columns (csv) to extract from the summary",
    required=False,
)
@click.option(
    "--outdir",
    help="Output directory for linelists",
    required=True,
)
@click.option(
    "--min-lineage-size",
    help="For lineage sizes larger than this, investigate -like status.",
    default=10,
    required=False,
)
@click.option(
    "--min-private-muts",
    help="If more than this number of private mutations, investigate -like status.",
    required=False,
    default=3,
)
@click.option(
    "--lineage-tree",
    help="Newick tree of pangolin lineage hierarchies",
    required=False,
)
@click.option("--log", help="Logfile", required=False)
def main(
    input,
    issues,
    extra_cols,
    outdir,
    log,
    min_lineage_size,
    min_private_muts,
    lineage_tree,
):
    """Create a linelist and recombinant report"""

    # -------------------------------------------------------------------------
    # Setup
    # -------------------------------------------------------------------------

    # create logger
    logger = create_logger(logfile=log)

    # Misc variables
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # Import Summary Dataframe
    logger.info("Parsing table: {}".format(input))
    summary_df = pd.read_csv(input, sep="\t")
    summary_df.fillna(NO_DATA_CHAR, inplace=True)

    # Import Issues Summary Dataframe
    logger.info("Parsing issues: {}".format(issues))
    issues_df = pd.read_csv(issues, sep="\t")
    issues_df.fillna(NO_DATA_CHAR, inplace=True)

    # (Optional) Extract columns from summary
    if extra_cols:
        for col in extra_cols.split(","):
            LINELIST_COLS[col] = col

    # (Optional) phylogenetic tree of pangolin lineages
    if lineage_tree:
        logger.info("Parsing lineage tree: {}".format(lineage_tree))
        tree = Phylo.read(lineage_tree, "newick")

    cols_list = list(LINELIST_COLS.keys())

    # Setup the linelist dataframe
    linelist_df = copy.deepcopy(summary_df[cols_list])
    linelist_df.rename(columns=LINELIST_COLS, inplace=True)

    # The nature of the rbd_level values (0, 1, ...) causes issues with the
    # dataframe later in this script. A bandaid solution is to convert to strings.
    linelist_df["rbd_level"] = [str(l) for l in list(linelist_df["rbd_level"])]

    # Initialize columns
    linelist_df.insert(1, "status", [NO_DATA_CHAR] * len(linelist_df))
    linelist_df.insert(2, "lineage", [NO_DATA_CHAR] * len(linelist_df))
    linelist_df.insert(3, "issue", [NO_DATA_CHAR] * len(linelist_df))
    linelist_df["privates"] = [NO_DATA_CHAR] * len(linelist_df)
    linelist_df["cov-spectrum_query"] = [NO_DATA_CHAR] * len(linelist_df)

    # -------------------------------------------------------------------------
    # Lineage Consensus
    # -------------------------------------------------------------------------

    # Use lineage calls by sc2rf and nextclade to classify recombinants status

    logger.info("Comparing lineage assignments across tools")

    for rec in linelist_df.iterrows():
        strain = rec[1]["strain"]
        status = rec[1]["status_sc2rf"]
        breakpoints = rec[1]["breakpoints"]

        issue = NO_DATA_CHAR
        is_recombinant = False
        lineage = NO_DATA_CHAR

        # ---------------------------------------------------------------------
        # Lineage Assignments (Designated)

        # Nextclade can only have one lineage assignment
        lineage_nextclade = rec[1]["lineage_nextclade"]

        # sc2rf can report multiple possible lineages that share the same breakpoint
        lineages_sc2rf = rec[1]["lineage_sc2rf"].split(",")
        status = rec[1]["status_sc2rf"]

        # Rename sc2rf lineages for alts (ex. ARTIC XBB)
        for i, l_sc2rf in enumerate(lineages_sc2rf):
            if l_sc2rf in SC2RF_LINEAGE_ALT:
                lineages_sc2rf[i] = SC2RF_LINEAGE_ALT[l_sc2rf]

        # Save a flag to indicate whether nextclade is sublineage of sc2rf
        nextclade_is_sublineage = False

        # Check if sc2rf confirmed it's a recombinant
        if status == "positive":
            is_recombinant = True

        # Case #1: By default use nextclade
        lineage = lineage_nextclade
        classifier = "nextclade"

        # Case #2: unless sc2rf found a definitive match that is not a "proposed"
        if (
            lineages_sc2rf[0] != NO_DATA_CHAR
            and len(lineages_sc2rf) == 1
            and not lineages_sc2rf[0].startswith("proposed")
        ):

            # Case #2a. nextclade is a sublineage of sc2rf
            if lineage_tree:
                sc2rf_lineage_tree = [c for c in tree.find_clades(lineages_sc2rf[0])]
                # Make sure we found this lineage in the tree
                if len(sc2rf_lineage_tree) == 1:
                    sc2rf_lineage_tree = sc2rf_lineage_tree[0]
                    sc2rf_lineage_children = [
                        c.name for c in sc2rf_lineage_tree.find_clades()
                    ]
                    # check if nextclade is sublineage of sc2rf
                    if lineage_nextclade in sc2rf_lineage_children:
                        nextclade_is_sublineage = True
                        # We don't need to update the lineage, since we
                        # use nextclade by default

            # Case #2b. nextclade is not a sublineage of sc2rf
            if not nextclade_is_sublineage:
                lineage = lineages_sc2rf[0]
                classifier = "sc2rf"

        # Case #1: designated recombinants called as false_positive
        # As of v0.4.0, sometimes XN and XP will be detected by sc2rf, but then labeled
        # as a false positive, since all parental regions are collapsed
        parents_clade = rec[1]["parents_clade"]
        if (lineage == "XN" or lineage == "XP") and parents_clade != NO_DATA_CHAR:
            status = "positive"
            is_recombinant = True

            # If we actually found breakpoints but no sc2rf lineage, this is a "-like"
            if breakpoints != NO_DATA_CHAR and lineages_sc2rf[0] == NO_DATA_CHAR:
                lineage = lineage + "-like"

        # Case #2: designated recombinants that sc2rf cannot detect
        # As of v0.4.2, XAS cannot be detected by sc2rf
        elif parents_clade == NO_DATA_CHAR and (lineage == "XAS"):
            status = "positive"
            is_recombinant = True

        # Case #3: nextclade thinks it's a recombinant but sc2rf doesn't
        elif lineage.startswith("X") and breakpoints == NO_DATA_CHAR:
            status = "false_positive"

        # Case #4: nextclade and sc2rf disagree, flag it as X*-like
        elif (
            len(lineages_sc2rf) >= 1
            and lineage.startswith("X")
            and lineage not in lineages_sc2rf
            and not nextclade_is_sublineage
        ):
            lineage = lineage + "-like"

        # ---------------------------------------------------------------------
        # Issue

        # Identify the possible pango-designation issue this is related to
        issues = []

        for lin in set(lineages_sc2rf + [lineage_nextclade]):
            # There are two weird examples in sc2rf lineages
            # which are proposed808-1 and proposed808-2
            # because two lineages got the same issue post
            # Transform proposed808-1 to proposed808
            if lin.startswith("proposed") and "-" in lin:
                lin = lin.split("-")[0]

            if lin in list(issues_df["lineage"]):
                match = issues_df[issues_df["lineage"] == lin]
                issue = match["issue"].values[0]
                issues.append(issue)

        if len(issues) >= 1:
            issue = ",".join([str(iss) for iss in issues])
        else:
            issue = NO_DATA_CHAR

        # ---------------------------------------------------------------------
        # Private Substitutions

        privates = []

        # The private subs are only informative if we're using the nextclade lineage
        if classifier == "nextclade":
            reversions = rec[1]["subs_reversion"].split(",")
            labeled = rec[1]["subs_labeled"].split(",")
            unlabeled = rec[1]["subs_unlabeled"].split(",")

            privates_dict = {}

            for sub in reversions + labeled + unlabeled:
                if sub == NO_DATA_CHAR:
                    continue
                # Labeled subs have special formatting
                if "|" in sub:
                    sub = sub.split("|")[0]
                coord = int(sub[1:-1])
                privates_dict[coord] = sub

            # Convert back to sorted list
            for coord in sorted(privates_dict):
                sub = privates_dict[coord]
                privates.append(sub)

        # ---------------------------------------------------------------------
        # Status

        # Fine-tune the status of a positive recombinant
        if is_recombinant:
            if lineage.startswith("X") and not lineage.endswith("like"):
                status = "designated"
            elif issue != NO_DATA_CHAR:
                status = "proposed"
            else:
                status = "unpublished"

        # ---------------------------------------------------------------------
        # Update Dataframe

        linelist_df.at[rec[0], "lineage"] = lineage
        linelist_df.at[rec[0], "status"] = str(status)
        linelist_df.at[rec[0], "issue"] = str(issue)
        linelist_df.at[rec[0], "privates"] = privates

    # -------------------------------------------------------------------------
    # Lineage Assignment (Undesignated)
    # -------------------------------------------------------------------------

    # Group sequences into potential lineages that have a unique combination of
    #   1. Lineage
    #   2. Parent clades (sc2rf)
    #   3. Parent lineages (sc2rf, lapis cov-spectrum)
    #   4. Breakpoints (sc2rf)
    #   5. Private substitutions (nextclade)

    logger.info("Grouping sequences into lineages.")

    # Create a dictionary of recombinant lineages seen
    rec_seen = {}
    seen_i = 0

    for rec in linelist_df.iterrows():

        strain = rec[1]["strain"]
        status = rec[1]["status"]

        if status == "negative" or status == "false_positive":
            continue

        # 1. Lineage assignment (nextclade or sc2rf)
        # 2. Parents by clade (ex. 21K,21L)
        # 3. Parents by lineage (ex. BA.1.1,BA.2.3)
        # 4. Breakpoints
        lineage = rec[1]["lineage"]
        parents_clade = rec[1]["parents_clade"]
        parents_lineage = rec[1]["parents_lineage"]
        breakpoints = rec[1]["breakpoints"]
        privates = rec[1]["privates"]

        # in v0.5.1, we used the parents subs
        # Format substitutions into a tidy list
        # "C234T,A54354G|Omicron/BA.1/21K;A423T|Omicron/BA.2/21L"
        # parents_subs_raw = rec[1]["parents_subs"].split(";")
        # # ["C234T,A54354G|Omicron/BA.1/21K", "A423T|Omicron/BA.2/21L"]
        # parents_subs_csv = [sub.split("|")[0] for sub in parents_subs_raw]
        # # ["C234T,A54354G", "A423T"]
        # parents_subs_str = ",".join(parents_subs_csv)
        # # "C234T,A54354G,A423T"
        # parents_subs_list = parents_subs_str.split(",")
        # # ["C234T","A54354G","A423T"]

        # in v0.5.2, we use all subs
        # "C241T,A385G,G407A,..."
        subs_list = rec[1]["subs"].split(",")
        # ["C241T","A385G","G407A", ...]

        match = None

        for i in rec_seen:
            rec_lin = rec_seen[i]
            # lineage, parents, and breakpoints have to match
            if (
                rec_lin["lineage"] == lineage
                and rec_lin["parents_clade"] == parents_clade
                and rec_lin["parents_lineage"] == parents_lineage
                and rec_lin["breakpoints"] == breakpoints
            ):
                match = i
                break

        # If we found a match, increment our dict
        if match is not None:
            # Add the strain to this lineage
            rec_seen[match]["strains"].append(strain)

            # Adjust the cov-spectrum subs to include the new strain

            # in v0.5.1, cov-spectrum_query was based only on parental subs (pre-recomb)
            # See issue #180: https://github.com/ktmeaton/ncov-recombinant/issues/180
            # lineage_parents_subs = rec_seen[match]["cov-spectrum_query"]
            # for sub in lineage_parents_subs:
            #    if sub not in parents_subs_list:
            #        rec_seen[match]["cov-spectrum_query"].remove(sub)

            # in v0.5.2, cov-spectrum_query is based on all subs
            # Iterate over a copy, since removing from a list while iterating
            # over it skips elements
            subs = rec_seen[match]["cov-spectrum_query"]
            for sub in list(subs):
                if sub not in subs_list:
                    subs.remove(sub)

            # Adjust the private subs to include the new strain
            lineage_private_subs = rec_seen[match]["privates"]
            for sub in list(lineage_private_subs):
                if sub not in privates:
                    lineage_private_subs.remove(sub)

        # This is the first appearance, initialize values
        else:
            rec_seen[seen_i] = {
                "lineage": lineage,
                "breakpoints": breakpoints,
                "parents_clade": parents_clade,
                "parents_lineage": parents_lineage,
                "strains": [strain],
                "cov-spectrum_query": subs_list,
                "privates": privates,
            }
            seen_i += 1

    # -------------------------------------------------------------------------
    # Cluster ID
    # Assign an id to each lineage based on the first sequence collected

    logger.info("Assigning cluster IDs to lineages.")

    linelist_df["cluster_id"] = [NO_DATA_CHAR] * len(linelist_df)
    linelist_df["cluster_privates"] = [NO_DATA_CHAR] * len(linelist_df)

    for i in rec_seen:
        rec_strains = rec_seen[i]["strains"]
        rec_privates = ",".join(rec_seen[i]["privates"])
        rec_df = linelist_df[linelist_df["strain"].isin(rec_strains)]

        # The cluster ID is the strain with the earliest collection date
        earliest_datetime = min(rec_df["date"])
        earliest_strain = rec_df[rec_df["date"] == earliest_datetime]["strain"].values[
            0
        ]
        subs_query = ",".join(rec_seen[i]["cov-spectrum_query"])

        # indices are preserved from the original linelist_df
        for strain in rec_strains:
            strain_i = rec_df[rec_df["strain"] == strain].index[0]

            linelist_df.loc[strain_i, "cluster_id"] = earliest_strain
            linelist_df.loc[strain_i, "cov-spectrum_query"] = subs_query
            linelist_df.loc[strain_i, "cluster_privates"] = rec_privates

    # -------------------------------------------------------------------------
    # Mimics

    logger.info("Checking for mimics with too many private mutations.")
    # Check for designated lineages that have too many private mutations
    # This may indicate this is a novel lineage
    for i in rec_seen:
        lineage = rec_seen[i]["lineage"]
        rec_strains = rec_seen[i]["strains"]
        lineage_size = len(rec_strains)
        num_privates = len(rec_seen[i]["privates"])

        # Mark these lineages as "X*-like"
        if (
            lineage.startswith("X")
            and not lineage.endswith("like")
            and lineage_size >= min_lineage_size
            and num_privates >= min_private_muts
        ):
            lineage = lineage + "-like"
            rec_rename = linelist_df["strain"].isin(rec_strains)
            linelist_df.loc[rec_rename, "lineage"] = lineage
            linelist_df.loc[rec_rename, "status"] = "proposed"

    # -------------------------------------------------------------------------
    # Assign status and curated lineage

    logger.info("Assigning curated recombinant lineages.")

    for rec in linelist_df.iterrows():
        lineage = rec[1]["lineage"]
        status = rec[1]["status"]
        cluster_id = rec[1]["cluster_id"]

        # By default use cluster ID for curated lineage
        linelist_df.at[rec[0], "recombinant_lineage_curated"] = cluster_id

        if status == "negative":
            linelist_df.at[rec[0], "recombinant_lineage_curated"] = "negative"

        elif status == "false_positive":
            linelist_df.at[rec[0], "recombinant_lineage_curated"] = "false_positive"

        # If designated or "-like", override with actual lineage
        elif status == "designated" or lineage.endswith("like"):
            linelist_df.at[rec[0], "recombinant_lineage_curated"] = lineage

    # -------------------------------------------------------------------------
    # Pipeline Versions
    logger.info("Recording pipeline versions.")

    pipeline_ver = linelist_df["ncov-recombinant_version"].values[0]
    linelist_df.loc[linelist_df.index, "pipeline_version"] = "{pipeline}:{ver}".format(
        pipeline=PIPELINE, ver=pipeline_ver
    )
    nextclade_ver = linelist_df["nextclade_version"].values[0]
    nextclade_dataset = linelist_df["nextclade_dataset"].values[0]
    linelist_df.loc[
        linelist_df.index, "recombinant_classifier_dataset"
    ] = "nextclade:{ver}:{dataset}".format(
        ver=nextclade_ver,
        dataset=nextclade_dataset,
    )

    # -------------------------------------------------------------------------
    # Save to File

    logger.info("Sorting output tables.")

    # Drop Unnecessary columns
    linelist_df.drop(
        columns=[
            "status_sc2rf",
            "clade_nextclade",
            "subs_reversion",
            "subs_labeled",
            "subs_unlabeled",
            "subs",
        ],
        inplace=True,
    )

    # Convert privates from list to csv
    linelist_df["privates"] = [",".join(p) for p in linelist_df["privates"]]

    # Recode NA
    linelist_df.fillna(NO_DATA_CHAR, inplace=True)

    # Sort
    status_order = {
        "designated": 0,
        "proposed": 1,
        "unpublished": 2,
        "false_positive": 3,
        "negative": 4,
    }

    # Change empty cells to NaN so they'll be sorted to the end
    linelist_df = linelist_df.replace({"lineage": "", "status": ""}, np.nan)
    linelist_df.sort_values(
        [
            "status",
            "lineage",
        ],
        inplace=True,
        key=lambda x: x.map(status_order),
    )

    # All
    outpath = os.path.join(outdir, "linelist.tsv")
    logger.info("Writing output: {}".format(outpath))
    linelist_df.to_csv(outpath, sep="\t", index=False)

    # Positives
    positive_df = linelist_df[
        (linelist_df["status"] != "false_positive")
        & (linelist_df["status"] != "negative")
    ]
    outpath = os.path.join(outdir, "positives.tsv")
    logger.info("Writing output: {}".format(outpath))
    positive_df.to_csv(outpath, sep="\t", index=False)

    # False Positives
    false_positive_df = linelist_df[linelist_df["status"] == "false_positive"]
    outpath = os.path.join(outdir, "false_positives.tsv")
    logger.info("Writing output: {}".format(outpath))
    false_positive_df.to_csv(outpath, sep="\t", index=False)

    # Negatives
    negative_df = linelist_df[linelist_df["status"] == "negative"]
    outpath = os.path.join(outdir, "negatives.tsv")
    logger.info("Writing output: {}".format(outpath))
    negative_df.to_csv(outpath, sep="\t", index=False)


if __name__ == "__main__":
    main()
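The final sort relies on pandas applying the key function to each sort column separately: statuses map to numeric ranks, while anything outside the map (including every lineage value) becomes NaN and sorts to the end; with multi-column sorts pandas uses a stable lexsort, so ties keep their prior order. A small sketch with made-up rows:

import numpy as np
import pandas as pd

status_order = {"designated": 0, "proposed": 1, "unpublished": 2}

df = pd.DataFrame(
    {
        "status": ["unpublished", "designated", "proposed", np.nan],
        "lineage": ["XC", "XA", "XB", "XD"],
    }
)

# The key is applied column-by-column; unmapped values become NaN
df = df.sort_values(["status", "lineage"], key=lambda x: x.map(status_order))
print(df)  # designated, proposed, unpublished, then the missing status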
import click
import os
import pandas as pd
from datetime import datetime, date

# Hard-coded constants

NO_DATA_CHAR = "NA"


# Select and rename columns from linelist
PARENTS_COLS = [
    "parents_clade",
    "sequences",
    "earliest_date",
    "latest_date",
]


@click.command()
@click.option(
    "--input", help="Input file of recombinant sequences (tsv).", required=True
)
@click.option(
    "--output", help="Output file of recombinant lineages (tsv)", required=True
)
def main(
    input,
    output,
):
    """Create a table of recombinant sequences by parent"""

    # -------------------------------------------------------------------------
    # Setup
    # -------------------------------------------------------------------------

    # Misc variables
    outdir = os.path.dirname(output)
    if outdir != "" and not os.path.exists(outdir):
        os.mkdir(outdir)

    df = pd.read_csv(input, sep="\t")
    df.fillna(NO_DATA_CHAR, inplace=True)

    # Issue #168 NULL dates are allowed
    # Set to today instead
    # https://github.com/ktmeaton/ncov-recombinant/issues/168
    seq_date = []
    for d in list(df["date"]):
        try:
            d = datetime.strptime(d, "%Y-%m-%d").date()
        except ValueError:
            # Set NA dates to today
            d = date.today()

        seq_date.append(d)
    df["datetime"] = seq_date

    # -------------------------------------------------------------------------
    # Create the parents table (parents.tsv)
    # -------------------------------------------------------------------------

    data = {col: [] for col in PARENTS_COLS}

    for parents_clade in set(df["parents_clade"]):

        match_df = df[df["parents_clade"] == parents_clade]

        if parents_clade == NO_DATA_CHAR:
            parents_clade = "Unknown"

        earliest_date = min(match_df["datetime"])
        latest_date = max(match_df["datetime"])
        sequences = len(match_df)

        data["parents_clade"].append(parents_clade)
        data["sequences"].append(sequences)
        data["earliest_date"].append(earliest_date)
        data["latest_date"].append(latest_date)

    parents_df = pd.DataFrame(data)
    parents_df.sort_values(by="sequences", ascending=False, inplace=True)

    parents_df.to_csv(output, index=False, sep="\t")


if __name__ == "__main__":
    main()
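Both of the tables above reuse the same fallback for NULL collection dates (issue #168): parse ISO dates and substitute today's date when parsing fails. A sketch of that pattern as a standalone helper; parse_date is hypothetical and also catches TypeError, to be safe with non-string values:

from datetime import date, datetime

def parse_date(value, fmt="%Y-%m-%d"):
    # Parse a collection date, falling back to today for NA/blank values
    try:
        return datetime.strptime(value, fmt).date()
    except (ValueError, TypeError):
        return date.today()

print(parse_date("2022-03-01"))  # 2022-03-01
print(parse_date("NA"))          # today's date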
import click
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patches, colors, lines, collections
from functions import categorical_palette
import math

NO_DATA_CHAR = "NA"
# This is the aspect ratio/dpi for ppt embeds
# The dimensions are set in the powerpoint template (resources/template.pptx)
DPI = 96 * 2
FIGSIZE = [6.75, 5.33]
FIG_Y_PER_LINEAGE = 1

# Breakpoint Plotting
GENOME_LENGTH = 29903
X_BUFF = 1000
BREAKPOINT_COLOR = "lightgrey"
UNKNOWN_COLOR = "dimgrey"
COORD_ITER = 5000

# This is the number of characters that can fit width-wise across the legend
LEGEND_FONTSIZE = 6
LEGEND_CHAR_WIDTH = 90
# The maximum number of columns in the legend is dictated by the char width,
# but more importantly, in the categorical_palette function, we restrict it to
# the first 5 colors of the tab10 palette and make different shades within it
LEGEND_MAX_COL = 5

# Show the first N char of the label in the plot
LEGEND_LABEL_MAX_LEN = 15

# Select and rename columns from linelist
LINEAGES_COLS = [
    "cluster_id",
    "status",
    "lineage",
    "parents_clade",
    "parents_lineage",
    "breakpoints",
    "issue",
    "sequences",
    "growth_score",
    "earliest_date",
    "latest_date",
]

plt.rcParams["svg.fonttype"] = "none"


@click.command()
@click.option("--lineages", help="Recombinant lineages (tsv)", required=True)
@click.option("--outdir", help="Output directory", required=False, default=".")
@click.option(
    "--lineage-col", help="Column name of lineage", required=False, default="lineage"
)
@click.option(
    "--breakpoint-col",
    help="Column name of breakpoints",
    required=False,
    default="breakpoints",
)
@click.option(
    "--parent-col", help="Column name of parents", required=False, default="parents"
)
@click.option(
    "--parent-type",
    help="Parent type (ex. clade, lineage)",
    required=False,
    default="clade",
)
@click.option(
    "--cluster-col", help="Column name of cluster ID (ex. cluster_id)", required=False
)
@click.option(
    "--clusters", help="Restrict plotting to only these cluster IDs", required=False
)
@click.option(
    "--figsize",
    help="Figure dimensions as h,w in inches (ex. 6.75,5.33)",
    default=",".join([str(d) for d in FIGSIZE]),
)
@click.option(
    "--autoscale",
    help="Autoscale plot dimensions to the number of lineages (overrides --figsize)",
    is_flag=True,
)
@click.option(
    "--positives",
    help="Table of positive recombinants",
    required=False,
)
def main(
    lineages,
    outdir,
    lineage_col,
    parent_col,
    breakpoint_col,
    cluster_col,
    clusters,
    parent_type,
    autoscale,
    figsize,
    positives,
):
    """Plot recombinant lineage breakpoints"""

    # Create output directory if it doesn't exist
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # -------------------------------------------------------------------------
    # Import dataframes
    lineages_df = pd.read_csv(lineages, sep="\t")
    lineages_df.fillna(NO_DATA_CHAR, inplace=True)

    if cluster_col:

        lineages_df[cluster_col] = [str(c) for c in lineages_df[cluster_col]]
        # Filter dataframe on cluster IDs
        if clusters:
            clusters_list = clusters.split(",")
            lineages_df = lineages_df[lineages_df[cluster_col].isin(clusters_list)]

        # Which lineages have the same name but different cluster ID?
        lineages_seen = []
        lineages_dup = []
        for lineage in list(lineages_df[lineage_col]):
            if lineage not in lineages_seen:
                lineages_seen.append(lineage)
            else:
                lineages_dup.append(lineage)

    else:
        lineages_dup = []

    if positives:
        positives_df = pd.read_csv(positives, sep="\t")
        positives_df["strain"] = [str(s) for s in positives_df["strain"]]
        if cluster_col:
            positives_df[cluster_col] = [str(s) for s in positives_df[cluster_col]]
        positives_df.fillna(NO_DATA_CHAR, inplace=True)

    # Figsize format back to list
    figsize = [float(d) for d in figsize.split(",")]

    # -------------------------------------------------------------------------

    # Create a dataframe to hold plot data
    # Lineage (y axis, categorical)
    # Coordinate (x axis, numeric)

    breakpoints_data = {
        "lineage": [],
        "parent": [],
        "start": [],
        "end": [],
    }

    parents_colors = {}

    # -------------------------------------------------------------------------
    # Create a dataframe to hold plot data

    # Iterate through lineages
    for rec in lineages_df.iterrows():
        lineage = rec[1][lineage_col]

        if cluster_col:
            cluster_id = rec[1][cluster_col]
            lineage = "{} {}".format(lineage, cluster_id)

        parents = rec[1][parent_col]
        breakpoints = rec[1][breakpoint_col]
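        # Expected string formats (values illustrative): parents like "21K,21L",
        # breakpoints like "17410:22577" (comma-separated "start:end" coordinates)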

        parents_split = parents.split(",")
        breakpoints_split = breakpoints.split(",")

        prev_start_coord = 0

        # Iterate through the parents
        for i in range(0, len(parents_split)):
            parent = parents_split[i]

            # Convert NA parents to Unknown
            if parent == NO_DATA_CHAR:
                parent = "Unknown"
            if parent not in parents_colors:
                parents_colors[parent] = ""

            if i < (len(parents_split) - 1):
                breakpoint = breakpoints_split[i]
                # Check that we actually found a breakpoint
                if ":" not in breakpoint:
                    continue

                breakpoint_start_coord = int(breakpoint.split(":")[0])
                breakpoint_end_coord = int(breakpoint.split(":")[1])

                # Give this coordinate to both parents
                parent_next = parents_split[i + 1]
                if parent_next == NO_DATA_CHAR:
                    parent_next = "Unknown"

                start_coord = prev_start_coord
                end_coord = int(breakpoint.split(":")[0]) - 1
                # Update start coord
                prev_start_coord = int(breakpoint.split(":")[1]) + 1

                # Add record for breakpoint
                breakpoints_data["lineage"].append(lineage)
                breakpoints_data["parent"].append("breakpoint")
                breakpoints_data["start"].append(breakpoint_start_coord)
                breakpoints_data["end"].append(breakpoint_end_coord)

            else:
                start_coord = prev_start_coord
                end_coord = GENOME_LENGTH

            # Add record for parent
            breakpoints_data["lineage"].append(lineage)
            breakpoints_data["parent"].append(parent)
            breakpoints_data["start"].append(start_coord)
            breakpoints_data["end"].append(end_coord)

    # Convert the dictionary to a dataframe
    breakpoints_df = pd.DataFrame(breakpoints_data)

    # Sort by coordinates
    breakpoints_df.sort_values(by=["parent", "start", "end"], inplace=True)

    # -------------------------------------------------------------------------
    # Colors

    # Pick a palette based on the number of parents
    parents_palette = categorical_palette(num_cat=len(parents_colors))

    i = 0

    for parent in parents_colors:
        if parent == "Unknown":
            # We'll sort this out after, so it will be at the end
            continue

        color_rgb = parents_palette[i]
        color = colors.to_hex(color_rgb)
        parents_colors[parent] = color
        i += 1

    # Reorder unknown parents to the end
    if "Unknown" in parents_colors:
        del parents_colors["Unknown"]
        parents_colors["Unknown"] = UNKNOWN_COLOR
    parents_colors["breakpoint"] = BREAKPOINT_COLOR

    # -------------------------------------------------------------------------
    # Plot Setup

    # When dynamically setting the aspect ratio, the height is
    # multiplied by the number of lineages
    num_lineages = len(set(list(breakpoints_df["lineage"])))
    if autoscale:
        # This is the minimum size it can be for 1 lineage with two parents
        figsize = [figsize[0], 2]
        # Adjusted for larger numbers of lineages; mirrors the fontsize selection later
        if num_lineages >= 40:
            figsize = [figsize[0], 0.1 * num_lineages]
        elif num_lineages >= 30:
            figsize = [figsize[0], 0.2 * num_lineages]
        elif num_lineages >= 20:
            figsize = [figsize[0], 0.3 * num_lineages]
        elif num_lineages >= 10:
            figsize = [figsize[0], 0.5 * num_lineages]
        elif num_lineages > 1:
            figsize = [figsize[0], 1 * num_lineages]

    fig, ax = plt.subplots(
        1,
        1,
        dpi=DPI,
        figsize=figsize,
    )

    # -------------------------------------------------------------------------
    # Plot Breakpoint Regions

    rect_height = 1
    start_y = 0
    y_buff = 1
    y = start_y
    y_increment = rect_height + y_buff
    y_tick_locs = []
    y_tick_labs_lineage = []

    num_lineages = len(set(breakpoints_df["lineage"]))
    lineages_seen = []

    # Iterate through lineages to plot
    for rec in breakpoints_df.iterrows():
        lineage = rec[1]["lineage"]
        if lineage in lineages_seen:
            continue
        lineages_seen.append(lineage)

        y_tick_locs.append(y + (rect_height / 2))
        lineage_label = lineage.split(" ")[0]
        if cluster_col:

            cluster_id_label = lineage.split(" ")[1]

        # Non-unique, use cluster ID in y axis
        if lineage_label in lineages_dup:
            ylabel = "{} ({})".format(lineage_label, cluster_id_label)
        else:
            ylabel = lineage_label

        y_tick_labs_lineage.append(ylabel)

        lineage_df = breakpoints_df[breakpoints_df["lineage"] == lineage]

        # Iterate through regions to plot
        for rec in lineage_df.iterrows():
            parent = rec[1]["parent"]
            start = rec[1]["start"]
            end = rec[1]["end"]

            color = parents_colors[parent]

            region_rect = patches.Rectangle(
                xy=[start, y],
                width=end - start,
                height=rect_height,
                linewidth=1,
                edgecolor="none",
                facecolor=color,
            )
            ax.add_patch(region_rect)

        # Iterate through substitutions to plot
        if positives:
            positive_rec = positives_df[(positives_df[lineage_col] == lineage_label)]
            # If we're using a cluster id col, further filter on that
            if cluster_col:
                positive_rec = positive_rec[
                    (positive_rec[cluster_col] == cluster_id_label)
                ]

            # Extract the substitutions, just taking the first as representative
            cov_spectrum_subs = list(positive_rec["cov-spectrum_query"])[0]

            if cov_spectrum_subs != NO_DATA_CHAR:
                # Split into a list, and extract coordinates
                coord_list = [int(s[1:-1]) for s in cov_spectrum_subs.split(",")]
                subs_lines = []
                # Create vertical bars for each sub
                for coord in coord_list:
                    sub_line = [(coord, y), (coord, y + rect_height)]
                    subs_lines.append(sub_line)
                # Combine all bars into a collection
                collection_subs = collections.LineCollection(
                    subs_lines, color="black", linewidth=0.25
                )
                # Add the subs bars to the plot
                ax.add_collection(collection_subs)

        # Jump to the next y coordinate
        y -= y_increment

    # Axis Limits
    ax.set_xlim(0 - X_BUFF, GENOME_LENGTH + X_BUFF)
    ax.set_ylim(
        0 - ((num_lineages * y_increment) - (y_increment / 2)),
        0 + (rect_height * 2),
    )

    # This is the default fontsize to use
    y_tick_fontsize = 10
    if num_lineages >= 20:
        y_tick_fontsize = 8
    if num_lineages >= 30:
        y_tick_fontsize = 6
    if num_lineages >= 40:
        y_tick_fontsize = 4

    # Axis ticks
    ax.set_yticks(y_tick_locs)

    # Truncate long labels
    for i in range(0, len(y_tick_labs_lineage)):

        y_label = y_tick_labs_lineage[i]

        if len(y_label) > LEGEND_LABEL_MAX_LEN:
            y_label = y_label[0:LEGEND_LABEL_MAX_LEN]
            if "(" in y_label and ")" not in y_label:
                y_label = y_label + "...)"
            else:
                y_label = y_label + "..."

        y_tick_labs_lineage[i] = y_label

    ax.set_yticklabels(y_tick_labs_lineage, fontsize=y_tick_fontsize)

    # Axis Labels
    ax.set_ylabel("Lineage")
    ax.set_xlabel("Genomic Coordinate")

    # -------------------------------------------------------------------------
    # Manually create legend

    legend_handles = []
    legend_labels = []

    for parent in parents_colors:
        handle = lines.Line2D([0], [0], color=parents_colors[parent], lw=4)
        legend_handles.append(handle)
        if parent == "breakpoint":
            legend_labels.append(parent.title())
        else:
            legend_labels.append(parent)

    subs_handle = lines.Line2D(
        [],
        [],
        color="black",
        marker="|",
        linestyle="None",
        markersize=10,
        markeredgewidth=1,
    )
    legend_handles.append(subs_handle)
    legend_labels.append("Substitution")

    # Dynamically set the number of columns in the legend based on
    # how much space the labels will take up (in characters)
    max_char_len = 0
    for label in legend_labels:
        if len(label) >= max_char_len:
            max_char_len = len(label)

    legend_ncol = math.floor(LEGEND_CHAR_WIDTH / max_char_len)
    # we don't want too many columns, and matplotlib needs at least one
    if legend_ncol > LEGEND_MAX_COL:
        legend_ncol = LEGEND_MAX_COL
    elif legend_ncol > len(parents_colors):
        legend_ncol = len(parents_colors)
    elif legend_ncol == 0:
        legend_ncol = 1

    legend = ax.legend(
        handles=legend_handles,
        labels=legend_labels,
        title=parent_type.title(),
        edgecolor="black",
        fontsize=LEGEND_FONTSIZE,
        ncol=legend_ncol,
        loc="lower center",
        mode="expand",
        bbox_to_anchor=(0, 1.02, 1, 0.2),
        borderaxespad=0,
        borderpad=1,
    )

    legend.get_frame().set_linewidth(1)
    legend.get_title().set_fontweight("bold")
    # legend.get_frame().set_boxstyle("Square", pad=0.2)

    # -------------------------------------------------------------------------
    # Export

    # plt.suptitle("Recombination Breakpoints by Lineage")
    if num_lineages > 0:
        plt.tight_layout()

    outpath = os.path.join(outdir, "breakpoints_{}".format(parent_type))
    breakpoints_df.to_csv(outpath + ".tsv", sep="\t", index=False)
    plt.savefig(outpath + ".png")
    plt.savefig(outpath + ".svg")


if __name__ == "__main__":
    main()
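A minimal usage sketch for the breakpoint plotting script above. It assumes the snippet is saved as plot_breakpoints.py next to the pipeline's repo-local functions module; the file name and all toy values below are illustrative, not real lineage data.

import pandas as pd
from click.testing import CliRunner
from plot_breakpoints import main  # hypothetical file name for the snippet above

# Toy linelist: one recombinant with two parents and one breakpoint region
toy = pd.DataFrame(
    {
        "lineage": ["XA"],
        "parents": ["20I,20E"],          # comma-separated parent clades
        "breakpoints": ["21255:21764"],  # comma-separated "start:end" coordinates
    }
)
toy.to_csv("lineages.tsv", sep="\t", index=False)

# Run the click command in-process; writes breakpoints_clade.{tsv,png,svg}
result = CliRunner().invoke(
    main, ["--lineages", "lineages.tsv", "--outdir", "plots", "--autoscale"]
)
print(result.exit_code, result.output)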
Snippet: plot recombinant lineages over time (by epiweek)
import click
import os
import pandas as pd
import numpy as np
import epiweeks
import matplotlib.pyplot as plt
from matplotlib import patches, colors
from datetime import datetime, timedelta, date
import sys
import copy
from functions import categorical_palette
import math
import warnings
from functions import create_logger
import logging

logging.getLogger("matplotlib.font_manager").disabled = True

warnings.filterwarnings("error")

NO_DATA_CHAR = "NA"
ALPHA_LAG = 0.25
ALPHA_BAR = 1.00
WIDTH_BAR = 0.75
EPIWEEK_MAX_BUFF_FACTOR = 1.1
# This is the aspect ratio/dpi for ppt embeds
# The dimensions are set in the powerpoint template (resources/template.pptx)
DPI = 96 * 2
FIGSIZE = [6.75, 5.33]
# The maximum number of epiweeks we can comfortably plot, otherwise
# the figsize needs to be upscaled
FIGSIZE_MAX_WEEKS = 65

UNKNOWN_COLOR = "dimgrey"
UNKNOWN_RGB = colors.to_rgb(UNKNOWN_COLOR)

# This is the number of characters that can fit width-wise across the legend
LEGEND_FONTSIZE = 6
LEGEND_CHAR_WIDTH = 90
# The maximum number of columns in the legend is dictated by the char width, but
# more importantly, in the categorical_palette function, we restrict it to the
# first 5 colors of the tab10 palette, and make different shades within it
LEGEND_MAX_COL = 5

# Show the first N char of the label in the plot
LEGEND_LABEL_MAX_LEN = 15

# This is the maximum number of key RBD mutations.
MAX_RBD_LEVEL = 12

# Select and rename columns from linelist
LINEAGES_COLS = [
    "cluster_id",
    "status",
    "lineage",
    "parents_clade",
    "parents_lineage",
    "breakpoints",
    "issue",
    "sequences",
    "growth_score",
    "earliest_date",
    "latest_date",
]

plt.rcParams["svg.fonttype"] = "none"


@click.command()
@click.option("--input", help="Recombinant sequences (TSV)", required=True)
@click.option("--outdir", help="Output directory", required=False, default=".")
@click.option(
    "--weeks",
    help="Number of weeks in retrospect to plot",
    required=False,
)
@click.option(
    "--min-date",
    help="Ignore sequences before this date (yyyy-mm-dd, overrides --weeks)",
    required=False,
)
@click.option("--max-date", help="Ignore sequences after this date", required=False)
@click.option(
    "--geo",
    help="Geography column to use when plotting",
    required=False,
    default="country",
)
@click.option(
    "--lag", help="Reporting lag weeks to draw a grey box", required=False, default=4
)
@click.option(
    "--min-cluster-size",
    help="Only plot clusters/lineages with at least this many sequences.",
    default=1,
)
@click.option("--log", help="Output log file.", required=False)
def main(
    input,
    outdir,
    weeks,
    geo,
    lag,
    min_date,
    max_date,
    min_cluster_size,
    log,
):
    """Plot recombinant lineages"""

    # create logger
    logger = create_logger(logfile=log)

    # Create output directory if it doesn't exist
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # -------------------------------------------------------------------------
    # Import dataframes
    logger.info("Importing linelist: {}".format(input))
    df = pd.read_csv(input, sep="\t")
    df.fillna(NO_DATA_CHAR, inplace=True)

    # Issue #168 NULL dates are allowed
    # Set to today instead
    # https://github.com/ktmeaton/ncov-recombinant/issues/168
    seq_date = []
    for d in list(df["date"]):
        try:
            d = datetime.strptime(d, "%Y-%m-%d").date()
        except ValueError:
            # Set NA dates to today
            d = date.today()

        seq_date.append(d)
    df["datetime"] = seq_date

    df["year"] = [d.year for d in df["datetime"]]
    df["epiweek"] = [
        epiweeks.Week.fromdate(d, system="cdc").startdate() for d in df["datetime"]
    ]

    # Filter to the reporting period
    logger.info("Filtering on dates")
    if max_date:
        max_datetime = datetime.strptime(max_date, "%Y-%m-%d")
        max_epiweek = epiweeks.Week.fromdate(max_datetime, system="cdc").startdate()
    else:
        max_epiweek = epiweeks.Week.fromdate(datetime.today(), system="cdc").startdate()

    if min_date:
        min_datetime = datetime.strptime(min_date, "%Y-%m-%d")
        min_epiweek = epiweeks.Week.fromdate(min_datetime, system="cdc").startdate()
    elif weeks:
        weeks = int(weeks)
        min_epiweek = max_epiweek - timedelta(weeks=(weeks - 1))
    elif len(df) == 0:
        # Just need something for empty plots
        min_epiweek = max_epiweek - timedelta(weeks=16)
    else:
        min_epiweek = epiweeks.Week.fromdate(
            min(df["epiweek"]), system="cdc"
        ).startdate()

    weeks = int((max_epiweek - min_epiweek).days / 7)

    # Dummy data outside of plotting range when empty
    one_week_prev = min_epiweek - timedelta(weeks=1)

    df = copy.deepcopy(
        df[(df["epiweek"] >= min_epiweek) & (df["epiweek"] <= max_epiweek)]
    )

    # Change status to title case
    df["status"] = [s.title() for s in df["status"]]

    # Get largest lineage
    largest_lineage = NO_DATA_CHAR
    largest_lineage_size = 0

    # Record all cluster sizes to decide which should be dropped
    drop_small_clusters_ids = []

    for lineage in set(df["recombinant_lineage_curated"]):
        match_df = df[df["recombinant_lineage_curated"] == lineage]
        lineage_size = len(match_df)
        if lineage_size >= largest_lineage_size:
            largest_lineage = match_df["recombinant_lineage_curated"].values[0]
            largest_lineage_size = lineage_size

        if lineage_size < min_cluster_size:
            for i in match_df.index:
                drop_small_clusters_ids.append(i)

    df.drop(labels=drop_small_clusters_ids, axis="rows", inplace=True)

    df["parents_clade"] = df["parents_clade"].fillna("Unknown")
    df["parents_lineage"] = df["parents_lineage"].fillna("Unknown")

    # -------------------------------------------------------------------------
    # Pivot Tables
    # -------------------------------------------------------------------------

    logger.info("Creating pivot tables")
    # -------------------------------------------------------------------------
    # All
    all_df = pd.pivot_table(
        data=df.sort_values(by="epiweek"),
        values="strain",
        index=["epiweek"],
        aggfunc="count",
    )
    all_df.index.name = None
    all_df.fillna(0, inplace=True)
    all_df.insert(0, "epiweek", all_df.index)
    # Check if it was empty
    if len(all_df) == 0:
        all_df.insert(1, "sequences", [])
        max_epiweek_sequences = 0
    else:
        all_df.rename(columns={"strain": "sequences"}, inplace=True)
        max_epiweek_sequences = max(all_df["sequences"])

    # Attempt to dynamically create pivot tables
    plot_dict = {
        "lineage": {
            "legend_title": "lineage",
            "cols": ["recombinant_lineage_curated"],
            "y": "recombinant_lineage_curated",
        },
        "status": {"legend_title": "status", "cols": ["status"]},
        "geography": {"legend_title": geo, "cols": [geo]},
        "largest": {
            "legend_title": geo,
            "cols": [geo],
            "filter": "recombinant_lineage_curated",
            "value": largest_lineage,
        },
        "designated": {
            "legend_title": "lineage",
            "cols": ["recombinant_lineage_curated"],
            "filter": "status",
            "value": "Designated",
        },
        "proposed": {
            "legend_title": "lineage",
            "cols": ["recombinant_lineage_curated"],
            "filter": "status",
            "value": "Proposed",
        },
        "unpublished": {
            "legend_title": "lineage",
            "cols": ["recombinant_lineage_curated"],
            "filter": "status",
            "value": "Unpublished",
        },
        "parents_clade": {"legend_title": "Parents (Clade)", "cols": ["parents_clade"]},
        "parents_lineage": {
            "legend_title": "Parents (Lineage)",
            "cols": ["parents_lineage"],
        },
        "cluster_id": {"legend_title": "Cluster ID", "cols": ["cluster_id"]},
        "rbd_level": {
            "legend_title": "Receptor Binding Domain Mutations",
            "cols": ["rbd_level"],
        },
    }

    for plot in plot_dict:

        logger.info("Creating plot data: {}".format(plot))
        columns = plot_dict[plot]["cols"]

        # Several plots need special filtering
        if "filter" in plot_dict[plot]:
            filter = plot_dict[plot]["filter"]
            value = plot_dict[plot]["value"]

            plot_df = pd.pivot_table(
                data=df[df[filter] == value].sort_values(by="epiweek"),
                values="strain",
                index=["epiweek"],
                columns=columns,
                aggfunc="count",
            )

        # Otherwise no filtering required
        else:
            plot_df = pd.pivot_table(
                data=df.sort_values(by="epiweek"),
                index=["epiweek"],
                values="strain",
                aggfunc="count",
                columns=columns,
            )

        plot_df.index.name = None
        plot_df.fillna(0, inplace=True)

        # Convert counts from floats to integers
        plot_df[plot_df.columns] = plot_df[plot_df.columns].astype(int)

        # Fill in missing levels for RBD
        if plot == "rbd_level" and len(plot_df) > 0:
            # min_level = min(plot_df.columns)
            # max_level = max(plot_df.columns)
            min_level = 0
            max_level = MAX_RBD_LEVEL

            for level in range(min_level, max_level + 1, 1):
                if level not in plot_df.columns:
                    plot_df[level] = 0

        # Defragment dataframe for Issue #218
        plot_df = copy.copy(plot_df)

        # Add epiweeks column
        plot_df.insert(0, "epiweek", plot_df.index)

        plot_dict[plot]["df"] = plot_df

    # ----------------------------------------------------------
    # Filter for Reporting Period
    # -------------------------------------------------------------------------

    # Add empty data for weeks if needed
    epiweek_map = {}
    iter_week = min_epiweek
    iter_i = 0
    df_list = [plot_dict[plot]["df"] for plot in plot_dict]
    while iter_week <= max_epiweek:

        for plot_df in df_list:

            if iter_week not in plot_df["epiweek"]:
                plot_df.at[iter_week, "epiweek"] = iter_week

        # Record the plotting index of this epiweek
        epiweek_map[iter_week] = iter_i
        iter_week += timedelta(weeks=1)
        iter_i += 1

    for plot_df in df_list:
        plot_df.fillna(0, inplace=True)
        plot_df.sort_values(by="epiweek", axis="index", inplace=True)

    lag_epiweek = max_epiweek - timedelta(weeks=lag)

    # -------------------------------------------------------------------------
    # Plot
    # -------------------------------------------------------------------------

    for plot in plot_dict:

        logger.info("Creating plot figure: {}".format(plot))

        plot_df = plot_dict[plot]["df"]

        x = "epiweek"
        label = plot
        legend_title = plot_dict[plot]["legend_title"]
        out_path = os.path.join(outdir, label)

        # Save plotting dataframe to file
        # for the largest, we need to replace the slashes in the filename
        if label == "largest":
            largest_lineage_fmt = largest_lineage.replace("/", "_DELIM_")
            out_path += "_{lineage}".format(lineage=largest_lineage_fmt)

        plot_df.to_csv(out_path + ".tsv", sep="\t", index=False)

        # ---------------------------------------------------------------------
        # Sort categories by count

        # The df is sorted by time (epiweek)
        # But we want colors to be sorted by number of sequences
        df_count_dict = {}
        for col in plot_df.columns:
            if col == "epiweek":
                continue
            df_count_dict[col] = sum([c for c in plot_df[col] if not np.isnan(c)])

        # Sort by counts, except for RBD level, which is sorted by the level itself
        if label == "rbd_level":
            df_count_dict = dict(sorted(df_count_dict.items()))
        else:
            df_count_dict = dict(
                sorted(df_count_dict.items(), key=lambda item: item[1], reverse=True)
            )
        # Reorder the columns in the data frame
        cols = list(df_count_dict.keys())

        # Place Unknown at the end, for better color palettes
        if "Unknown" in cols:
            cols.remove("Unknown")
            ordered_cols = cols + ["Unknown"] + ["epiweek"]
        else:
            ordered_cols = cols + ["epiweek"]

        plot_df = plot_df[ordered_cols]

        # ---------------------------------------------------------------------
        # Dynamically create the color palette

        num_cat = len(plot_df.columns) - 1

        if label == "rbd_level" and len(plot_df.columns) > 1:
            num_cat = len(range(min(cols), max(cols) + 1, 1))
            plot_palette = categorical_palette(
                num_cat=num_cat, cmap="RdYlGn_r", continuous=True, cmap_num_cat=999
            )

        # Otherwise use the default palette from the function
        else:
            plot_palette = categorical_palette(num_cat=num_cat)

        # Recolor unknown
        if "Unknown" in plot_df.columns:
            unknown_i = list(plot_df.columns).index("Unknown")
            plot_palette[unknown_i] = list(UNKNOWN_RGB)

        # Setup up Figure
        fig, ax = plt.subplots(1, figsize=FIGSIZE, dpi=DPI)

        # ---------------------------------------------------------------------
        # Stacked bar charts

        # Check if we dropped all records
        if len(plot_df.columns) <= 1:
            error_msg = (
                "WARNING: No records to plot between"
                + " {min_epiweek} and {max_epiweek} for dataframe: {plot}".format(
                    plot=plot,
                    min_epiweek=min_epiweek,
                    max_epiweek=max_epiweek,
                )
            )
            print(error_msg, file=sys.stderr)
            # Add dummy data to force an empty plot
            plot_df["dummy"] = [None] * len(plot_df)
            plot_df.at[one_week_prev, "epiweek"] = one_week_prev
            plot_df.at[one_week_prev, "dummy"] = 1
            plot_df.sort_values(by="epiweek", inplace=True)

            plot_palette = categorical_palette(num_cat=1)

        plot_df.plot.bar(
            stacked=True,
            ax=ax,
            x=x,
            color=plot_palette,
            edgecolor="none",
            width=WIDTH_BAR,
            alpha=ALPHA_BAR,
        )

        # ---------------------------------------------------------------------
        # Axis limits

        xlim = ax.get_xlim()
        # If plotting 16 weeks, the x-axis will be (-0.625, 16.625)
        # If we added dummy data, need to correct start date
        x_start = xlim[1] - weeks - 1.25
        ax.set_xlim(x_start, xlim[1])

        if max_epiweek_sequences == 0:
            ylim = [0, 1]
        else:
            ylim = [0, round(max_epiweek_sequences * EPIWEEK_MAX_BUFF_FACTOR, 1)]
        ax.set_ylim(ylim[0], ylim[1])

        # ---------------------------------------------------------------------
        # Reporting Lag

        # If the scope of the data is smaller than the lag
        if lag_epiweek in epiweek_map:
            lag_i = epiweek_map[lag_epiweek]
        else:
            lag_i = epiweek_map[max_epiweek]

        lag_rect_height = ylim[1]

        # If we had to use dummy data for an empty dataframe, shift lag by 1
        if "dummy" in plot_df.columns:
            lag_i += 1

        ax.axvline(x=lag_i + (1 - (WIDTH_BAR) / 2), color="black", linestyle="--", lw=1)
        lag_rect_xy = [lag_i + (1 - (WIDTH_BAR) / 2), 0]
        lag_rect = patches.Rectangle(
            xy=lag_rect_xy,
            width=lag + (1 - (WIDTH_BAR) / 2),
            height=lag_rect_height,
            linewidth=1,
            edgecolor="none",
            facecolor="grey",
            alpha=ALPHA_LAG,
            zorder=0,
        )
        ax.add_patch(lag_rect)

        # Label the lag rectangle
        text_props = dict(facecolor="white")
        ax.text(
            x=lag_rect_xy[0],
            y=ylim[1] / 2,
            s="Reporting Lag",
            fontsize=6,
            fontweight="bold",
            rotation=90,
            bbox=text_props,
            va="center",
            ha="center",
        )

        # ---------------------------------------------------------------------
        # Legend

        # Dynamically set the number of columns in the legend based on
        # how much space the labels will take up (in characters)
        max_char_len = 0
        for col in ordered_cols:
            if len(str(col)) >= max_char_len:
                max_char_len = len(str(col))

        legend_ncol = math.floor(LEGEND_CHAR_WIDTH / max_char_len)

        # we don't want too many columns
        if legend_ncol > LEGEND_MAX_COL:
            legend_ncol = LEGEND_MAX_COL
        elif legend_ncol > num_cat:
            legend_ncol = num_cat
        elif legend_ncol == 0:
            legend_ncol = 1

        legend = ax.legend(
            title=legend_title.title(),
            edgecolor="black",
            fontsize=LEGEND_FONTSIZE,
            ncol=legend_ncol,
            loc="lower center",
            mode="expand",
            bbox_to_anchor=(0, 1.02, 1, 0.2),
            borderaxespad=0,
        )

        # Truncate long labels in the legend
        # But only if this plot does not involve parents,
        # because we need to see the 2+ parents listed at the end
        if (
            "parents" not in label
            and "geography" not in label
            and "largest" not in label
        ):
            for i in range(0, len(legend.get_texts())):

                l_label = legend.get_texts()[i].get_text()

                if len(l_label) > LEGEND_LABEL_MAX_LEN:
                    l_label = l_label[0:LEGEND_LABEL_MAX_LEN]
                    if "(" in l_label and ")" not in l_label:
                        l_label = l_label + "...)"
                    else:
                        l_label = l_label + "..."

                legend.get_texts()[i].set_text(l_label)

        legend.get_frame().set_linewidth(1)
        legend.get_title().set_fontweight("bold")

        # If dummy is a column, there were no records and added fake data for plot
        if "dummy" in plot_df.columns:
            legend.remove()

        # ---------------------------------------------------------------------
        # Axes

        xlim = ax.get_xlim()
        # If plotting 16 weeks, the x-axis will be (-0.625, 16.625)
        # If we added dummy data, need to correct start date
        x_start = xlim[1] - weeks - 1.25
        ax.set_xlim(x_start, xlim[1])
        ax.set_ylim(ylim[0], ylim[1])
        ax.set_ylabel("Number of Sequences", fontweight="bold")
        ax.set_xlabel("Start of Week", fontweight="bold")
        # ax.xaxis.set_label_coords(0.5, -0.30)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="center", fontsize=6)

        # Upscale plot dimensions if there are too many weeks
        if len(plot_df) > FIGSIZE_MAX_WEEKS:
            upscale_factor = len(plot_df) / FIGSIZE_MAX_WEEKS
            fig.set_size_inches(
                FIGSIZE[0] * upscale_factor,
                FIGSIZE[1] * upscale_factor,
            )

        # Attempt to see whether we can apply a tight layout
        try:
            plt.tight_layout()
            plt.savefig(out_path + ".png")
            plt.savefig(out_path + ".svg")
        except UserWarning:
            logger.info("Unable to apply tight_layout, plot will not be saved.")


if __name__ == "__main__":
    main()
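A similar invocation sketch for the epiweek plotting script above, again hedged: plot_lineages.py is a hypothetical file name, the repo-local functions module must be importable, and the two toy records are made up but use the column names the script pivots on.

import pandas as pd
from click.testing import CliRunner
from plot_lineages import main  # hypothetical file name for the snippet above

toy = pd.DataFrame(
    {
        "strain": ["sample1", "sample2"],
        "date": ["2022-11-01", "2022-11-08"],
        "status": ["designated", "proposed"],
        "recombinant_lineage_curated": ["XBB", "XBC"],
        "parents_clade": ["21L,22D", "21L,21J"],
        "parents_lineage": ["BJ.1,BM.1.1.1", "BA.2,B.1.617.2"],
        "cluster_id": ["c1", "c2"],
        "rbd_level": [5, 3],
        "country": ["Canada", "Canada"],
    }
)
toy.to_csv("linelist.tsv", sep="\t", index=False)

# Bound the reporting period so the toy dates fall inside it
result = CliRunner().invoke(
    main,
    ["--input", "linelist.tsv", "--outdir", "plots",
     "--min-date", "2022-11-01", "--max-date", "2022-11-15"],
)
print(result.exit_code, result.output)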
Snippet: calculate key RBD (receptor binding domain) mutation levels
import click
import pandas as pd
import os
import yaml
from functions import create_logger

NO_DATA_CHAR = "NA"


@click.command()
@click.option(
    "--rbd-definition", help="Key RBD mutation definitions (yaml).", required=True
)
@click.option("--nextclade", help="Nextclade qc table (tsv).", required=True)
@click.option("--output", help="Ouptut table of RBD level (tsv).", required=True)
@click.option("--log", help="Output log file.", required=False)
def main(
    rbd_definition,
    nextclade,
    output,
    log,
):
    """Calculate the number of key RBD mutations."""

    # create logger
    logger = create_logger(logfile=log)

    # Create output directory
    outdir = os.path.dirname(output)
    if not os.path.exists(outdir) and outdir != "." and outdir != "":
        os.makedirs(outdir)

    # -------------------------------------------------------------------------
    # RBD Mutation Definitions

    # Import file
    logger.info("Parsing RBD mutation definitions: {}".format(rbd_definition))
    with open(rbd_definition) as infile:
        rbd_dict = yaml.safe_load(infile)

    # Parse into dataframe
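    # Each rbd_mutations entry is a [ref, coord, alt] triple, e.g. ["N", 501, "Y"];
    # alt can hold several tolerated residues in one string (e.g. "KA")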
    rbd_data = {"gene": [], "coord": [], "ref": [], "alt": []}
    for mut in rbd_dict["rbd_mutations"]:
        rbd_data["gene"].append("S")
        rbd_data["coord"].append(int(mut[1]))
        rbd_data["ref"].append(mut[0])
        rbd_data["alt"].append(mut[2])

    rbd_df = pd.DataFrame(rbd_data)

    # -------------------------------------------------------------------------
    # Nextclade QC

    logger.info("Parsing nextclade QC: {}".format(nextclade))
    nextclade_df = pd.read_csv(nextclade, sep="\t")
    nextclade_df.fillna("NA", inplace=True)

    # -------------------------------------------------------------------------
    # RBD Calculations from Amino Acid Substitutions

    logger.info("Calculating RBD levels.")
    # Initialize the columns that will be in the output table
    output_data = {
        "strain": [],
        "rbd_level": [],
        "rbd_substitutions": [],
        "immune_escape": [],
        "ace2_binding": [],
    }

    # Initialize at 0
    # nextclade_df.at[nextclade_df.index, "rbd_level"] = [0] * len(nextclade_df)
    nextclade_df["rbd_level"] = [0] * len(nextclade_df)

    # Assign RBD level to each sample
    for rec in nextclade_df.iterrows():
        strain = rec[1]["seqName"]
        aa_subs = rec[1]["aaSubstitutions"].split(",")
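        # aaSubstitutions entries look like "S:N501Y", i.e. gene:<ref><pos><alt>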

        rbd_level = 0
        rbd_subs = []

        for s in aa_subs:
            gene = s.split(":")[0]
            # RBD Mutations only involve spike
            if gene != "S":
                continue
            coord = int(s.split(":")[1][1:-1])
            alt = s.split(":")[1][-1]

            # Check if the position is an RBD mutation
            if coord in list(rbd_df["coord"]):
                rbd_coord_alts = list(rbd_df[rbd_df["coord"] == coord]["alt"])[0]
                if alt in rbd_coord_alts:
                    rbd_level += 1
                    rbd_subs.append(s)

        immune_escape = NO_DATA_CHAR
        ace2_binding = NO_DATA_CHAR
        if "immune_escape" in nextclade_df.columns:
            immune_escape = rec[1]["immune_escape"]
            ace2_binding = rec[1]["ace2_binding"]

        output_data["strain"].append(strain)
        output_data["rbd_level"].append(rbd_level)
        output_data["rbd_substitutions"].append(",".join(rbd_subs))
        output_data["immune_escape"].append(immune_escape)
        output_data["ace2_binding"].append(ace2_binding)

    output_df = pd.DataFrame(output_data)

    # -------------------------------------------------------------------------
    # Export

    logger.info("Exporting table: {}".format(output))
    output_df.to_csv(output, sep="\t", index=False)


if __name__ == "__main__":
    main()
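A hedged sketch of running the RBD level script above: rbd_levels.py is a hypothetical file name, and the mutation definitions below are toy values shaped like the [ref, coord, alt] triples the script parses.

import pandas as pd
import yaml
from click.testing import CliRunner
from rbd_levels import main  # hypothetical file name for the snippet above

# Toy RBD definitions
with open("rbd.yaml", "w") as outfile:
    yaml.dump({"rbd_mutations": [["N", 501, "Y"], ["E", 484, "KA"]]}, outfile)

# Toy nextclade QC table with the two columns the script reads
pd.DataFrame(
    {
        "seqName": ["sample1"],
        "aaSubstitutions": ["S:N501Y,S:E484A,N:P13L"],
    }
).to_csv("nextclade.tsv", sep="\t", index=False)

result = CliRunner().invoke(
    main,
    ["--rbd-definition", "rbd.yaml", "--nextclade", "nextclade.tsv",
     "--output", "rbd_level.tsv"],
)
# sample1 matches S:N501Y and S:E484A -> rbd_level of 2
print(pd.read_csv("rbd_level.tsv", sep="\t"))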
Snippet: create a PowerPoint report of slides
import click
import pptx
from pptx.enum.shapes import MSO_SHAPE
from datetime import date
import os
import pandas as pd

NO_DATA_CHAR = "NA"
TITLE = "SARS-CoV-2 Recombinants"
FONT_SIZE_PARAGRAPH = 20

RECOMBINANT_STATUS = [
    "designated",
    "proposed",
    "unpublished",
]

""" Ref for slide types:
0 ->  title and subtitle
1 ->  title and content
2 ->  section header
3 ->  two content
4 ->  Comparison
5 ->  Title only
6 ->  Blank
7 ->  Content with caption
8 ->  Pic with caption
"""


@click.command()
@click.option("--linelist", help="Linelist TSV", required=True)
@click.option("--plot-dir", help="Plotting directory", required=True)
@click.option("--output", help="Output PPTX path", required=True)
@click.option(
    "--geo", help="Geography column to summarize", required=False, default="country"
)
@click.option(
    "--min-cluster-size",
    help="The minimum size of clusters included in the plots",
    default=1,
)
@click.option(
    "--template",
    help="Powerpoint template",
    required=False,
    default="resources/template.pptx",
)
def main(
    linelist,
    plot_dir,
    template,
    geo,
    output,
    min_cluster_size,
):
    """Create a report of powerpoint slides"""

    # Parse build name from plot_dir path
    build = os.path.basename(os.path.dirname(plot_dir))
    outdir = os.path.dirname(output)
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    subtitle = "{}\n{}".format(build.title(), date.today())

    # Import dataframes
    linelist_df = pd.read_csv(linelist, sep="\t")

    plot_suffix = ".png"
    df_suffix = ".tsv"
    labels = [
        f.replace(df_suffix, "") for f in os.listdir(plot_dir) if f.endswith(df_suffix)
    ]

    plot_dict = {}
    for label in labels:
        plot_dict[label] = {}
        plot_dict[label]["plot_path"] = os.path.join(plot_dir, label + plot_suffix)
        plot_dict[label]["df_path"] = os.path.join(plot_dir, label + df_suffix)
        plot_dict[label]["df"] = pd.read_csv(plot_dict[label]["df_path"], sep="\t")

        # Breakpoints df isn't over time, but by lineage
        if "epiweek" in plot_dict[label]["df"].columns:
            plot_dict[label]["df"].index = plot_dict[label]["df"]["epiweek"]

        # Largest is special, as it takes the form largest_<lineage>.*
        if label.startswith("largest_"):

            largest_lineage = "_".join(label.split("_")[1:])
            # Replace the _DELIM_ character we added for saving files
            largest_lineage = largest_lineage.replace("_DELIM_", "/")

            plot_dict["largest"] = plot_dict[label]
            plot_dict["largest"]["lineage"] = largest_lineage

            del plot_dict[label]

    # ---------------------------------------------------------------------
    # Presentation
    # ---------------------------------------------------------------------
    presentation = pptx.Presentation(template)

    # ---------------------------------------------------------------------
    # Title Slide
    title_slide_layout = presentation.slide_layouts[0]
    slide = presentation.slides.add_slide(title_slide_layout)
    slide.shapes.title.text = TITLE
    slide.placeholders[1].text = subtitle

    # ---------------------------------------------------------------------
    # General Summary

    # Slide setup
    graph_slide_layout = presentation.slide_layouts[8]
    slide = presentation.slides.add_slide(graph_slide_layout)
    title = slide.shapes.title
    title.text_frame.text = "Status"
    title.text_frame.paragraphs[0].font.bold = True

    chart_placeholder = slide.placeholders[1]
    # Plotting may have failed to create an individual figure
    plot_path = plot_dict["lineage"]["plot_path"]
    if os.path.exists(plot_path):
        chart_placeholder.insert_picture(plot_path)
    body = slide.placeholders[2]

    # Stats

    lineages_df = plot_dict["lineage"]["df"]
    lineages = list(lineages_df.columns)
    lineages.remove("epiweek")

    status_counts = {
        status: {"sequences": 0, "lineages": 0} for status in RECOMBINANT_STATUS
    }

    status_seq_df = plot_dict["status"]["df"]
    statuses = list(status_seq_df.columns)
    statuses.remove("epiweek")

    for status in statuses:
        status = status.lower()
        status_lin_df = plot_dict[status]["df"]
        status_lin = list(status_lin_df.columns)
        status_lin.remove("epiweek")

        num_status_lin = len(status_lin)
        status_counts[status]["lineages"] += num_status_lin

        for lin in status_lin:
            num_status_seq = int(sum(lineages_df[lin]))
            status_counts[status]["sequences"] += num_status_seq

    # Number of lineages and sequences
    total_sequences = int(sum([status_counts[s]["sequences"] for s in status_counts]))
    total_lineages = int(sum([status_counts[s]["lineages"] for s in status_counts]))

    # Construct the summary text
    summary = "\n"

    summary += "There are {total_lineages} recombinant lineages*.\n".format(
        total_lineages=total_lineages
    )

    for status in RECOMBINANT_STATUS:
        if status in status_counts:
            count = status_counts[status]["lineages"]
        else:
            count = 0
        summary += "  - {lineages} lineages are {status}.\n".format(
            lineages=count, status=status
        )

    summary += "\n"
    summary += "There are {total_sequences} recombinant sequences*.\n".format(
        total_sequences=total_sequences
    )

    for status in RECOMBINANT_STATUS:
        if status in status_counts:
            count = status_counts[status]["sequences"]
        else:
            count = 0
        summary += "  - {sequences} sequences are {status}.\n".format(
            sequences=count, status=status
        )

    # Add a footnote to indicate cluster size
    summary += "*Excluding small lineages (N<{})".format(min_cluster_size)

    body.text_frame.text = summary

    # Adjust font size of body
    for paragraph in body.text_frame.paragraphs:
        for run in paragraph.runs:
            run.font.size = pptx.util.Pt(14)

    # ---------------------------------------------------------------------
    # Status Summary

    for status in RECOMBINANT_STATUS:

        status = status.lower()
        if status not in plot_dict:
            continue

        # Slide setup
        graph_slide_layout = presentation.slide_layouts[8]
        slide = presentation.slides.add_slide(graph_slide_layout)
        title = slide.shapes.title

        title.text_frame.text = status.title()
        title.text_frame.paragraphs[0].font.bold = True

        chart_placeholder = slide.placeholders[1]
        # Plotting may have failed to create an individual figure
        plot_path = plot_dict[status]["plot_path"]
        if os.path.exists(plot_path):
            chart_placeholder.insert_picture(plot_path)

        body = slide.placeholders[2]

        # Stats
        status_df = plot_dict[status]["df"]
        status_lineages = list(status_df.columns)
        status_lineages.remove("epiweek")

        # Order columns
        status_counts = {
            lineage: int(sum(status_df[lineage])) for lineage in status_lineages
        }
        status_lineages = sorted(status_counts, key=status_counts.get, reverse=True)
        num_status_lineages = len(status_lineages)

        summary = "\n"
        summary += "There are {num_status_lineages} {status} lineages*.\n".format(
            status=status, num_status_lineages=num_status_lineages
        )

        for lineage in status_lineages:
            seq_count = int(sum(status_df[lineage].dropna()))
            summary += "  - {lineage} ({seq_count})\n".format(
                lineage=lineage, seq_count=seq_count
            )

        summary += "\n*Excluding small lineages (N<{})".format(min_cluster_size)
        body.text_frame.text = summary

        # Adjust font size of body
        for paragraph in body.text_frame.paragraphs:
            for run in paragraph.runs:
                run.font.size = pptx.util.Pt(14)

    # ---------------------------------------------------------------------
    # Geographic Summary

    geo_df = plot_dict["geography"]["df"]
    geos = list(geo_df.columns)
    geos.remove("epiweek")
    # Order columns
    geos_counts = {region: int(sum(geo_df[region])) for region in geos}
    geos = sorted(geos_counts, key=geos_counts.get, reverse=True)

    num_geos = len(geos)

    graph_slide_layout = presentation.slide_layouts[8]
    slide = presentation.slides.add_slide(graph_slide_layout)
    title = slide.shapes.title

    title.text_frame.text = "Geography"
    title.text_frame.paragraphs[0].font.bold = True

    chart_placeholder = slide.placeholders[1]
    # Plotting may have failed to create an individual figure
    plot_path = plot_dict["geography"]["plot_path"]
    if os.path.exists(plot_path):
        chart_placeholder.insert_picture(plot_path)

    body = slide.placeholders[2]

    summary = "\n"
    summary += "Recombinants are observed in {num_geos} {geo}*.\n".format(
        num_geos=num_geos, geo=geo
    )

    for region in geos:
        seq_count = int(sum(geo_df[region].dropna()))
        summary += "  - {region} ({seq_count})\n".format(
            region=region, seq_count=seq_count
        )

    summary += "\n*Excluding small lineages (N<{})".format(min_cluster_size)

    body.text_frame.text = summary

    # Adjust font size of body
    for paragraph in body.text_frame.paragraphs:
        for run in paragraph.runs:
            run.font.size = pptx.util.Pt(14)

    # ---------------------------------------------------------------------
    # Largest Summary

    largest_df = plot_dict["largest"]["df"]

    largest_geos = list(largest_df.columns)
    largest_geos.remove("epiweek")

    # Order columns
    largest_geos_counts = {
        region: int(sum(largest_df[region])) for region in largest_geos
    }
    largest_geos = sorted(
        largest_geos_counts, key=largest_geos_counts.get, reverse=True
    )

    num_geos = len(largest_geos)

    largest_lineage = plot_dict["largest"]["lineage"]
    largest_lineage_size = 0

    for lineage in lineages:
        seq_count = int(sum(lineages_df[lineage].dropna()))
        if seq_count > largest_lineage_size:
            largest_lineage_size = seq_count

    graph_slide_layout = presentation.slide_layouts[8]
    slide = presentation.slides.add_slide(graph_slide_layout)
    title = slide.shapes.title

    title.text_frame.text = "Largest"
    title.text_frame.paragraphs[0].font.bold = True

    chart_placeholder = slide.placeholders[1]
    # Plotting may have failed to create an individual figure
    plot_path = plot_dict["largest"]["plot_path"]
    if os.path.exists(plot_path):
        chart_placeholder.insert_picture(plot_path)
    body = slide.placeholders[2]

    summary = "\n"
    summary += "The largest lineage is {lineage} (N={size})*.\n".format(
        lineage=largest_lineage,
        size=largest_lineage_size,
    )

    summary += "{lineage} is observed in {num_geo} {geo}.\n".format(
        lineage=largest_lineage, num_geo=num_geos, geo=geo
    )

    for region in largest_geos:
        seq_count = int(sum(largest_df[region].dropna()))
        summary += "  - {region} ({seq_count})\n".format(
            region=region, seq_count=seq_count
        )

    summary += "\n*Excluding small lineages (N<{})".format(min_cluster_size)

    body.text_frame.text = summary

    # Adjust font size of body
    for paragraph in body.text_frame.paragraphs:
        for run in paragraph.runs:
            run.font.size = pptx.util.Pt(14)

    # ---------------------------------------------------------------------
    # RBD Levels

    rbd_df = plot_dict["rbd_level"]["df"]
    rbd_levels = list(rbd_df.columns)
    rbd_levels.remove("epiweek")
    # Order columns
    rbd_counts = {
        level: int(sum(rbd_df[level]))
        for level in rbd_levels
        if int(sum(rbd_df[level])) > 0
    }
    rbd_levels = dict(sorted(rbd_counts.items()))

    num_rbd_levels = len(rbd_levels)

    graph_slide_layout = presentation.slide_layouts[8]
    slide = presentation.slides.add_slide(graph_slide_layout)
    title = slide.shapes.title

    title.text_frame.text = "Receptor Binding Domain"
    title.text_frame.paragraphs[0].font.bold = True

    chart_placeholder = slide.placeholders[1]
    # Plotting may have failed to create an individual figure
    plot_path = plot_dict["rbd_level"]["plot_path"]
    if os.path.exists(plot_path):
        chart_placeholder.insert_picture(plot_path)
    body = slide.placeholders[2]

    summary = "\n"
    summary += "{num_rbd_levels} RBD levels are observed*.\n".format(
        num_rbd_levels=num_rbd_levels,
    )

    for level in rbd_levels:
        seq_count = int(sum(rbd_df[level].dropna()))
        summary += "  - Level {level} ({seq_count})\n".format(
            level=level, seq_count=seq_count
        )

    summary += "\n*Excluding small lineages (N<{})".format(min_cluster_size)
    body.text_frame.text = summary

    # Adjust font size of body
    for paragraph in body.text_frame.paragraphs:
        for run in paragraph.runs:
            run.font.size = pptx.util.Pt(14)

    # ---------------------------------------------------------------------
    # Parents Summary (Clade)

    parents_df = plot_dict["parents_clade"]["df"]

    parents = list(parents_df.columns)
    parents.remove("epiweek")

    parents_counts = {p: int(sum(parents_df[p])) for p in parents}
    parents = sorted(parents_counts, key=parents_counts.get, reverse=True)

    num_parents = len(parents)

    graph_slide_layout = presentation.slide_layouts[8]
    slide = presentation.slides.add_slide(graph_slide_layout)
    title = slide.shapes.title

    title.text_frame.text = "Parents (Clade)"
    title.text_frame.paragraphs[0].font.bold = True

    chart_placeholder = slide.placeholders[1]
    # Plotting may have failed to create an individual figure
    plot_path = plot_dict["parents_clade"]["plot_path"]
    if os.path.exists(plot_path):
        chart_placeholder.insert_picture(plot_path)
    body = slide.placeholders[2]

    summary = "\n"
    summary += "There are {num_parents} clade combinations*.\n".format(
        num_parents=num_parents
    )

    for parent in parents:
        seq_count = int(sum(parents_df[parent].dropna()))
        summary += "  - {parent} ({seq_count})\n".format(
            parent=parent, seq_count=seq_count
        )

    summary += "\n*Excluding small lineages (N<{})".format(min_cluster_size)
    body.text_frame.text = summary

    # Adjust font size of body
    for paragraph in body.text_frame.paragraphs:
        for run in paragraph.runs:
            run.font.size = pptx.util.Pt(14)

    # ---------------------------------------------------------------------
    # Parents Summary (Lineage)

    parents_df = plot_dict["parents_lineage"]["df"]

    parents = list(parents_df.columns)
    parents.remove("epiweek")

    parents_counts = {p: int(sum(parents_df[p])) for p in parents}
    parents = sorted(parents_counts, key=parents_counts.get, reverse=True)

    num_parents = len(parents)

    graph_slide_layout = presentation.slide_layouts[8]
    slide = presentation.slides.add_slide(graph_slide_layout)
    title = slide.shapes.title

    title.text_frame.text = "Parents (Lineage)"
    title.text_frame.paragraphs[0].font.bold = True

    chart_placeholder = slide.placeholders[1]
    # Plotting may have failed to create an individual figure
    plot_path = plot_dict["parents_lineage"]["plot_path"]
    if os.path.exists(plot_path):
        chart_placeholder.insert_picture(plot_path)
    body = slide.placeholders[2]

    summary = "\n"
    summary += "There are {num_parents} lineage combinations*.\n".format(
        num_parents=num_parents
    )

    for parent in parents:
        seq_count = int(sum(parents_df[parent].dropna()))
        summary += "  - {parent} ({seq_count})\n".format(
            parent=parent, seq_count=seq_count
        )

    summary += "\n*Excluding small lineages (N<{})".format(min_cluster_size)
    body.text_frame.text = summary

    # Adjust font size of body
    for paragraph in body.text_frame.paragraphs:
        for run in paragraph.runs:
            run.font.size = pptx.util.Pt(14)

    # ---------------------------------------------------------------------
    # Breakpoints Clade Summary

    graph_slide_layout = presentation.slide_layouts[8]
    slide = presentation.slides.add_slide(graph_slide_layout)
    title = slide.shapes.title

    title.text_frame.text = "Breakpoints (Clade)"
    title.text_frame.paragraphs[0].font.bold = True

    chart_placeholder = slide.placeholders[1]
    # Plotting may have failed to create an individual figure
    plot_path = plot_dict["breakpoints_clade"]["plot_path"]
    if os.path.exists(plot_path):
        chart_placeholder.insert_picture(plot_path)
    body = slide.placeholders[2]

    # ---------------------------------------------------------------------
    # Breakpoints Lineage Summary

    graph_slide_layout = presentation.slide_layouts[8]
    slide = presentation.slides.add_slide(graph_slide_layout)
    title = slide.shapes.title

    title.text_frame.text = "Breakpoints (Lineage)"
    title.text_frame.paragraphs[0].font.bold = True

    chart_placeholder = slide.placeholders[1]
    # Plotting may have failed to create an individual figure
    plot_path = plot_dict["breakpoints_lineage"]["plot_path"]
    if os.path.exists(plot_path):
        chart_placeholder.insert_picture(plot_path)
    body = slide.placeholders[2]

    # ---------------------------------------------------------------------
    # Footer

    # Versions
    pipeline_version = linelist_df["pipeline_version"].values[0]
    recombinant_classifier_dataset = linelist_df[
        "recombinant_classifier_dataset"
    ].values[0]

    for slide in presentation.slides:

        # Don't add footer to title slide
        if slide.slide_layout == presentation.slide_layouts[0]:
            continue

        footer = slide.shapes.add_shape(
            autoshape_type_id=MSO_SHAPE.RECTANGLE,
            left=pptx.util.Cm(3),
            top=pptx.util.Cm(17),
            width=pptx.util.Pt(800),
            height=pptx.util.Pt(30),
        )

        footer.fill.solid()
        footer.fill.fore_color.rgb = pptx.dml.color.RGBColor(255, 255, 255)
        footer.line.color.rgb = pptx.dml.color.RGBColor(0, 0, 0)
        p = footer.text_frame.paragraphs[0]
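        # Crude spacing: pad the gap between version and dataset with spaces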
        p.text = "{} {} {}".format(
            pipeline_version, " " * 20, recombinant_classifier_dataset
        )
        p.font.size = pptx.util.Pt(14)
        p.font.color.rgb = pptx.dml.color.RGBColor(0, 0, 0)

    # ---------------------------------------------------------------------
    # Saving file
    presentation.save(output)


if __name__ == "__main__":
    main()
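The script above is scripts/report.py, which the report rule near the bottom of this page invokes. A hedged standalone sketch (all paths are illustrative placeholders; the rule also appends optional geo, template, and minimum-cluster-size arguments):

# Hedged sketch -- all paths below are illustrative
python3 scripts/report.py \
  --linelist results/example/linelists/linelist.tsv \
  --plot-dir results/example/plots \
  --output results/example/report/report.pptx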
sc2rf_args=()

while [[ $# -gt 0 ]]; do
  case $1 in
    --alignment)
      alignment=$2
      shift # past argument
      shift # past value
      ;;
    --primers)
      primers=$2
      shift # past argument
      shift # past value
      ;;
    --primers-name)
      primers_name=$2
      shift # past argument
      shift # past value
      ;;
    --output-ansi)
      output_ansi=$2
      shift # past argument
      shift # past value
      ;;
    --output-csv)
      output_csv=$2
      shift # past argument
      shift # past value
      ;;
    --log)
      log=$2
      shift # past argument
      shift # past value
      ;;
    -*|--*)
      arg=$1
      value=$2
      sc2rf_args+=("$arg")
      shift # past argument
      if [[ ${value:0:1} != "-" ]]; then
        sc2rf_args+=("$value")
        shift # past value
      fi
      ;;
    *)
      # This is a risky way to parse the clades for sc2rf
      value=$1
      sc2rf_args+=("$value")
      shift # past value
      ;;
  esac
done

# Prep the Output Directory
outdir=$(dirname $output_csv)
mkdir -p $outdir

# Location of sc2rf executable
sc2rf="sc2rf/sc2rf.py"

# Add optional params to sc2rf_args
primers_name=${primers_name:-primers}
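# NOTE: each flag and its value are appended as a single word; the unquoted
# ${sc2rf_args[@]} expansion at the bottom of this script word-splits them apart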
sc2rf_args+=("--csvfile $output_csv")

# Add primers
if [[ "${primers}" ]]; then
  cp $primers sc2rf/${primers_name}.bed
  sc2rf_args+=("--primers ${primers_name}.bed")
fi

# rebuild examples
#log_rebuild=${log%.*}_rebuild
#python3 sc2rf.py --rebuild-examples 1> ${log_rebuild}.log 2> ${log_rebuild}.err

echo "python3 $sc2rf ${alignment} ${sc2rf_args[@]}" > ${output_ansi}
python3 $sc2rf ${alignment} ${sc2rf_args[@]} 1>> ${output_ansi} 2> ${log};

# Clean up primers
if [[ "${primers}" ]]; then
  rm -f sc2rf/${primers_name}.bed
fi
Shell From line 6 of scripts/sc2rf.sh
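Because the -*|--* branch above forwards any unrecognized flag (plus its value, when the next token does not start with a dash) straight through to sc2rf.py, wrapper options and sc2rf options can be mixed in a single call. A hedged usage sketch (paths and the forwarded value are illustrative; --max-name-length is one flag the pipeline passes through this way):

# Hedged sketch -- paths and values are illustrative
bash scripts/sc2rf.sh \
  --alignment results/example/nextclade.aligned.fasta \
  --output-ansi results/example/sc2rf.ansi.txt \
  --output-csv results/example/sc2rf.csv \
  --log logs/example/sc2rf.log \
  --max-name-length 50   # unrecognized by the wrapper, forwarded to sc2rf.py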
From scripts/summary.sh (Shell, starting at line 6):
while [[ $# -gt 0 ]]; do

  case $1 in
    --output)
      output=$2
      shift # past argument
      shift # past value
      ;;
    --nextclade)
      nextclade=$2
      shift # past argument
      shift # past value
      ;;
    --nextclade-dataset)
      nextclade_dataset=$2
      shift # past argument
      shift # past value
      ;;
    --sc2rf)
      sc2rf=$2
      shift # past argument
      shift # past value
      ;;
    --rbd-levels)
      rbd_levels=$2
      shift # past argument
      shift # past value
      ;;
    --extra-cols)
      extra_cols=$2
      shift # past argument
      shift # past value
      ;;
    -*|--*)
      echo "Unknown option $1"
      exit 1
      ;;
  esac
done

# ncov-recombinant pipeline version
git_commit_hash=$(git rev-parse HEAD)
git_commit=${git_commit_hash:0:8}
git_tag=$(git tag | tail -n1)
ncov_recombinant_ver="${git_tag}:${git_commit}"

# Nextclade version
nextclade_ver=$(nextclade --version | cut -d " " -f 2)

sort_col="Nextclade_pango"
default_cols="strain,date,country"
nextclade_cols="privateNucMutations.reversionSubstitutions,privateNucMutations.unlabeledSubstitutions,privateNucMutations.labeledSubstitutions,substitutions"

# Only append extra_cols when non-empty, to avoid a dangling comma
cols="${default_cols},${nextclade_cols}"
if [[ $extra_cols ]]; then
  cols="${cols},${extra_cols}"
fi

csvtk cut -t -f "${cols},clade,Nextclade_pango" ${nextclade} \
  | csvtk rename -t -f "clade" -n "Nextclade_clade" \
  | csvtk merge -t --na "NA" -f "strain" - ${sc2rf} \
  | csvtk merge -t --na "NA" -f "strain" - ${rbd_levels} \
  | csvtk sort -t -k "$sort_col" \
  | csvtk mutate2 -t -n "ncov-recombinant_version" -e "\"$ncov_recombinant_ver\"" \
  | csvtk mutate2 -t -n "nextclade_version" -e "\"$nextclade_ver\"" \
  | csvtk mutate2 -t -n "nextclade_dataset" -e "\"$nextclade_dataset\"" \
  > $output
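The three csvtk mutate2 calls at the end append constant-valued columns: each -e expression is a quoted string literal, so every row receives the same version string. A toy demonstration (the two-row TSV is made up):

# Toy demonstration of appending a constant column with csvtk mutate2
printf "strain\tclade\nsampleA\t21K\n" > demo.tsv
csvtk mutate2 -t -n "pipeline_version" -e "\"v0.7.0:abcd1234\"" demo.tsv
# strain   clade   pipeline_version
# sampleA  21K     v0.7.0:abcd1234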
From scripts/validate.py (Python, starting at line 2):
import click
import pandas as pd
import os

NO_DATA_CHAR = "NA"
VALIDATE_COLS = ["status", "lineage", "parents_clade", "breakpoints"]


@click.command()
@click.option("--expected", help="Expected linelist (TSV)", required=True)
@click.option("--observed", help="Observed linelist (TSV)", required=True)
@click.option("--outdir", help="Output directory", required=True)
def main(
    expected,
    observed,
    outdir,
):
    """Validate output"""

    # Check for output directory
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Import Dataframes
    expected_df = pd.read_csv(expected, sep="\t")
    observed_df = pd.read_csv(observed, sep="\t")

    expected_df.fillna(NO_DATA_CHAR, inplace=True)
    observed_df.fillna(NO_DATA_CHAR, inplace=True)

    output_data = {
        "strain": [],
        "status": [],
        "fail_cols": [],
        "expected": [],
        "observed": [],
    }

    # Validate observed values
    for rec in observed_df.iterrows():
        strain = rec[1]["strain"]
        fail_cols = []
        expected_vals = []
        observed_vals = []

        # Check if this strain is in the validation table
        if strain not in expected_df["strain"].values:
            status = "No Expected Values"
        else:

            # Check if all observed values match the expected values
            match = True

            for col in VALIDATE_COLS:
                exp_val = expected_df[expected_df["strain"] == strain][col].values[0]
                obs_val = observed_df[observed_df["strain"] == strain][col].values[0]

                if exp_val != obs_val:
                    match = False
                    fail_cols.append(col)
                    expected_vals.append(str(exp_val))
                    observed_vals.append(str(obs_val))

            # Check if any columns failed matching
            if match:
                status = "Pass"
            else:
                status = "Fail"

        output_data["strain"].append(strain)
        output_data["status"].append(status)
        output_data["fail_cols"].append(";".join(fail_cols))
        output_data["expected"].append(";".join(expected_vals))
        output_data["observed"].append(";".join(observed_vals))

    # Table of validation
    output_df = pd.DataFrame(output_data)
    outpath = os.path.join(outdir, "validation.tsv")
    output_df.to_csv(outpath, sep="\t", index=False)

    # Overall build validation status
    build_status = "Pass"
    # Option 1. Fail if any sample failed
    if "Fail" in output_data["status"]:
        build_status = "Fail"
    # Option 2. Pass if at least one sample passed and none failed
    elif "Pass" in output_data["status"]:
        build_status = "Pass"
    # Option 3. No samples had expected values
    elif len(output_df) == 0:
        build_status = "No Expected Values"

    outpath = os.path.join(outdir, "status.txt")
    with open(outpath, "w") as outfile:
        outfile.write(build_status + "\n")


if __name__ == "__main__":
    main()
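Run standalone, the script writes validation.tsv and status.txt into the output directory. A hedged sketch (paths are hypothetical; the flags are those defined above):

# Hedged sketch -- paths are hypothetical
python3 scripts/validate.py \
  --expected data/example/expected_linelist.tsv \
  --observed results/example/linelists/linelist.tsv \
  --outdir results/example/validation
cat results/example/validation/status.txt   # Pass | Fail | No Expected Values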
Snakemake run block (source lines 158-161):
run:
  # Print the config
  config_json = json.dumps(config, indent = 2)
  print(config_json)
Snakemake run block (source lines 177-193):
run:
  config_copy = copy.deepcopy(config)
  # Remove other builds from config
  for build in config["builds"]:
    if build != wildcards.build:
      del config_copy["builds"][build]

  config_json = json.dumps(config_copy, indent = 2)

  # Create output directory
  if not os.path.exists(params.output_dir):
    os.mkdir(params.output_dir)

  # Save to json file
  outfile_path = output.json
  with open(outfile_path, "w") as outfile:
    outfile.write(config_json)
Snakemake run block (source lines 203-232):
run:
  for rule in workflow.rules:
    print("-" * 160)
    print("rule: ", rule.name )
    if rule.docstring:
        print(rule.docstring)
    if rule._input:
        print("\tinput:")
        for in_file in rule.input:
            print("\t\t" + str(in_file))
        for in_file in rule.input.keys():
            print("\t\t" + in_file + ": " + str(rule.input[in_file]))
    if rule._output:
        print("\toutput:")
        for out_file in rule.output:
            print("\t\t" + out_file)
        for out_file in rule.output.keys():
            print("\t\t" + out_file + ": " + str(rule.output[out_file]))
    if rule._params:
        print("\tparams:")
        for param in rule.params.keys():
            print("\t\t" + param + ": " + str(rule.params[param]))
    if rule.resources:
        print("\tresources:")
        for resource in rule.resources.keys():
            print("\t\t" + resource + ": " + str(rule.resources[resource]))
    if rule.conda_env:
        print("\t\tconda: ", rule.conda_env)
    if rule._log:
        print("\t\tlog: ", rule._log)
Snakemake shell block (source lines 265-270):
shell:
  """
  python3 scripts/download_issues.py --breakpoints {input.breakpoints} 1> {output.issues} 2> {log};
  csvtk cut -t -f "issue,lineage" {output.issues} | tail -n+2   1> {output.issue_to_lineage} 2>> {log};
  python3 scripts/plot_breakpoints.py --lineages {input.breakpoints} --outdir {params.outdir} --autoscale >> {log} 2>&1;
  """
Snakemake shell block (source lines 294-297):
shell:
  """
  python3 scripts/lineage_tree.py --output {output.tree} > {log};
  """
Snakemake shell block (source lines 321-325):
shell:
  """
  # Download dataset
  nextclade dataset get --name {wildcards.dataset} --tag {wildcards.tag} --output-dir {output.dataset_dir} > {log} 2>&1;
  """
Snakemake shell block (source lines 383-401):
shell:
  """
  # Align sequences
  nextclade run \
    --jobs {resources.cpus} \
    --input-dataset {input.dataset} \
    --output-all {params.outdir} \
    --output-selection {params.selection} \
    --output-tsv {output.qc} \
    --output-fasta {output.alignment} \
    --output-basename {params.basename} \
    {input.sequences} \
    >> {log} 2>&1;

  # Merge QC output with metadata
  csvtk rename -t -f "seqName" -n "strain" {output.qc} 2>> {log} \
    | csvtk merge -t -f "strain" {input.metadata} - \
    1> {output.metadata} 2>> {log};
  """
Snakemake shell block (source lines 446-461):
shell:
  """
  # Extract recombinant strains
  csvtk grep -t -v -f "clade" -r -p "{params.exclude_clades}" {input.qc}  2> {log} \
    | csvtk grep -t -f "qc.mixedSites.status" -p "bad" -v 2>> {log} \
    | csvtk grep -t -f "{params.fields}" -r -p "{params.values}" 2>> {log} \
    | csvtk cut -t -f "seqName" 2>> {log} \
    | tail -n+2 \
    > {output.strains};
  # Extract recombinant sequences
  seqkit grep -f {output.strains} {input.alignment} 1> {output.alignment} 2>> {log};
  # Extract recombinant qc
  csvtk grep -t -P {output.strains} {input.qc} 1> {output.qc} 2>> {log};
  # Extract non-recombinants qc
  csvtk grep -t -P {output.strains} -v {input.qc} 1> {output.non} 2>> {log};
  """
Snakemake shell block (source lines 486-489):
shell:
  """
  python3 scripts/rbd_levels.py --rbd-definition {input.rbd_definition} --nextclade {input.nextclade} --output {output.table} --log {log};
  """
Snakemake shell block (source lines 565-574):
shell:
  """
  scripts/sc2rf.sh \
    --alignment {input.alignment} \
    --output-ansi {output.ansi} \
    --output-csv {output.csv} \
    --log {log} \
    --max-name-length {params.max_name_length} \
    {params.sc2rf_args};
  """
Snakemake shell block (source lines 711-739):
shell:
  """
  python3 sc2rf/postprocess.py \
    --csv {params.csv} \
    --ansi {params.ansi} \
    --prefix {params.prefix} \
    --outdir {params.outdir} \
    --aligned {input.alignment} \
    --issues {input.issues} \
    --nextclade {input.nextclade} \
    --nextclade-no-recomb {input.nextclade_no_recomb} \
    --lineage-tree {input.lineage_tree} \
    {params.metadata} \
    {params.auto_pass} \
    {params.motifs} \
    {params.min_len} \
    {params.min_consec_allele} \
    {params.max_breakpoint_len} \
    {params.max_parents} \
    {params.max_breakpoints} \
    {params.dup_method} \
    {params.lapis} \
    {params.gisaid_access_key} \
    --log {log} \
    >> {log} 2>&1;

  # Cleanup
  mv -f {params.outdir}/recombinants.tsv {params.outdir}/stats.tsv >> {log} 2>&1;
  """
Snakemake shell block (source lines 786-796):
shell:
  """
  bash scripts/summary.sh \
    --output {output.summary} \
    --nextclade {input.nextclade} \
    --sc2rf {input.sc2rf} \
    --rbd-levels {input.rbd_levels} \
    --nextclade-dataset {params.nextclade_dataset}_{params.nextclade_tag} \
    {params.extra_cols} \
    > {log} 2>&1;
  """
Snakemake shell block (source lines 860-878):
shell:
  """
  # Create the strain tables
  python3 scripts/linelist.py \
    --input {input.summary} \
    --issues {input.issues} \
    --lineage-tree {input.lineage_tree} \
    --outdir {params.outdir} \
    {params.extra_cols} \
    {params.min_lineage_size} \
    {params.min_private_muts} \
    > {log} 2>&1;

  # Create the lineages table
  python3 scripts/lineages.py --input {output.positives} --output {output.lineages} {params.geo} >> {log} 2>&1;

  # Create the parents table
  python3 scripts/parents.py --input {output.positives} --output {output.parents} >> {log} 2>&1;
  """
Snakemake shell block (source lines 966-1007):
shell:
  """
  python3 scripts/plot.py \
    --input {input.positives} \
    --outdir {output.plots} \
    {params.lag} \
    {params.geo} \
    {params.weeks} \
    {params.min_date} \
    {params.max_date} \
    {params.min_cluster_size} \
    > {log} 2>&1;

  # Extract the cluster IDs to be plotted
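  # (csvtk headers prints one column name per line; tail drops the first,
  #  tr joins the rest with commas, and sed strips the trailing comma)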
  cluster_ids=$(csvtk headers -t {output.plots}/cluster_id.tsv | tail -n+2 | tr "\\n" "," | sed 's/,$/\\n/g')

  # Plot breakpoints by clade
  python3 scripts/plot_breakpoints.py \
    --lineages {input.lineages} \
    --lineage-col recombinant_lineage_curated \
    --positives {input.positives} \
    --outdir {output.plots} \
    --parent-col parents_clade \
    --parent-type clade \
    --cluster-col cluster_id \
    --clusters "${{cluster_ids}}" \
    {params.autoscale} \
    >> {log} 2>&1;

  # Plot breakpoints by lineage
  python3 scripts/plot_breakpoints.py \
    --lineages {input.lineages} \
    --lineage-col recombinant_lineage_curated \
    --positives {input.positives} \
    --outdir {output.plots} \
    --parent-col parents_lineage \
    --parent-type lineage \
    --cluster-col cluster_id \
    --clusters "${{cluster_ids}}" \
    {params.autoscale} \
    >> {log} 2>&1;
  """
Snakemake shell block (source lines 1072-1079):
shell:
  """
  # Create the excel report
  csvtk csv2xlsx -t -o {output.xlsx} {input.tables} > {log} 2>&1;

  # Create the powerpoint slides
  python3 scripts/report.py \
    --linelist {input.linelist} \
    --plot-dir {input.plots} \
    --output {output.pptx} \
    {params.geo} {params.template} {params.min_cluster_size} \
    >> {log} 2>&1;
  """
Snakemake shell block (source lines 1120-1133):
shell:
  """
  python scripts/validate.py --expected {input.expected} --observed {input.observed} --outdir {params.outdir};

  # Throw a pipeline error if any sample has "Fail"
  build_status=$(cat {output.status})

  if [[ $build_status == "Fail" ]]; then

    echo "Build failed validation: {wildcards.build}" > {log}
    csvtk grep -t -f "status" -p "Fail" {output.table} >> {log}
    exit 1
  fi
  """
URL: https://github.com/ktmeaton/ncov-recombinant
Name: ncov-recombinant
Version: v0.7.0
License: MIT License