Workflow Steps and Code Snippets

225 tagged steps and code snippets that match the keyword "utils"

Single-Cell and Single-Nuclei Data Integration Workflow

import os
import anndata as adata
import argparse
import re
from utils.integrate_scanorama import integrate_scanorama

# define arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input_anndata',
                    dest = 'input_anndata',
                    required = True,
                    help = 'Path to HDF5 file with merged AnnData object to integrate')
parser.add_argument('-o', '--output_anndata',
                    dest = 'output_anndata',
                    required = True,
                    help = 'Path to HDF5 file to save the integrated AnnData object')
parser.add_argument('-b', '--batch_column',
                    dest = 'batch_column',
                    default = 'batch',
                    help = 'The name of the column in `anndata.obs` that indicates the batches for each cell, '
                            ' typically this corresponds to the library id.')
parser.add_argument('--use_hvg',
                    dest = 'use_hvg',
                    action = 'store_true',
                    default = False,
                    help = 'Boolean indicating whether or not to use only highly variable genes for data integration.'
                    ' If --use_hvg is used, integration will only consider highly variable genes, and similarly '
                    'the returned integrated object will only contain the highly variable genes.')
parser.add_argument('--corrected_only',
                    dest = 'corrected_only',
                    action = 'store_true',
                    help = 'Boolean indicating to return only the corrected data or all raw data.'
                    ' Default will return all data. To return only corrected data, use --corrected_only.')
parser.add_argument('-s', '--seed',
                    dest = 'seed',
                    type=int,
                    default = None,
                    help = 'Random seed to set for scanorama.')

args = parser.parse_args()

# compile extension regex
file_ext = re.compile(r"\.hdf5$|\.h5$", re.IGNORECASE)

# check that input file exists, if it does exist, make sure it's an h5 file
if not os.path.exists(args.input_anndata):
    raise FileNotFoundError("--input_anndata file not found.")
elif not file_ext.search(args.input_anndata):
    raise ValueError("--input_anndata must end in either .hdf5 or .h5 and contain a merged AnnData object.")

# make sure output file path is h5 file
if not file_ext.search(args.output_anndata):
    raise ValueError("--output_anndata must provide a file path ending in either .hdf5 or .h5.")

# check that output file directory exists and create directory if doesn't exist
integrated_adata_dir = os.path.dirname(args.output_anndata)
if integrated_adata_dir and not os.path.isdir(integrated_adata_dir):
    os.makedirs(integrated_adata_dir, exist_ok = True)

# read in merged anndata object
merged_adata = adata.read_h5ad(args.input_anndata)

if args.use_hvg:
    try:
        var_genes = list(merged_adata.uns['variable_genes'])
    except KeyError:
        print("Variable genes cannot be found in anndata object."
              " If using --use_hvg, make sure HVG are stored in adata.uns['variable_genes'].")
        raise
else:
    var_genes = list(merged_adata.var_names)

# integrate anndata with scanorama
scanorama_integrated_adata = integrate_scanorama(merged_adata,
                                                 integrate_genes = var_genes,
                                                 batch_column = args.batch_column,
                                                 seed = args.seed)

# remove raw data and logcounts to minimize space if corrected_only is true
if args.corrected_only:
    print("Removing raw data and log-normalized data. Only corrected data will be returned.")
    scanorama_integrated_adata.X = None
    del scanorama_integrated_adata.layers["logcounts"]

# write anndata to h5
scanorama_integrated_adata.write(filename = args.output_anndata)
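
The heavy lifting here is delegated to utils.integrate_scanorama, which is imported above but not included in this listing. The sketch below is a minimal, hypothetical version of such a helper, assuming Scanorama's correct_scanpy API; the batch splitting and re-merging details are illustrative, not the pipeline's actual implementation.

import anndata as ad
import scanorama

def integrate_scanorama(merged_adata, integrate_genes, batch_column="batch", seed=None):
    """Split the merged object by batch, run Scanorama, and return the re-merged result."""
    if seed is not None:
        import random
        import numpy as np
        random.seed(seed)
        np.random.seed(seed)
    # restrict to the genes selected for integration (all genes or HVG only)
    subset = merged_adata[:, integrate_genes]
    batches = list(subset.obs[batch_column].unique())
    adatas = [subset[subset.obs[batch_column] == batch].copy() for batch in batches]
    # corrected expression ends up in .X, the joint embedding in .obsm["X_scanorama"]
    corrected = scanorama.correct_scanpy(adatas, return_dimred=True)
    return ad.concat(corrected, join="outer")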
import os
import anndata as adata
import argparse
import re
from utils.integrate_scvi import integrate_scvi

# define arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input_anndata',
                    dest = 'input_anndata',
                    required = True,
                    help = 'Path to HDF5 file with merged AnnData object to integrate')
parser.add_argument('-o', '--output_anndata',
                    dest = 'output_anndata',
                    required = True,
                    help = 'Path to HDF5 file to save the integrated AnnData object')
parser.add_argument('-b', '--batch_column',
                    dest = 'batch_column',
                    default = 'batch',
                    help = 'The name of the column in `anndata.obs` that indicates the batches for each cell, '
                            ' typically this corresponds to the library id.')
parser.add_argument('--categorical_covariates',
                    dest = 'categorical_covariates',
                    default = None,
                    type = str,
                    help = 'A comma-separated list of columns containing additional categorical data to be'
                    ' included as a covariate. Default is None.')
parser.add_argument('--continuous_covariates',
                    dest = 'continuous_covariates',
                    default = "subsets_mito_percent",
                    type = str,
                    help = 'A comma-separated list of columns containing additional continous data to be'
                    ' included as a covariate. Default is "subsets_mito_percent".')
parser.add_argument('--num_latent',
                    dest = 'num_latent',
                    type = int,
                    default = 20,
                    help = 'Number of dimensions to return from the latent space.')
parser.add_argument('--use_hvg',
                    dest = 'use_hvg',
                    action = 'store_true',
                    default = False,
                    help = 'Boolean indicating whether or not to use only highly variable genes for data integration.'
                    ' If --use_hvg is used, the returned integrated object will only contain the highly variable genes.')
parser.add_argument('--corrected_only',
                    dest = 'corrected_only',
                    action = 'store_true',
                    help = 'Boolean indicating to return only the corrected data or all raw data.'
                    ' Default will return all data. To return only corrected data, use --corrected_only.')
parser.add_argument('-s', '--seed',
                    dest = 'seed',
                    type=int,
                    default = None,
                    help = 'Random seed to set for scVI.')

args = parser.parse_args()

# compile extension regex
file_ext = re.compile(r"\.hdf5$|\.h5$", re.IGNORECASE)

# check that input file exists, if it does exist, make sure it's an h5 file
if not os.path.exists(args.input_anndata):
    raise FileNotFoundError("--input_anndata file not found.")
elif not file_ext.search(args.input_anndata):
    raise ValueError("--input_anndata must end in either .hdf5 or .h5 and contain a merged AnnData object.")

# make sure output file path is h5 file
if not file_ext.search(args.output_anndata):
    raise ValueError("--output_anndata must provide a file path ending in either .hdf5 or .h5.")

# check that output file directory exists and create directory if doesn't exist
integrated_adata_dir = os.path.dirname(args.output_anndata)
if integrated_adata_dir:
    os.makedirs(integrated_adata_dir, exist_ok = True)

# read in merged anndata object
merged_adata = adata.read_h5ad(args.input_anndata)

if args.use_hvg:
    try:
        var_genes = list(merged_adata.uns['variable_genes'])
    except KeyError:
        print("Variable genes cannot be found in anndata object."
              " If using --use_hvg, make sure HVG are stored in adata.uns['variable_genes'].")
        raise
else:
    var_genes = list(merged_adata.var_names)

# split covariates from comma separated strings into lists
if args.categorical_covariates:
    categorical_covariates = [covariate.strip() for covariate in args.categorical_covariates.split(',')]
else:
    categorical_covariates = None

if args.continuous_covariates:
    if args.continuous_covariates == "None":
        continuous_covariates = None
    else:
        continuous_covariates = [covariate.strip() for covariate in args.continuous_covariates.split(',')]
else:
    continuous_covariates = None

# integrate anndata with scvi
scvi_integrated_adata = integrate_scvi(merged_adata,
                                       integrate_genes = var_genes,
                                       batch_column = args.batch_column,
                                       categorical_covariate_columns = categorical_covariates,
                                       continuous_covariate_columns = continuous_covariates,
                                       num_latent = args.num_latent,
                                       seed = args.seed)

# remove raw data and logcounts to minimize space if corrected_only is true
if args.corrected_only:
    print("Removing raw data and log-normalized data. Only corrected data will be returned.")
    scvi_integrated_adata.X = None
    del scvi_integrated_adata.layers["logcounts"]


# write anndata to h5
scvi_integrated_adata.write(filename = args.output_anndata)
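
As with the Scanorama step, the actual integration is implemented in utils.integrate_scvi, which is not part of this listing. A minimal sketch of what such a helper might look like, assuming the scvi-tools API (training settings and key names below are assumptions, not the pipeline's code):

import scvi

def integrate_scvi(merged_adata, integrate_genes, batch_column="batch",
                   categorical_covariate_columns=None, continuous_covariate_columns=None,
                   num_latent=20, seed=None):
    """Fit scVI on the merged object and store the latent representation in .obsm."""
    if seed is not None:
        scvi.settings.seed = seed
    adata = merged_adata[:, integrate_genes].copy()
    scvi.model.SCVI.setup_anndata(
        adata,
        batch_key=batch_column,
        categorical_covariate_keys=categorical_covariate_columns,
        continuous_covariate_keys=continuous_covariate_columns,
    )
    model = scvi.model.SCVI(adata, n_latent=num_latent)
    model.train()
    adata.obsm["X_scVI"] = model.get_latent_representation()
    return adata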

TYTUS - UBCS statistics calculation tool

from utils import get_chrom_sizes
from consts import QUALITY_WINDOW_LENGTH, MAX_NUMBER_OF_MISMACHES

from Bio.Seq import Seq

alignment_file = snakemake.input['alignment_file']
snd_file = snakemake.output['snd_file']
chrom_sizes_file = snakemake.input['query_chrom_sizes_file']

chrom_sizes = get_chrom_sizes(chrom_sizes_file)

query_name = snakemake.wildcards['query']
target_name = snakemake.wildcards['target']
ffrom = snakemake.params['ffrom']

snp_file = snakemake.input['snp_file']

def get_snps(snp_file):

    snps = {}

    with open(snp_file) as f:

        for line in f:

            line = line.split()

            start_coord = int(line[1]) + 1
            end_coord = int(line[2]) + 1

            for i in range(start_coord, end_coord):
                base = line[4]
                snps[i] = (base, line)

    return snps


def get_axt_entry(f, ffrom):

    header = f.readline()

    if not header: return None

    header = header.split()

    strand = header[7]

    if ffrom == 'target':

        t_chrom = header[1]
        t_start = int(header[2])

        q_chrom = header[4]
        q_start = int(header[5])
        q_end = int(header[6])

        target_al_seq = f.readline().rstrip().upper()
        query_al_seq = f.readline().rstrip().upper()

    else:

        t_chrom = header[4]
        t_start = int(header[5])

        q_chrom = header[1]
        q_start = int(header[2])

        query_al_seq = f.readline().rstrip().upper()
        target_al_seq =  f.readline().rstrip().upper()           

    f.readline()

    return t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq 

def add_snds_from_axt_entry(t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq, snd_bed_entries):

    t_count = 0
    q_count = 0

    for i, (t_chr, q_chr) in enumerate(zip(target_al_seq, query_al_seq)):

        if t_chr != '-':
            t_count += 1

        if q_chr != '-':
            q_count += 1

        if t_chr == q_chr:
            continue

        t_window = target_al_seq[i - QUALITY_WINDOW_LENGTH: i + QUALITY_WINDOW_LENGTH + 1]
        q_window = query_al_seq[i - QUALITY_WINDOW_LENGTH: i + QUALITY_WINDOW_LENGTH + 1]

        if len(t_window) != 2 * QUALITY_WINDOW_LENGTH + 1:  continue

        #print('enough place')

        if '-' in t_window + q_window: continue

        #print('without deletions and insertions')

        if len([ 1  for t_win_chr, q_win_chr  in zip(t_window, q_window) if t_win_chr != q_win_chr]) > MAX_NUMBER_OF_MISMACHES: continue

        #print('number of  mismatches above max threshold')

        t_subst_start = t_start + t_count - 2

        if strand == '-' and ffrom == 'query':
            t_subst_start = chrom_sizes[t_chrom] - (t_start + t_count - 1)
            t_window = str(Seq(t_window).reverse_complement())
            q_window = str(Seq(q_window).reverse_complement())

        t_coord_start = str(t_subst_start - QUALITY_WINDOW_LENGTH)
        t_coord_end =  str(t_subst_start + QUALITY_WINDOW_LENGTH + 1)

        t_coords = t_chrom + ':' + t_coord_start + '-' + t_coord_end

        q_subst_start = q_start + q_count - 2

        if strand == '-' and ffrom == 'target':
            q_subst_start = chrom_sizes[q_chrom] - (q_start + q_count - 1)

        q_coord_start = str(q_subst_start - QUALITY_WINDOW_LENGTH)
        q_coord_end = str(q_subst_start + QUALITY_WINDOW_LENGTH + 1)

        q_coords = q_chrom + ':' + q_coord_start + '-' + q_coord_end

        t_snp =  int(t_coord_start) +  QUALITY_WINDOW_LENGTH + 1 in snps

        #if t_output_window[5] != snps[int(t_coord_start)+ 6][0]:
        #    print(t_output_window, t_output_window[5], snps[int(t_coord_start) + 6]

        name = ('SND_ID:'  + str(len(snd_bed_entries)) , 'QUERY:' + query_name, 'TARGET:'+ target_name,
                'TARGET_COORDS:' + t_coords, 'QUERY_COORDS:' + q_coords,
                'CHANGE:' +  t_window + '>' + q_window, 'FROM:' + ffrom, 'SNP:' + str(t_snp), 'STRAND:' + strand)

        name = ';'.join(name)

        snd_bed_entry = '\t'.join([t_chrom, t_coord_start, t_coord_end, name, '0', strand ])

        snd_bed_entries[int(t_coord_start)] = snd_bed_entry

def get_snd_bed_entries(alignment_file, snps):

    with open(alignment_file) as f:

        snd_bed_entries = {}

        while True:

            axt_entry = get_axt_entry(f, ffrom)

            if axt_entry is None:
                break

            t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq = axt_entry

            add_snds_from_axt_entry(t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq, snd_bed_entries)

        return snd_bed_entries

def save_snd_bed_entries(snd_bed_entries_file, snd_bed_entries):

    with open(snd_bed_entries_file, 'w') as f_out:

        for t_coord_start in sorted(snd_bed_entries):
            snd_bed_entry = snd_bed_entries[t_coord_start]
            f_out.write(snd_bed_entry)
            f_out.write('\n')

snps = get_snps(snp_file)
snd_bed_entries = get_snd_bed_entries(alignment_file, snps)
save_snd_bed_entries(snd_file, snd_bed_entries)
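
get_snps assumes a whitespace-separated, BED-like SNP file: column 2 holds the start, column 3 the end, and column 5 the variant base. The small illustration below (values made up) shows how one such line populates the lookup used when flagging substitutions that overlap known SNPs; coordinates are shifted by +1 to match the 1-based counting used while walking the axt alignment.

example_line = "chr1\t12344\t12345\trs0000\tA".split()
snps = {}
for i in range(int(example_line[1]) + 1, int(example_line[2]) + 1):
    snps[i] = (example_line[4], example_line)
# snps is now {12345: ('A', ['chr1', '12344', '12345', 'rs0000', 'A'])}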

Workflow for detecting PMCs in confocal images (v0.1.0)

import json
import os
import sys
from collections import namedtuple
import logging
import itertools

import numpy as np
import xarray as xr
import skimage
from skimage import exposure, io, morphology, measure
from skimage.filters import threshold_otsu

import bigfish.detection as bf_detection
import bigfish.stack as bf_stack
import bigfish.plot as bf_plot

BoundingBox = namedtuple("BoundingBox", ["ymin", "ymax", "xmin", "xmax"])
try:
    sys.path.append(os.path.dirname(__file__))
except NameError:
    pass
import utils


def normalize_zstack(z_stack, bits):
    """Normalize z-slices in a 3D image using contrast stretching

    Parameters
    ----------
    z_stack : numpy.ndarray
        3 dimensional confocal FISH image.
    bits : int
        Bit depth of image.

    Returns
    -------
    numpy.ndarray
        Z-corrected image with each slice minimum and maximum matched
    """
    out = np.array(
        [
            exposure.rescale_intensity(
                x, in_range=(0, 2 ** bits - 1), out_range=(z_stack.min(), z_stack.max())
            )
            for x in z_stack
        ]
    )
    return skimage.img_as_uint(exposure.rescale_intensity(out))


def read_bit_img(img_file, bits=12):
    """Read an image and return as a 16-bit image."""
    img = exposure.rescale_intensity(
        io.imread(img_file),
        in_range=(0, 2 ** (bits) - 1)
        #         out_range=(0, )
    )
    return skimage.img_as_uint(img)


def select_signal(image, p_in_focus=0.75, margin_width=10):
    """
    Generate a bounding box of the FISH image to select areas where signal is present.

    Parameters
    ----------
    image : np.ndarray
        3D FISH image
    p_in_focus : float, optional
        Percent of in-focus slices to retain for 2D projection, by default 0.75.
    margin_width : int, optional
        Number of pixels to pad selection by. Default is 10.

    Returns
    -------
    namedtuple
        minimum and maximum coordinate values of the bounding box in the xy plane
    """
    image = image.astype(np.uint16)
    focus = bf_stack.compute_focus(image)
    selected = bf_stack.in_focus_selection(image, focus, p_in_focus)
    projected_2d = bf_stack.maximum_projection(selected)
    foreground = np.where(projected_2d > threshold_otsu(projected_2d))
    limits = BoundingBox(
        ymin=max(foreground[0].min() - margin_width, 0),
        ymax=min(foreground[0].max() + margin_width, image.shape[1]),
        xmin=max(foreground[1].min() - margin_width, 0),
        xmax=min(foreground[1].max() + margin_width, image.shape[2]),
    )
    return limits


def crop_to_selection(img, bbox):
    """
    Crop image to selection defined by bounding box.

    Crops a 3D image to specified x and y coordinates.
    Parameters
    ----------
    img : np.ndarray
        3-dimensional image to crop
    bbox : namedtuple
        Tuple defining minimum and maximum coordinates for x and y planes.

    Returns
    -------
    np.ndarray
        3D image cropped to the specified selection.
    """
    return img[:, bbox.ymin : bbox.ymax, bbox.xmin : bbox.xmax]


def count_spots_in_labels(spots, labels):
    """
    Count the number of RNA molecules in specified labels.

    Parameters
    ----------
    spots : np.ndarray
        Coordinates in original image where RNA molecules were detected.
    labels : np.ndarray
        Integer array of same shape as `img` denoting regions of interest to quantify.
        Each separate region should be uniquely labeled.

    Returns
    -------
    dict
        dictionary containing the number of molecules contained in each labeled region.
    """
    assert spots.shape[1] == len(labels.shape)
    n_labels = len(np.unique(labels)) - 1  # subtract one for background
    counts = {i: 0 for i in range(1, n_labels + 1)}
    for each in spots:
        if len(each) == 3:
            cell_label = labels[each[0], each[1], each[2]]
        else:
            cell_label = labels[each[0], each[1]]
        if cell_label != 0:
            counts[cell_label] += 1
    return counts


def preprocess_image(
    img, smooth_method="gaussian", sigma=7, whitehat=True, selem=None, stretch=99.99
):
    scaled = exposure.rescale_intensity(
        img, in_range=tuple(np.percentile(img, [0, stretch]))
    )
    if smooth_method == "log":
        smooth_func = bf_stack.log_filter
        to_smooth = bf_stack.cast_img_float64(scaled)
    elif smooth_method == "gaussian":
        smooth_func = bf_stack.remove_background_gaussian
        to_smooth = bf_stack.cast_img_uint16(scaled)
    else:
        raise ValueError(f"Unsupported background filter: {smooth_method}")
    if whitehat:
        f = lambda x, s: morphology.white_tophat(smooth_func(x, s), selem)
    else:
        f = lambda x, s: smooth_func(x, s)
    smoothed = np.stack([f(img_slice, sigma) for img_slice in to_smooth])
    return bf_stack.cast_img_float64(np.stack(smoothed))


def count_spots(
    smoothed_signal,
    cell_labels,
    voxel_size_nm,
    dot_radius_nm,
    smooth_method="gaussian",
    decompose_alpha=0.5,
    decompose_beta=1,
    decompose_gamma=5,
    verbose=False,
):

    if verbose:
        spot_radius_px = bf_detection.get_object_radius_pixel(
            voxel_size_nm=voxel_size_nm, object_radius_nm=dot_radius_nm, ndim=3
        )
        logging.info("spot radius (z axis): %0.3f pixels", spot_radius_px[0])
        logging.info("spot radius (yx plan): %0.3f pixels", spot_radius_px[-1])
    spots, threshold = bf_detection.detect_spots(
        smoothed_signal,
        return_threshold=True,
        voxel_size=voxel_size_nm,
        spot_radius=dot_radius_nm,
    )
    if verbose:
        logging.info("%d spots detected...", spots.shape[0])
        logging.info("plotting threshold optimization for spot detection...")
        bf_plot.plot_elbow(
            smoothed_signal,
            voxel_size=voxel_size_nm,
            spot_radius=dot_radius_nm,
        )
    decompose_cast = {
        "gaussian": bf_stack.cast_img_uint16,
        "log": bf_stack.cast_img_float64,
    }
    try:
        (
            spots_post_decomposition,
            dense_regions,
            reference_spot,
        ) = bf_detection.decompose_dense(
            decompose_cast[smooth_method](smoothed_signal),
            spots,
            voxel_size=voxel_size_nm,
            spot_radius=dot_radius_nm,
            alpha=decompose_alpha,  # alpha impacts the number of spots per candidate region
            beta=decompose_beta,  # beta impacts the number of candidate regions to decompose
            gamma=decompose_gamma,  # gamma the filtering step to denoise the image
        )
        logging.info(
            "detected spots before decomposition: %d\n"
            "detected spots after decomposition: %d",
            spots.shape[0],
            spots_post_decomposition.shape[0],
        )
        if verbose:
            print(
                f"detected spots before decomposition: {spots.shape[0]}\n"
                f"detected spots after decomposition: {spots_post_decomposition.shape[0]}\n"
                f"shape of reference spot for decomposition: {reference_spot.shape}"
            )
            bf_plot.plot_reference_spot(reference_spot, rescale=True)
    except RuntimeError:
        logging.warning("decomposition failed, using originally identified spots")
        spots_post_decomposition = spots
    n_labels = len(np.unique(cell_labels)) - 1
    counts = {i: 0 for i in range(1, n_labels + 1)}
    expression_3d = np.zeros_like(smoothed_signal)
    # get slices to account for cropping
    for each in spots_post_decomposition:
        spot_coord = tuple(each)
        cell_label = cell_labels[spot_coord]
        if cell_label != 0:
            counts[cell_label] += 1
    for region in measure.regionprops(cell_labels):
        expression_3d[region.slice][region.image] = counts[region.label]
    return counts, expression_3d


def average_intensity(smoothed_signal, cell_labels):
    n_labels = len(np.unique(cell_labels)) - 1
    intensities = {i: 0 for i in range(1, n_labels + 1)}
    z_normed_smooth = (smoothed_signal - smoothed_signal.mean()) / smoothed_signal.std()
    expression_3d = np.zeros_like(z_normed_smooth)
    for region in measure.regionprops(cell_labels, z_normed_smooth):
        intensities[region.label] = region.mean_intensity
        expression_3d[region.slice][region.image] = region.mean_intensity
    return intensities, expression_3d


def quantify_expression(
    fish_img,
    cell_labels,
    measures=["spots", "intensity"],
    voxel_size_nm=None,
    dot_radius_nm=None,
    whitehat=True,
    whitehat_selem=None,
    smooth_method="gaussian",
    smooth_sigma=1,
    decompose_alpha=0.5,
    decompose_beta=1,
    decompose_gamma=5,
    bits=12,
    crop_image=True,
    verbose=False,
):
    """
    Count the number of molecules in an smFISH image

    Parameters
    ----------
    fish_img : np.ndarray
        Image in which to perform molecule counting
    cell_labels : np.ndarray
        Integer array of same shape as `img` denoting regions of interest to quantify.
        Each separate region should be uniquely labeled.
    measures : list
        Measures to use to quantify expression. Possible values are "spots",
        "intensity", and ["spots", "intensity"]. Default is to measure both
    voxel_size_nm : tuple(float, int), None
        Physical dimensions of each voxel in ZYX order. Required if running spot
        counting.
    dot_radius_nm : tuple(float, int), None
        Physical size of expected dots. Required if running spot
        counting.
    whitehat : bool, optional
        Whether to perform white tophat filtering prior to image de-noising, by default True
    whitehat_selem : [int, np.ndarray], optional
        Structuring element to use for white tophat filtering.
    smooth_method : str, optional
        Method to use for image de-noising. Possible values are "log" and "gaussian" for
        Laplacian of Gaussians and Gaussian background subtraction, respectively. By default "gaussian".
    smooth_sigma : [int, np.ndarray], optional
        Sigma value to use for smoothing function, by default 1
    decompose_alpha : float, optional
        Intensity percentile used to compute the reference spot, between 0 and 1.
        By default 0.5. For more information, see:
        https://big-fish.readthedocs.io/en/stable/detection/dense.html
    decompose_beta : int, optional
        Multiplicative factor for the intensity threshold of a dense region,
        by default 1. For more information, see:
        https://big-fish.readthedocs.io/en/stable/detection/dense.html
    decompose_gamma : int, optional
        Multiplicative factor use to compute a gaussian scale, by default 5.
        For more information, see:
        https://big-fish.readthedocs.io/en/stable/detection/dense.html
    bits : int, optional
        Bit depth of original image, used for scaling image intensities. By default 12.
    crop_image : bool, optional
        Whether to crop signal. Default is True.
    verbose : bool, optional
        Whether to verbosely print results and progress.

    Returns
    -------
    (np.ndarray, dict)
        np.ndarray: positions of all identified mRNA molecules.
        dict: dictionary containing the number of molecules contained in each labeled region.
    """
    if (voxel_size_nm is None or dot_radius_nm is None) and "spots" in measures:
        raise ValueError(
            "Require `voxel_size_nm` and `dot_radius_nm` when performing spot counting."
        )
    if crop_image:
        if verbose:
            logging.info("Cropping image to signal")
        limits = select_signal(fish_img)
        if verbose:
            logging.info(
                "Cropped image to %d x %d",
                limits.ymax - limits.ymin,
                limits.xmax - limits.xmin,
            )
    else:
        # create BoundingBox that selects whole image
        limits = BoundingBox(
            ymin=0, ymax=fish_img.shape[1], xmin=0, xmax=fish_img.shape[2]
        )
    if verbose:
        logging.info("Preprocessing image.")
    cropped_img = skimage.img_as_float64(
        exposure.rescale_intensity(
            crop_to_selection(fish_img, limits),
            in_range=(0, 2 ** bits - 1),
            out_range=(0, 1),
        )
    )
    smoothed = preprocess_image(
        cropped_img, smooth_method, smooth_sigma, whitehat, whitehat_selem, 99.99
    )

    cropped_labels = crop_to_selection(cell_labels, limits)
    quant = dict()
    if "spots" in measures:
        counts, counts_3d = count_spots(
            smoothed,
            cropped_labels,
            voxel_size_nm=voxel_size_nm,
            dot_radius_nm=dot_radius_nm,
            smooth_method=smooth_method,
            decompose_alpha=decompose_alpha,
            decompose_beta=decompose_beta,
            decompose_gamma=decompose_gamma,
            verbose=verbose,
        )
        quant["spots"] = counts
        if crop_image:  # match original shape if cropped
            counts_3d = utils.pad_to_shape(counts_3d, fish_img.shape)
    if "intensity" in measures:
        intensities, intense_3d = average_intensity(smoothed, cropped_labels)
        quant["intensity"] = intensities
        if crop_image:  # match original shape if cropped
            intense_3d = utils.pad_to_shape(intense_3d, fish_img.shape)
    if len(measures) == 2:
        expression_3d = np.stack([counts_3d, intense_3d])
    elif "spots" in measures:
        expression_3d = counts_3d
    else:
        expression_3d = intense_3d
    return (
        quant,
        expression_3d,
    )


def get_quant_measure(method):
    """Get method of quantification"""
    if method == "spots":
        return ["spots"]
    elif method == "intensity":
        return ["intensity"]
    elif method == "both":
        return ["spots", "intensity"]
    else:
        raise ValueError(f"Unrecognized method {method}.")


if __name__ == "__main__":
    import h5py
    import pandas as pd

    try:
        snakemake
    except NameError:
        snakemake = None
    if snakemake is not None:
        logging.basicConfig(filename=snakemake.log[0], level=logging.INFO)
        raw_img, dimensions = utils.read_image_file(
            snakemake.input["image"], as_nm=True
        )
        if dimensions is None and snakemake.input["image"].endswith(".h5"):
            with open(snakemake.params["dimensions"], "r") as handle:
                data = json.load(handle)
                dimensions = np.array([data[c] for c in "zyx"]) * (10 ** 3)
        labels = np.array(h5py.File(snakemake.input["labels"], "r")["image"])
        logging.info("%d labels detected.", len(np.unique(labels) - 1))
        start = int(snakemake.params["z_start"])
        stop = int(snakemake.params["z_end"])
        if stop < 0:
            stop = raw_img.shape[1]
        gene_params = snakemake.params["gene_params"]

        def has_probe_info(name, gene_params):
            if name not in gene_params.keys():
                logging.warning(
                    "No entry for %s found in gene parameters. Not quantifying signal",
                    name,
                )
                return False
            return True

        channels = {
            x: i
            for i, x in enumerate(snakemake.params["channels"].split(";"))
            if has_probe_info(x, gene_params)
        }
        if len(channels) == 0:
            raise ValueError(
                f"No quantification parameters provided for channels: {snakemake.params['channels'].replace(';', ', ')}"
            )
        genes = list(channels.keys())
        fish_exprs = {}
        summarized_images = [None] * len(channels)
        embryo = snakemake.wildcards["embryo"]
        measures = get_quant_measure(snakemake.params["quant_method"])
        for i, gene in enumerate(genes):
            logging.info(f"Quantifying {gene} signal...")
            fish_data = raw_img[channels[gene], start:stop, :, :]
            quant, image = quantify_expression(
                fish_data,
                labels,
                measures=measures,
                voxel_size_nm=dimensions.tolist(),
                dot_radius_nm=gene_params[gene]["radius"],
                whitehat=True,
                smooth_method="gaussian",
                smooth_sigma=7,
                verbose=True,
                bits=12,
                crop_image=snakemake.params["crop_image"],
            )
            for each in measures:
                fish_exprs[f"{gene}_{each}"] = quant[each]
            summarized_images[i] = image
        # write summarized expression images to netcdf using Xarray to keep
        # track of dims
        out_image = np.array(summarized_images)
        xr.DataArray(
            data=out_image,
            coords={"gene": genes, "measure": measures},
            dims=["gene", "measure", "Z", "Y", "X"],
        ).to_netcdf(snakemake.output["image"])
        exprs_df = pd.DataFrame.from_dict(fish_exprs)
        exprs_df.index.name = "label"
        physical_properties = (
            pd.DataFrame(
                measure.regionprops_table(
                    labels,
                    properties={
                        "label",
                        "centroid",
                        "area",
                        "equivalent_diameter",
                    },
                )
            )
            .rename(
                columns={
                    "centroid-0": "Z",
                    "centroid-1": "Y",
                    "centroid-2": "X",
                    "equivalent_diameter": "diameter",
                }
            )
            .set_index("label")
        )
        out = exprs_df.join(physical_properties)
        out["embryo"] = embryo
        out[
            [
                "embryo",
                "area",
                "diameter",
                "Z",
                "Y",
                "X",
            ]
            + [
                f"{gene}_{measure}"
                for (gene, measure) in itertools.product(genes, measures)
            ]
        ].to_csv(snakemake.output["csv"])
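
The main block produces two outputs: a NetCDF cube of per-label expression images with dims (gene, measure, Z, Y, X), and a per-label CSV of quantification values joined with region properties. A quick way to inspect them afterwards might look like the following; the file names are placeholders for the actual snakemake outputs.

import pandas as pd
import xarray as xr

expr = xr.open_dataarray("embryo1_expression.nc")            # placeholder for snakemake.output["image"]
table = pd.read_csv("embryo1_quant.csv", index_col="label")  # placeholder for snakemake.output["csv"]
print(expr.sel(gene=expr.gene.values[0], measure="spots").shape)  # one (Z, Y, X) volume
print(table.head())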
import sys
import os

import h5py
import numpy as np
from skimage import exposure

try:
    from intensipy import Intensify
except ImportError:
    pass

sys.path.append(os.path.dirname(__file__))
import utils


def get_channel_index(channels, channel):
    channel_index = [
        i for i, x in enumerate(channels.split(";")) if x.lower() == channel.lower()
    ][0]
    return channel_index


def preprocess_slice(img, upper_percentile=99.99, new_min=0, new_max=1):
    """Preprocess a Z-slice by scaling and equalizing intensities.

    Parameters
    ----------
    img : np.ndarray
        Image slice to scale and equalize.
    upper_percentile : float, optional
        Upper bound to clip intensities for scaling, by default 99.99.
    new_min : int, optional
        New minimum intensity value, by default 0.
    new_max : int, optional
        New maximum intensity value, by default 1.

    Returns
    -------
    np.ndarray
        Scaled and equalized image slice.
    """
    lb, ub = np.percentile(img, (0, upper_percentile))
    out = exposure.equalize_adapthist(
        exposure.rescale_intensity(img, in_range=(lb, ub), out_range=(new_min, new_max))
    )
    return out


if __name__ == "__main__":
    try:
        snakemake
    except NameError:
        snakemake = None
    if snakemake is not None:
        img, __ = utils.read_image_file(snakemake.input["image"])
        channel = get_channel_index(
            snakemake.params["channels"], snakemake.params["channel_name"]
        )
        z_start = int(snakemake.params["z_start"])
        z_stop = int(snakemake.params["z_end"])
        pmc = img[channel, z_start:z_stop, :, :]
        if snakemake.params["intensipy"]:
            model = Intensify(xy_norm=False, dy=29, dx=29)
            pmc = model.normalize(pmc.astype(float))
        else:
            pmc = np.array(
                [
                    preprocess_slice(x, upper_percentile=100, new_min=0, new_max=1)
                    for x in pmc
                ]
            )
        utils.to_hdf5(pmc, snakemake.output["h5"])
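
For a quick sanity check of the non-Intensipy branch, preprocess_slice can be exercised on synthetic data; the shape, seed, and bit depth below are arbitrary.

import numpy as np

rng = np.random.default_rng(0)
fake_slice = rng.integers(0, 2 ** 12, size=(256, 256)).astype(np.uint16)
scaled = preprocess_slice(fake_slice, upper_percentile=99.99)
print(scaled.min(), scaled.max())  # adaptive-equalized output lies in [0, 1]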

genomic relatedness detection pipeline (v1.8)

import pandas
import numpy
from utils.bcftools import bcftools_query


def interpolate(vcf_path, cm_path, output_path, chrom):

    genetic_map = pandas.read_csv(cm_path, sep=" ")

    pos = list(map(int, bcftools_query(vcf_path, arg="%POS\n", list_samples=False)))
    # interpolate CM from BP
    #chro = genetic_map[genetic_map.chr == int(chrom)]
    # because for chr19 'Genetic_Map(cM)' is 'Genetic_Map.cM.'
    cms = numpy.interp(pos, genetic_map.position.values, genetic_map.iloc[:, -1].values)

    eps = 1e-7
    new_cms = []
    for i in range(0, cms.shape[0]):
        if i == 0:
            new_cms.append(cms[i])
            continue

        if new_cms[-1] >= cms[i]:
            new_cms.append(new_cms[-1] + eps)
        else:
            new_cms.append(cms[i])

    with open(output_path, 'w') as w_map:
        for i, cm in enumerate(new_cms):
            if i > 0 and cm <= new_cms[i-1]:
                raise ValueError(f'cm positions {i-1} and {i} are NOT strictly increasing: {new_cms[:i+1]}')
            w_map.write(f"{i}\t{cm:.8f}\n")


if __name__ == '__main__':
    vcf_path = snakemake.input['vcf']
    cm_path = snakemake.input['cmmap']

    chrom = snakemake.wildcards['chrom']

    output_path = snakemake.output[0]
    interpolate(vcf_path, cm_path, output_path, chrom)
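
The epsilon correction inside interpolate exists because numpy.interp can return identical cM values for nearby base-pair positions, while downstream tools require a strictly increasing genetic map. A small worked example with made-up values:

cms = [0.10, 0.25, 0.25, 0.30]  # e.g. the output of numpy.interp
eps = 1e-7
new_cms = [cms[0]]
for value in cms[1:]:
    new_cms.append(new_cms[-1] + eps if new_cms[-1] >= value else value)
print(new_cms)  # [0.1, 0.25, 0.2500001, 0.3]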
from utils.ibd import merge_germline_segments, segments_to_germline, interpolate_all, read_germline_segments, read_pedsim_segments


if __name__ == '__main__':

    germline_path = snakemake.input['germline']
    map_dir = snakemake.params['cm_dir']
    gap = float(snakemake.params['merge_gap'])
    pedsim_path = snakemake.params['true_ibd']

    use_true_ibd = bool(snakemake.params['use_true_ibd'])
    if not use_true_ibd:
        segments = read_germline_segments(germline_path)
        segments = interpolate_all(segments, map_dir)
        segments = merge_germline_segments(segments, gap=gap)
        segments_to_germline(segments, snakemake.output['ibd'])
    else:
        true_segments = read_pedsim_segments(pedsim_path)
        segments_to_germline(true_segments, snakemake.output['ibd'])
import pandas
import numpy
import os
import logging
from collections import Counter, namedtuple
from utils.ibd import read_king_segments as rks, interpolate_all, Segment


def is_non_zero_file(fpath):
    return os.path.isfile(fpath) and os.path.getsize(fpath) > 0


def _sort_ids(data):
    unsorted_mask = data.id2 < data.id1
    data.loc[unsorted_mask, 'id1'], data.loc[unsorted_mask, 'id2'] = data.loc[unsorted_mask, 'id2'], data.loc[
        unsorted_mask, 'id1']


def read_bucket_dir(bucket_dir):

    total = None
    for file in os.listdir(bucket_dir):
        if not file.endswith('tsv'):
            continue
        path = os.path.join(bucket_dir, file)
        bucket = read_germline(path)
        if total is None:
            total = bucket
        else:
            total = pandas.concat([total, bucket])
    return total


def read_germline(ibd_path):
    germline_names = [
        'fid_iid1',
        'fid_iid2',
        'chrom',
        'start_end_bp',
        'start_end_snp',
        'snp_len',
        'genetic_len',
        'len_units',
        'mismatches',
        'is_homozygous1',
        'is_homozygous2'
    ]

    data = pandas.read_table(ibd_path, header=None, names=germline_names)
    if data.shape[0] == 0:
        return pandas.DataFrame(columns=['id1', 'id2', 'seg_count_germline', 'total_seg_len_germline']).set_index(['id1', 'id2'])

    data.loc[:, 'id1'] = data.fid_iid1.str.split().str[1]
    data.loc[:, 'id2'] = data.fid_iid2.str.split().str[1]
    _sort_ids(data)
    segments_info = data.loc[:, ['id1', 'id2', 'chrom', 'genetic_len']].groupby(by=['id1', 'id2']).agg({'genetic_len': 'sum', 'chrom': 'count'})
    segments_info.rename({'genetic_len': 'total_seg_len_germline', 'chrom': 'seg_count_germline'}, axis=1, inplace=True)
    return segments_info


def map_king_degree(king_degree):
    degree_map = {
        'Dup/MZ': 0,
        'PO': 1,
        'FS': 2,
        '2nd': 2,
        '3rd': 3,
        '4th': 4,
        'NA': numpy.nan
    }
    return [degree_map[kd] for kd in king_degree]


def map_king_relation(king_degree):
    degree_map = {
        'Dup/MZ': 0,
        'PO': 'PO',
        'FS': 'FS',
        '2nd': 2,
        '3rd': 3,
        '4th': 4,
        'NA': numpy.nan
    }
    return [degree_map[kd] for kd in king_degree]


def read_king(king_path):
    # FID1    ID1     FID2    ID2     MaxIBD1 MaxIBD2 IBD1Seg IBD2Seg PropIBD InfType
    try:
        data = pandas.read_table(king_path)
        data.loc[:, 'id1'] = data.FID1.astype(str) + '_' + data.ID1.astype(str)
        data.loc[:, 'id2'] = data.FID2.astype(str) + '_' + data.ID2.astype(str)
        _sort_ids(data)

        data.loc[:, 'king_degree'] = map_king_degree(data.InfType)
        data.loc[:, 'king_relation'] = map_king_relation(data.InfType)
        data.loc[:, 'king_degree'] = data.king_degree.astype(float).astype(pandas.Int32Dtype())
        data.rename({'PropIBD': 'shared_genome_proportion', 'IBD1Seg': 'king_ibd1_prop', 'IBD2Seg': 'king_ibd2_prop'}, axis='columns', inplace=True)
        indexed = data.loc[:, ['id1', 'id2', 'king_degree', 'king_relation', 'king_ibd1_prop', 'king_ibd2_prop', 'shared_genome_proportion']].\
            set_index(['id1', 'id2'])
        return indexed

    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(data=None, index=None,
                                columns=['id1',
                                         'id2',
                                         'king_degree',
                                         'king_relation',
                                         'king_ibd1_prop',
                                         'king_ibd2_prop',
                                         'shared_genome_proportion']).set_index(['id1', 'id2'])


def read_king_segments_chunked(king_segments_path, map_dir):
    total = None
    try:
        for i, chunk in enumerate(pandas.read_table(
                                                    king_segments_path, 
                                                    compression='gzip', 
                                                    dtype={'FID1': str, 'ID1': str, 'FID2': str, 'ID2': str}, 
                                                    chunksize=1e+6)):

            segments = {}
            for _, row in chunk.iterrows():
                id1 = row['FID1'] + '_' + row['ID1']
                id2 = row['FID2'] + '_' + row['ID2']
                seg = Segment(id1, id2, row['Chr'],
                            bp_start=row['StartMB']*1e+6, bp_end=row['StopMB']*1e+6)
                key = tuple(sorted((seg.id1, seg.id2)))
                if key not in segments:
                    segments[key] = [seg]
                else:
                    segments[key].append(seg)

            segments = interpolate_all(segments, map_dir)
            data = pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king'])
            logging.info(f'loaded and interpolated segments from chunk {i} for {len(segments)} pairs')
            rows = []
            for key, segs in segments.items():
                row = {'id1': key[0],
                    'id2': key[1],
                    'total_seg_len_king': sum([s.cm_len for s in segs]),
                    'seg_count_king': len(segs)}
                rows.append(row)

            rows_frame = pandas.DataFrame.from_records(rows, columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king'])
            data = pandas.concat([data, rows_frame], ignore_index=True)

            _sort_ids(data)
            data = data.set_index(['id1', 'id2'])
            total = pandas.concat([total, data]) if total is not None else data

        return total

    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king']).set_index(['id1', 'id2'])


def read_king_segments(king_segments_path, map_dir):
    try:
        segments = rks(king_segments_path)
        segments = interpolate_all(segments, map_dir)
        data = pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king'])
        logging.info(f'loaded and interpolated segments for {len(segments)} pairs')
        for key, segs in segments.items():
            row = {'id1': key[0],
                   'id2': key[1],
                   'total_seg_len_king': sum([s.cm_len for s in segs]),
                   'seg_count_king': len(segs)}
            data = pandas.concat([data, pandas.DataFrame([row])], ignore_index=True)

        _sort_ids(data)
        return data.set_index(['id1', 'id2'])

    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king']).set_index(['id1', 'id2'])


def kinship_to_degree(kinship):
    degrees = []
    # intervals are from KING manual
    # >0.354, [0.177, 0.354], [0.0884, 0.177] and [0.0442, 0.0884]
    for k in kinship:
        if k > 0.354:
            degrees.append(0)
        elif 0.177 < k <= 0.354:
            degrees.append(1)
        elif 0.0884 < k <= 0.177:
            degrees.append(2)
        elif 0.0442 < k <= 0.0884:
            degrees.append(3)
        else:
            degrees.append(None)
    return degrees


def _read_kinship_data(kinship_path):
    try:
        data = pandas.read_table(kinship_path)
        print(kinship_path, data.shape)
        # If no relations were found, king creates file with only header
        if data.shape[0] != 0:
            if 'FID' in data.columns:
                data.loc[:, 'id1'] = data.FID.astype(str) + '_' + data.ID1.astype(str)
                data.loc[:, 'id2'] = data.FID.astype(str) + '_' + data.ID2.astype(str)
            else:
                data.loc[:, 'id1'] = data.FID1.astype(str) + '_' + data.ID1.astype(str)
                data.loc[:, 'id2'] = data.FID2.astype(str) + '_' + data.ID2.astype(str)

            _sort_ids(data)
            data.rename({'Kinship': 'kinship'}, axis=1, inplace=True)
            data.loc[:, 'kinship_degree'] = kinship_to_degree(data.kinship)
            data = data.loc[:, ['id1', 'id2', 'kinship', 'kinship_degree']].set_index(['id1', 'id2'])
        else:
            data = pandas.DataFrame(columns=['id1', 'id2', 'kinship', 'kinship_degree']).set_index(['id1', 'id2'])
        return data
    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(columns=['id1', 'id2', 'kinship', 'kinship_degree']).set_index(['id1', 'id2'])


def read_kinship(kinship_path, kinship0_path):
    # parse within families
    # FID     ID1     ID2     N_SNP   Z0      Phi     HetHet  IBS0    Kinship Error
    within = _read_kinship_data(kinship_path)
    logging.info(f'loaded {within.shape[0]} pairs from within-families kinship estimation results')

    # FID1    ID1     FID2    ID2     N_SNP   HetHet  IBS0    Kinship
    across = _read_kinship_data(kinship0_path)
    logging.info(f'loaded {across.shape[0]} pairs from across-families kinship estimation results')
    return pandas.concat([within, across], axis=0)


def read_ersa(ersa_path):
    # Indv_1     Indv_2      Rel_est1      Rel_est2      d_est     lower_d  upper_d     N_seg     Tot_cM
    data = pandas.read_table(ersa_path, header=0,
                             names=['id1', 'id2', 'rel_est1', 'rel_est2',
                                    'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'seg_count_ersa', 'total_seg_len_ersa'],
                             dtype={'ersa_degree': str, 'seg_count_ersa': str, 'total_seg_len_ersa': str})

    data = data.loc[(data.rel_est1 != 'NA') | (data.rel_est2 != 'NA'), :]
    data.loc[:, 'id1'] = data.id1.str.strip()
    data.loc[:, 'id2'] = data.id2.str.strip()
    _sort_ids(data)
    data.loc[:, 'ersa_degree'] = pandas.to_numeric(data.ersa_degree.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    data.loc[:, 'ersa_lower_bound'] = pandas.to_numeric(data.ersa_lower_bound.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    data.loc[:, 'ersa_upper_bound'] = pandas.to_numeric(data.ersa_upper_bound.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    print('dtypes are: ', data.dtypes)
    data.loc[:, 'seg_count_ersa'] = pandas.to_numeric(data.seg_count_ersa.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    data.loc[:, 'total_seg_len_ersa'] = pandas.to_numeric(data.total_seg_len_ersa.str.strip(), errors='coerce')

    data.loc[data.ersa_lower_bound != 1, 'ersa_lower_bound'] = data.ersa_lower_bound - 1
    data.loc[data.ersa_upper_bound != 1, 'ersa_upper_bound'] = data.ersa_upper_bound - 1

    logging.info(f'read {data.shape[0]} pairs from ersa output')
    logging.info(f'ersa after reading has {pandas.notna(data.ersa_degree).sum()} non-null degree estimates')
    data.loc[:, 'is_niece_aunt'] = [True if 'Niece' in d else False for d in data.rel_est1]
    logging.info(f'we have total of {data.is_niece_aunt.sum()} possible Niece/Aunt relationships')

    return data.loc[data.id1 != data.id2, ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'is_niece_aunt', 'seg_count_ersa', 'total_seg_len_ersa']].\
        set_index(['id1', 'id2'])


def read_ersa2(ersa_path):
    # individual_1    individual_2    est_number_of_shared_ancestors  est_degree_of_relatedness       0.95 CI_2p_lower:4
    # 2p_upper:5     1p_lower:6 1p_upper:7        0p_lower:8        0p_upper:9
    # maxlnl_relatedness      maxlnl_unrelatednes
    data = pandas.read_table(ersa_path, comment='#')

    data = data.loc[data.est_degree_of_relatedness != 'no_sig_rel', :]
    data.rename({'individual_1': 'id1', 'individual_2': 'id2', 'est_number_of_shared_ancestors': 'shared_ancestors'}, axis=1, inplace=True)
    data.loc[:, 'ersa_degree'] = pandas.to_numeric(data['est_degree_of_relatedness'], errors='coerce').\
        astype(pandas.Int32Dtype())

    lower, upper = [], []
    for i, ancestors in enumerate(data.shared_ancestors):
        if ancestors == 0:
            lower.append(data.iloc[i, 8])
            upper.append(data.iloc[i, 9])
        if ancestors == 1:
            lower.append(data.iloc[i, 6])
            upper.append(data.iloc[i, 7])
        if ancestors == 2:
            lower.append(data.iloc[i, 4])
            upper.append(data.iloc[i, 5])

    data.loc[:, 'ersa_lower_bound'] = lower
    data.loc[:, 'ersa_upper_bound'] = upper

    cols = ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'shared_ancestors']
    return data.loc[data.id1 != data.id2, cols].set_index(['id1', 'id2'])


if __name__ == '__main__':

    try:
        snakemake
    except NameError:
        test_dir = '/media_ssd/pipeline_data/TF-CEU-TRIBES-ibis-king-2'
        Snakemake = namedtuple('Snakemake', ['input', 'output', 'params', 'log'])
        snakemake = Snakemake(
            input={'king': f'{test_dir}/king/data.seg',
                   'king_segments': f'{test_dir}/king/data.segments.gz',
                   'kinship': f'{test_dir}/king/data.kin',
                   'kinship0': f'{test_dir}/king/data.kin0',
                   'ersa': f'{test_dir}/ersa/relatives.tsv'},

            output=[f'test_data/merge_king_ersa.tsv'],
            params={'cm_dir': f'{test_dir}/cm'},
            log=['test_data/merge_king_ersa.log']
        )

    logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format='%(levelname)s:%(asctime)s %(message)s')

    king_path = snakemake.input['king']
    king_segments_path = snakemake.input['king_segments']
    # within families
    kinship_path = snakemake.input['kinship']
    # across families
    kinship0_path = snakemake.input['kinship0']
    ersa_path = snakemake.input['ersa']
    map_dir = snakemake.params['cm_dir']
    output_path = snakemake.output[0]

    king = read_king(king_path)
    kinship = read_kinship(kinship_path, kinship0_path)
    king_segments = read_king_segments_chunked(king_segments_path, map_dir)
    ersa = read_ersa(ersa_path)

    relatives = king.merge(kinship, how='outer', left_index=True, right_index=True).\
                     merge(ersa, how='outer', left_index=True, right_index=True).\
                     merge(king_segments, how='outer', left_index=True, right_index=True)

    prefer_ersa_mask = pandas.isnull(relatives.king_degree) | (relatives.king_degree > 3)
    relatives.loc[:, 'final_degree'] = relatives.king_degree
    # if king is unsure or king degree > 3 then we use ersa distant relatives estimation
    relatives.loc[prefer_ersa_mask, 'final_degree'] = relatives.ersa_degree
    logging.info(f'king is null or more than 3: {prefer_ersa_mask.sum()}')
    logging.info(f'ersa is not null: {pandas.notna(relatives.ersa_degree).sum()}')
    print(f'relatives columns are {relatives.columns}')
    if 'total_seg_len_king' in relatives.columns:
        relatives.loc[:, 'total_seg_len'] = relatives.total_seg_len_king*relatives.king_ibd1_prop
        relatives.loc[:, 'total_seg_len_ibd2'] = relatives.total_seg_len_king*relatives.king_ibd2_prop
        relatives.loc[:, 'seg_count'] = relatives.seg_count_king

    relatives.loc[prefer_ersa_mask, 'total_seg_len'] = relatives.total_seg_len_ersa
    relatives.loc[prefer_ersa_mask, 'seg_count'] = relatives.seg_count_ersa
    print('is na: ', pandas.isna(relatives.loc[prefer_ersa_mask, 'total_seg_len_ersa']).sum())
    # approximate calculation: for these distant pairs IBD sharing is small, so estimate the
    # shared genome proportion as half the IBD segment length over ~3440 cM (approximate total autosomal map length)
    relatives.loc[prefer_ersa_mask, 'shared_genome_proportion'] = 0.5*relatives.loc[prefer_ersa_mask, 'total_seg_len'].values / 3440
    relatives.drop(['total_seg_len_king', 'seg_count_king', 'total_seg_len_ersa', 'seg_count_ersa', 'king_ibd1_prop', 'king_ibd2_prop'],
                   axis='columns', inplace=True)

    logging.info(f'final degree not null: {pandas.notna(relatives.final_degree).sum()}')

    relatives.loc[pandas.notna(relatives.final_degree), :].to_csv(output_path, sep='\t')
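To make the degree-preference rule above concrete, here is a minimal, self-contained sketch with invented pair ids and degrees (not part of the pipeline): KING's estimate is kept for close relatives, and ERSA's estimate fills in whenever KING is missing or reports a degree greater than 3.

import pandas

# Hypothetical pairs: KING is trusted up to degree 3, ERSA covers the rest.
toy = pandas.DataFrame(
    {'king_degree': [1, 4, None], 'ersa_degree': [1, 5, 6]},
    index=pandas.MultiIndex.from_tuples([('a', 'b'), ('a', 'c'), ('b', 'c')], names=['id1', 'id2']),
)
prefer_ersa = pandas.isnull(toy.king_degree) | (toy.king_degree > 3)
toy['final_degree'] = toy.king_degree
toy.loc[prefer_ersa, 'final_degree'] = toy.ersa_degree
print(toy)
# final_degree: 1.0 (KING kept), 5.0 and 6.0 (taken from ERSA)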
import pandas
import logging
from utils.bcftools import bcftools_stats
from utils.bcftools import bcftools_query
from utils.bcftools import bcftools_view
from io import StringIO


def find_outliers(psc: pandas.DataFrame, outliers_file_path: str, keep_samples_file_path: str, alpha: float):

    q1 = psc.nNonMissing.quantile(0.25)
    q3 = psc.nNonMissing.quantile(0.75)
    iqr = q3-q1
    lower_bound_mask = psc.nNonMissing < (q1 - alpha*iqr)
    upper_bound_mask = psc.nNonMissing > (q3 + alpha*iqr)
    outliers = psc[lower_bound_mask | upper_bound_mask].copy()
    if outliers.empty:
        outliers['exclusion_reason'] = []
    else:
        outliers.loc[lower_bound_mask, 'exclusion_reason'] = f'Sample was below Q1 - {alpha}*IQR ({q1 - alpha*iqr}) in non-missing genotype count'
        outliers.loc[upper_bound_mask, 'exclusion_reason'] = f'Sample was above Q3 + {alpha}*IQR ({q3 + alpha*iqr}) in non-missing genotype count'
    # the samples to keep are simply those that were not flagged as outliers
    keep_samples = psc[~(lower_bound_mask | upper_bound_mask)]

    outliers_list = list(outliers.sample_id)
    with open(outliers_file_path, 'w') as outliers_file:
        for outlier in outliers_list:
            outliers_file.write(outlier + '\n')

    keep_samples_list = list(keep_samples.sample_id)
    with open(keep_samples_file_path, 'w') as keep_samples_file:
        for sample in keep_samples_list:
            keep_samples_file.write(sample + '\n')

    return outliers


def get_stats(vcf_input, samples_path, psc_path=False, stats_path=''):

    bcftools_query(vcf_input, list_samples=True, save_output=samples_path)
    raw_table = bcftools_stats(vcf_input, samples_path, save_output=psc_path, save_stats=stats_path)
    names = [
        'PSC',
        'id',
        'sample_id',
        'nRefHom',
        'nNonRefHom',
        'nHets',
        'nTransitions',
        'nTransversions',
        'nIndels',
        'average depth',
        'nSingletons',
        'nHapRef',
        'nHapAlt',
        'nMissing'
    ]
    psc = pandas.read_table(StringIO(raw_table), header=None, names=names)

    psc.loc[:, 'nNonMissing'] = psc.nRefHom + psc.nNonRefHom + psc.nHets
    psc.loc[:, 'missing_share'] = psc.nMissing / (psc.nMissing + psc.nNonMissing)
    psc.loc[:, 'alt_hom_share'] = psc.nNonRefHom / psc.nNonMissing
    psc.loc[:, 'het_samples_share'] = psc.nHets / psc.nNonMissing

    return psc



if __name__ == '__main__':

    input_vcf = snakemake.input['vcf']
    stats_file = snakemake.output['stats']
    samples = snakemake.params['samples']
    psc_path = snakemake.params['psc']
    keep_samples = snakemake.params['keep_samples']
    bad_samples_path = snakemake.output['bad_samples']
    report_path = snakemake.output['report']
    outliers_path = snakemake.output['outliers']
    no_outliers_vcf = snakemake.output['no_outliers_vcf']
    missing_samples = float(snakemake.params['missing_samples'])
    alt_hom_samples = float(snakemake.params['alt_hom_samples'])
    het_samples = float(snakemake.params['het_samples'])
    alpha = float(snakemake.params['iqr_alpha'])

    logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format='%(levelname)s:%(asctime)s %(message)s')

    # get initial per-sample stats before outlier filtering (these files are overwritten by the final call below)
    logging.info("Getting initial vcf stats...")
    psc = get_stats(vcf_input=input_vcf,
                    samples_path=samples,
                    psc_path=psc_path,
                    stats_path=stats_file)
    # filter outliers
    outliers = find_outliers(psc, outliers_file_path=outliers_path, keep_samples_file_path=keep_samples, alpha=alpha)
    logging.info(f"Found {len(outliers)} outliers!")
    if not outliers.empty:
        bcftools_view(input_vcf, no_outliers_vcf, keep_samples)
    else:
        # no outliers were found: write an empty placeholder so the declared snakemake
        # output exists, then compute the final stats on the original vcf
        with open(no_outliers_vcf, 'w') as dummy_no_outliers_vcf:
            dummy_no_outliers_vcf.write('')
        no_outliers_vcf = input_vcf

    # get final stats without outliers
    logging.info("Getting final vcf stats...")
    psc = get_stats(vcf_input=no_outliers_vcf,
                    samples_path=samples,
                    psc_path=psc_path,
                    stats_path=stats_file)

    bad_missing_samples_mask = (psc.missing_share >= missing_samples / 100) | (psc.nMissing + psc.nNonMissing == 0)

    bad_alt_hom_samples_mask = (psc.alt_hom_share < alt_hom_samples / 100) | (psc.nNonMissing == 0)

    bad_het_samples_mask = (psc.het_samples_share < het_samples / 100) | (psc.nNonMissing == 0)

    psc.loc[bad_het_samples_mask, 'exclusion_reason'] = f'Sample has < {het_samples}% heterozygous SNPs'
    psc.loc[bad_alt_hom_samples_mask, 'exclusion_reason'] = f'Sample has < {alt_hom_samples}% homozygous alternative SNPs'
    psc.loc[bad_missing_samples_mask, 'exclusion_reason'] = f'Sample has >= {missing_samples}% missing SNPs'
    bad_samples = psc.loc[(bad_alt_hom_samples_mask | bad_het_samples_mask | bad_missing_samples_mask),
                          ['sample_id', 'missing_share', 'alt_hom_share', 'het_samples_share', 'exclusion_reason']]
    psc = pandas.concat([psc, outliers[['sample_id', 'missing_share', 'alt_hom_share', 'het_samples_share', 'exclusion_reason']]], ignore_index=True)

    samples_only = bad_samples.loc[:, ['sample_id']].copy()
    # if the input vcf has sample ids in the form fid_iid, split them; otherwise use iid as fid
    samples_only.loc[:, 'fid'] = [s.split('_')[0] if '_' in s else s for s in samples_only.sample_id]
    samples_only.loc[:, 'iid'] = [s.split('_')[1] if '_' in s else s for s in samples_only.sample_id]

    samples_only.loc[:, ['fid', 'iid']].to_csv(bad_samples_path, sep='\t', header=None, index=False)
    bad_samples.to_csv(report_path, sep='\t')

    log = f'We have total of {psc.shape[0]} samples\n' \
          f'{bad_missing_samples_mask.sum()} samples have >= {missing_samples}% missing share\n' \
          f'{bad_alt_hom_samples_mask.sum()} samples have < {alt_hom_samples}% homozygous alternative variants\n' \
          f'{bad_het_samples_mask.sum()} samples have < {het_samples}% heterozygous variants\n' \
          f'{len(outliers)} samples detected as outliers\n'

    logging.info(log)
    print(log)
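
The outlier detection above is a Tukey-style IQR fence over the per-sample non-missing genotype counts. A minimal sketch of the same rule on invented counts (alpha = 1.5; values are hypothetical):

import pandas

# Hypothetical per-sample non-missing genotype counts; samples outside
# [Q1 - alpha*IQR, Q3 + alpha*IQR] are flagged as outliers.
n_non_missing = pandas.Series([9800, 9900, 10000, 10100, 2000, 10050])
alpha = 1.5
q1, q3 = n_non_missing.quantile(0.25), n_non_missing.quantile(0.75)
iqr = q3 - q1
is_outlier = (n_non_missing < q1 - alpha * iqr) | (n_non_missing > q3 + alpha * iqr)
print(n_non_missing[is_outlier])  # only the sample with 2000 genotypes is flagged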

Metagenome-Assembled Genome Analysis of Gut Microbiota (v2.16.2)

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logging.captureWarnings(True)


def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logging.error(
        "".join(
            [
                "Uncaught exception: ",
                *traceback.format_exception(exc_type, exc_value, exc_traceback),
            ]
        )
    )


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of scripts


from common_report import *

import pandas as pd
import plotly.express as px


from utils.taxonomy import tax2table


def make_plots(bin_table):
    div = {}

    div["input_file"] = bin_table

    # Prepare data
    df = pd.read_table(bin_table)

    if snakemake.config["bin_quality_asesser"].lower() == "busco":
        df["Bin Id"] = df["Input_file"].str.replace(".fasta", "", regex=False)

        logging.info("No taxonomic information available, use busco Dataset")

        lineage_name = "Dataset"
        hover_data = [
            "Scores_archaea_odb10",
            "Scores_bacteria_odb10",
            "Scores_eukaryota_odb10",
        ]
        size_name = None

    elif snakemake.config["bin_quality_asesser"].lower() == "checkm":
        df = df.join(
            tax2table(df["Taxonomy (contained)"], remove_prefix=True).fillna("NA")
        )

        lineage_name = "phylum"
        size_name = "Genome size (Mbp)"
        hover_data = ["genus"]

    elif snakemake.config["bin_quality_asesser"].lower() == "checkm2":
        df["Bin Id"] = df.index

        lineage_name = "Translation_Table_Used"
        hover_data = [
            "Completeness_Model_Used",
            "Coding_Density",
            "Contig_N50",
            "GC_Content",
            "Additional_Notes",
        ]
        size_name = "Genome_Size"
    else:
        raise Exception(
            f"bin_quality_asesser '{snakemake.config['bin_quality_asesser']}' in the config file is not understood"
        )

    df.index = df["Bin Id"]

    div[
        "QualityScore"
    ] = "<p>Quality score is calculated as: Completeness - 5 x Contamination.</p>"

    # 2D plot
    fig = px.scatter(
        data_frame=df,
        y="Completeness",
        x="Contamination",
        color=lineage_name,
        size=size_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    fig.update_yaxes(range=(50, 102))
    fig.update_xaxes(range=(-0.2, 10.1))
    div["2D"] = fig.to_html(**HTML_PARAMS)

    ## By sample
    fig = px.strip(
        data_frame=df,
        y="Quality_score",
        x="Sample",
        color=lineage_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    fig.update_yaxes(range=(50, 102))
    div["bySample"] = fig.to_html(**HTML_PARAMS)

    # By Phylum
    fig = px.strip(
        data_frame=df,
        y="Quality_score",
        x=lineage_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    fig.update_yaxes(range=(50, 102))
    div["byPhylum"] = fig.to_html(**HTML_PARAMS)

    return div


# main


div = make_plots(bin_table=snakemake.input.bin_table)


make_html(
    div=div,
    report_out=snakemake.output.report,
    html_template_file=os.path.join(reports_dir, "template_bin_report.html"),
    wildcards=snakemake.wildcards,
)
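
The report text embedded above defines the quality score as Completeness - 5 x Contamination. A minimal sketch with hypothetical bins showing how that score would be computed (the real pipeline provides the Quality_score column upstream of this script):

import pandas as pd

# Hypothetical bins; Quality_score = Completeness - 5 * Contamination,
# as stated in the report text.
bins = pd.DataFrame(
    {"Completeness": [98.2, 75.0], "Contamination": [0.5, 6.0]},
    index=["bin_1", "bin_2"],
)
bins["Quality_score"] = bins["Completeness"] - 5 * bins["Contamination"]
print(bins)  # bin_1 scores 95.7, bin_2 scores 45.0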
tool / pypi

utils

Utils
=====

.. image:: https://travis-ci.org/haaksmash/pyutils.svg?branch=master
    :target: https://travis-ci.org/haaksmash/pyutils

Sometimes you write a function over and over again; sometimes you look up at the ceiling and ask "why, Guido, why isn't this included in the standard library?" Well, we perhaps can't answer that question. But we can collect those functions into a centralized place!

Provided things
+++++++++++++++

Utils is broken up into broad swathes of functionality, to ease the task of remembering where exactly something lives.

enum
----

Python doesn't have a built-in way to define an enum, so this module provides (what I think) is a pretty clean way to go about them.

.. code-block:: python

    from utils import enum

    class Colors(enum.Enum):
        RED = 0
        GREEN = 1

        # Defining an Enum class allows you to specify a few
        # things about the way it's going to behave.
        class Options:
            frozen = True  # can't change attributes
            stric