Workflow Steps and Code Snippets

225 tagged steps and code snippets that match the keyword "utils"

Single-Cell and Single-Nuclei Data Integration Workflow

import os
import anndata as adata
import argparse
import re
from utils.integrate_scanorama import integrate_scanorama

# define arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input_anndata',
                    dest = 'input_anndata',
                    required = True,
                    help = 'Path to HDF5 file with merged AnnData object to integrate')
parser.add_argument('-o', '--output_anndata',
                    dest = 'output_anndata',
                    required = True,
                    help = 'Path to HDF5 file to save the integrated AnnData object')
parser.add_argument('-b', '--batch_column',
                    dest = 'batch_column',
                    default = 'batch',
                    help = 'The name of the column in `anndata.obs` that indicates the batches for each cell, '
                            ' typically this corresponds to the library id.')
parser.add_argument('--use_hvg',
                    dest = 'use_hvg',
                    action = 'store_true',
                    default = False,
                    help = 'Boolean indicating whether or not to use only highly variable genes for data integration.'
                    ' If --use_hvg is used, integration will only consider highly variable genes, and similarly '
                    'the returned integrated object will only contain the highly variable genes.')
parser.add_argument('--corrected_only',
                    dest = 'corrected_only',
                    action = 'store_true',
                    help = 'Boolean indicating to return only the corrected data or all raw data.'
                    ' Default will return all data. To return only corrected data, use --corrected_only.')
parser.add_argument('-s', '--seed',
                    dest = 'seed',
                    type=int,
                    default = None,
                    help = 'Random seed to set for scanorama.')

args = parser.parse_args()

# compile extension regex
file_ext = re.compile(r"\.hdf5$|\.h5$", re.IGNORECASE)

# check that input file exists, if it does exist, make sure it's an h5 file
if not os.path.exists(args.input_anndata):
    raise FileNotFoundError("--input_anndata file not found.")
elif not file_ext.search(args.input_anndata):
    raise ValueError("--input_anndata must end in either .hdf5 or .h5 and contain a merged AnnData object.")

# make sure output file path is h5 file
if not file_ext.search(args.output_anndata):
    raise ValueError("--output_anndata must provide a file path ending in either .hdf5 or .h5.")

# check that output file directory exists and create directory if doesn't exist
integrated_adata_dir = os.path.dirname(args.output_anndata)
if integrated_adata_dir and not os.path.isdir(integrated_adata_dir):
    os.makedirs(integrated_adata_dir, exist_ok = True)

# read in merged anndata object
merged_adata = adata.read_h5ad(args.input_anndata)

if args.use_hvg:
    try:
        var_genes = list(merged_adata.uns['variable_genes'])
    except KeyError:
        print("Variable genes cannot be found in anndata object."
              " If using --use_hvg, make sure HVG are stored in adata.uns['variable_genes'].")
        raise
else:
    var_genes = list(merged_adata.var_names)

# integrate anndata with scanorama
scanorama_integrated_adata = integrate_scanorama(merged_adata,
                                                 integrate_genes = var_genes,
                                                 batch_column = args.batch_column,
                                                 seed = args.seed)

# remove raw data and logcounts to minimize space if corrected_only is true
if args.corrected_only:
    print("Removing raw data and log-normalized data. Only corrected data will be returned.")
    scanorama_integrated_adata.X = None
    del scanorama_integrated_adata.layers["logcounts"]

# write anndata to h5
scanorama_integrated_adata.write(filename = args.output_anndata)
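
The heavy lifting here is delegated to utils.integrate_scanorama, which is imported above but not included in this listing. The sketch below is a minimal, hypothetical version of such a helper, assuming Scanorama's correct_scanpy API; the batch splitting and re-merging details are illustrative, not the pipeline's actual implementation.

import anndata as ad
import scanorama

def integrate_scanorama(merged_adata, integrate_genes, batch_column="batch", seed=None):
    """Split the merged object by batch, run Scanorama, and return the re-merged result."""
    if seed is not None:
        import random
        import numpy as np
        random.seed(seed)
        np.random.seed(seed)
    # restrict to the genes selected for integration (all genes or HVG only)
    subset = merged_adata[:, integrate_genes]
    batches = list(subset.obs[batch_column].unique())
    adatas = [subset[subset.obs[batch_column] == batch].copy() for batch in batches]
    # corrected expression ends up in .X, the joint embedding in .obsm["X_scanorama"]
    corrected = scanorama.correct_scanpy(adatas, return_dimred=True)
    return ad.concat(corrected, join="outer")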
import os
import anndata as adata
import argparse
import re
from utils.integrate_scvi import integrate_scvi

# define arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input_anndata',
                    dest = 'input_anndata',
                    required = True,
                    help = 'Path to HDF5 file with merged AnnData object to integrate')
parser.add_argument('-o', '--output_anndata',
                    dest = 'output_anndata',
                    required = True,
                    help = 'Path to HDF5 file to save the integrated AnnData object')
parser.add_argument('-b', '--batch_column',
                    dest = 'batch_column',
                    default = 'batch',
                    help = 'The name of the column in `anndata.obs` that indicates the batches for each cell, '
                            ' typically this corresponds to the library id.')
parser.add_argument('--categorical_covariates',
                    dest = 'categorical_covariates',
                    default = None,
                    type = str,
                    help = 'A comma-separated list of columns containing additional categorical data to be'
                    ' included as a covariate. Default is None.')
parser.add_argument('--continuous_covariates',
                    dest = 'continuous_covariates',
                    default = "subsets_mito_percent",
                    type = str,
                    help = 'A comma-separated list of columns containing additional continous data to be'
                    ' included as a covariate. Default is "subsets_mito_percent".')
parser.add_argument('--num_latent',
                    dest = 'num_latent',
                    type = int,
                    default = 20,
                    help = 'Number of dimensions to return from the latent space.')
parser.add_argument('--use_hvg',
                    dest = 'use_hvg',
                    action = 'store_true',
                    default = False,
                    help = 'Boolean indicating whether or not to use only highly variable genes for data integration.'
                    ' If --use_hvg is used, the returned integrated object will only contain the highly variable genes.')
parser.add_argument('--corrected_only',
                    dest = 'corrected_only',
                    action = 'store_true',
                    help = 'Boolean indicating to return only the corrected data or all raw data.'
                    ' Default will return all data. To return only corrected data, use --corrected_only.')
parser.add_argument('-s', '--seed',
                    dest = 'seed',
                    type=int,
                    default = None,
                    help = 'Random seed to set for scVI.')

args = parser.parse_args()

# compile extension regex
file_ext = re.compile(r"\.hdf5$|\.h5$", re.IGNORECASE)

# check that input file exists, if it does exist, make sure it's an h5 file
if not os.path.exists(args.input_anndata):
    raise FileNotFoundError("--input_anndata file not found.")
elif not file_ext.search(args.input_anndata):
    raise ValueError("--input_anndata must end in either .hdf5 or .h5 and contain a merged AnnData object.")

# make sure output file path is h5 file
if not file_ext.search(args.output_anndata):
    raise ValueError("--output_anndata must provide a file path ending in either .hdf5 or .h5.")

# check that output file directory exists and create directory if doesn't exist
integrated_adata_dir = os.path.dirname(args.output_anndata)
if integrated_adata_dir:
    os.makedirs(integrated_adata_dir, exist_ok = True)

# read in merged anndata object
merged_adata = adata.read_h5ad(args.input_anndata)

if args.use_hvg:
    try:
        var_genes = list(merged_adata.uns['variable_genes'])
    except KeyError:
        print("Variable genes cannot be found in anndata object."
              " If using --use_hvg, make sure HVG are stored in adata.uns['variable_genes'].")
        raise
else:
    var_genes = list(merged_adata.var_names)

# split covariates from comma separated strings into lists
if args.categorical_covariates:
    categorical_covariates = [covariate.strip() for covariate in args.categorical_covariates.split(',')]
else:
    categorical_covariates = None

if args.continuous_covariates:
    if args.continuous_covariates == "None":
        continuous_covariates = None
    else:
        continuous_covariates = [covariate.strip() for covariate in args.continuous_covariates.split(',')]
else:
    continuous_covariates = None

# integrate anndata with scvi
scvi_integrated_adata = integrate_scvi(merged_adata,
                                       integrate_genes = var_genes,
                                       batch_column = args.batch_column,
                                       categorical_covariate_columns = categorical_covariates,
                                       continuous_covariate_columns = continuous_covariates,
                                       num_latent = args.num_latent,
                                       seed = args.seed)

# remove raw data and logcounts to minimize space if corrected_only is true
if args.corrected_only:
    print("Removing raw data and log-normalized data. Only corrected data will be returned.")
    scvi_integrated_adata.X = None
    del scvi_integrated_adata.layers["logcounts"]


# write anndata to h5
scvi_integrated_adata.write(filename = args.output_anndata)
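
As with the Scanorama step, the actual integration is implemented in utils.integrate_scvi, which is not part of this listing. A minimal sketch of what such a helper might look like, assuming the scvi-tools API (training settings and key names below are assumptions, not the pipeline's code):

import scvi

def integrate_scvi(merged_adata, integrate_genes, batch_column="batch",
                   categorical_covariate_columns=None, continuous_covariate_columns=None,
                   num_latent=20, seed=None):
    """Fit scVI on the merged object and store the latent representation in .obsm."""
    if seed is not None:
        scvi.settings.seed = seed
    adata = merged_adata[:, integrate_genes].copy()
    scvi.model.SCVI.setup_anndata(
        adata,
        batch_key=batch_column,
        categorical_covariate_keys=categorical_covariate_columns,
        continuous_covariate_keys=continuous_covariate_columns,
    )
    model = scvi.model.SCVI(adata, n_latent=num_latent)
    model.train()
    adata.obsm["X_scVI"] = model.get_latent_representation()
    return adata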

TYTUS - UBCS statistics calculation tool

from utils import get_chrom_sizes
from consts import QUALITY_WINDOW_LENGTH, MAX_NUMBER_OF_MISMACHES

from Bio.Seq import Seq

alignment_file = snakemake.input['alignment_file']
snd_file = snakemake.output['snd_file']
chrom_sizes_file = snakemake.input['query_chrom_sizes_file']

chrom_sizes = get_chrom_sizes(chrom_sizes_file)

query_name = snakemake.wildcards['query']
target_name = snakemake.wildcards['target']
ffrom = snakemake.params['ffrom']

snp_file = snakemake.input['snp_file']

def get_snps(snp_file):

    snps = {}

    with open(snp_file) as f:

        for line in f:

            line = line.split()

            start_coord = int(line[1]) + 1
            end_coord = int(line[2]) + 1

            for i in range(start_coord, end_coord):
                base = line[4]
                snps[i] = (base, line)

    return snps


def get_axt_entry(f, ffrom):

    header = f.readline()

    if not header: return None

    header = header.split()

    strand = header[7]

    if ffrom == 'target':

        t_chrom = header[1]
        t_start = int(header[2])

        q_chrom = header[4]
        q_start = int(header[5])
        q_end = int(header[6])

        target_al_seq = f.readline().rstrip().upper()
        query_al_seq = f.readline().rstrip().upper()

    else:

        t_chrom = header[4]
        t_start = int(header[5])

        q_chrom = header[1]
        q_start = int(header[2])

        query_al_seq = f.readline().rstrip().upper()
        target_al_seq =  f.readline().rstrip().upper()           

    f.readline()

    return t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq 

def add_snds_from_axt_entry(t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq, snd_bed_entries):

    t_count = 0
    q_count = 0

    for i, (t_chr, q_chr) in enumerate(zip(target_al_seq, query_al_seq)):

        if t_chr != '-':
            t_count += 1

        if q_chr != '-':
            q_count += 1

        if t_chr == q_chr:
            continue

        t_window = target_al_seq[i - QUALITY_WINDOW_LENGTH: i + QUALITY_WINDOW_LENGTH + 1]
        q_window = query_al_seq[i - QUALITY_WINDOW_LENGTH: i + QUALITY_WINDOW_LENGTH + 1]

        if len(t_window) != 2 * QUALITY_WINDOW_LENGTH + 1:  continue

        #print('enough place')

        if '-' in t_window + q_window: continue

        #print('without deletions and insertions')

        if len([ 1  for t_win_chr, q_win_chr  in zip(t_window, q_window) if t_win_chr != q_win_chr]) > MAX_NUMBER_OF_MISMACHES: continue

        #print('number of  mismatches above max threshold')

        t_subst_start = t_start + t_count - 2

        if strand == '-' and ffrom == 'query':
            t_subst_start = chrom_sizes[t_chrom] - (t_start + t_count - 1)
            t_window = str(Seq(t_window).reverse_complement())
            q_window = str(Seq(q_window).reverse_complement())

        t_coord_start = str(t_subst_start - QUALITY_WINDOW_LENGTH)
        t_coord_end =  str(t_subst_start + QUALITY_WINDOW_LENGTH + 1)

        t_coords = t_chrom + ':' + t_coord_start + '-' + t_coord_end

        q_subst_start = q_start + q_count - 2

        if strand == '-' and ffrom == 'target':
            q_subst_start = chrom_sizes[q_chrom] - (q_start + q_count - 1)

        q_coord_start = str(q_subst_start - QUALITY_WINDOW_LENGTH)
        q_coord_end = str(q_subst_start + QUALITY_WINDOW_LENGTH + 1)

        q_coords = q_chrom + ':' + q_coord_start + '-' + q_coord_end

        t_snp =  int(t_coord_start) +  QUALITY_WINDOW_LENGTH + 1 in snps

        #if t_output_window[5] != snps[int(t_coord_start)+ 6][0]:
        #    print(t_output_window, t_output_window[5], snps[int(t_coord_start) + 6]

        name = ('SND_ID:'  + str(len(snd_bed_entries)) , 'QUERY:' + query_name, 'TARGET:'+ target_name,
                'TARGET_COORDS:' + t_coords, 'QUERY_COORDS:' + q_coords,
                'CHANGE:' +  t_window + '>' + q_window, 'FROM:' + ffrom, 'SNP:' + str(t_snp), 'STRAND:' + strand)

        name = ';'.join(name)

        snd_bed_entry = '\t'.join([t_chrom, t_coord_start, t_coord_end, name, '0', strand ])

        snd_bed_entries[int(t_coord_start)] = snd_bed_entry

def get_snd_bed_entries(alignment_file, snps):

    with open(alignment_file) as f:

        snd_bed_entries = {}

        while True:

            axt_entry = get_axt_entry(f, ffrom)

            if axt_entry is None:
                break

            t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq = axt_entry

            add_snds_from_axt_entry(t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq, snd_bed_entries)

        return snd_bed_entries

def save_snd_bed_entries(snd_bed_entries_file, snd_bed_entries):

    with open(snd_bed_entries_file, 'w') as f_out:

        for t_coord_start in sorted(snd_bed_entries):
            snd_bed_entry = snd_bed_entries[t_coord_start]
            f_out.write(snd_bed_entry)
            f_out.write('\n')

snps = get_snps(snp_file)
snd_bed_entries = get_snd_bed_entries(alignment_file, snps)
save_snd_bed_entries(snd_file, snd_bed_entries)
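
get_snps assumes a whitespace-separated, BED-like SNP file: column 2 holds the start, column 3 the end, and column 5 the variant base. The small illustration below (values made up) shows how one such line populates the lookup used when flagging substitutions that overlap known SNPs; coordinates are shifted by +1 to match the 1-based counting used while walking the axt alignment.

example_line = "chr1\t12344\t12345\trs0000\tA".split()
snps = {}
for i in range(int(example_line[1]) + 1, int(example_line[2]) + 1):
    snps[i] = (example_line[4], example_line)
# snps is now {12345: ('A', ['chr1', '12344', '12345', 'rs0000', 'A'])}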

Workflow for detecting PMCs in confocal images (v0.1.0)

import json
import os
import sys
from collections import namedtuple
import logging
import itertools

import numpy as np
import xarray as xr
import skimage
from skimage import exposure, io, morphology, measure
from skimage.filters import threshold_otsu

import bigfish.detection as bf_detection
import bigfish.stack as bf_stack
import bigfish.plot as bf_plot

BoundingBox = namedtuple("BoundingBox", ["ymin", "ymax", "xmin", "xmax"])
try:
    sys.path.append(os.path.dirname(__file__))
except NameError:
    pass
import utils


def normalize_zstack(z_stack, bits):
    """Normalize z-slices in a 3D image using contrast stretching

    Parameters
    ----------
    z_stack : numpy.ndarray
        3 dimensional confocal FISH image.
    bits : int
        Bit depth of image.

    Returns
    -------
    numpy.ndarray
        Z-corrected image with each slice minimum and maximum matched
    """
    out = np.array(
        [
            exposure.rescale_intensity(
                x, in_range=(0, 2 ** bits - 1), out_range=(z_stack.min(), z_stack.max())
            )
            for x in z_stack
        ]
    )
    return skimage.img_as_uint(exposure.rescale_intensity(out))


def read_bit_img(img_file, bits=12):
    """Read an image and return as a 16-bit image."""
    img = exposure.rescale_intensity(
        io.imread(img_file),
        in_range=(0, 2 ** (bits) - 1)
        #         out_range=(0, )
    )
    return skimage.img_as_uint(img)


def select_signal(image, p_in_focus=0.75, margin_width=10):
    """
    Generate a bounding box of the FISH image to select areas where signal is present.

    Parameters
    ----------
    image : np.ndarray
        3D FISH image
    p_in_focus : float, optional
        Percent of in-focus slices to retain for 2D projection, by default 0.75.
    margin_width : int, optional
        Number of pixels to pad selection by. Default is 10.

    Returns
    -------
    namedtuple
        minimum and maximum coordinate values of the bounding box in the xy plane
    """
    image = image.astype(np.uint16)
    focus = bf_stack.compute_focus(image)
    selected = bf_stack.in_focus_selection(image, focus, p_in_focus)
    projected_2d = bf_stack.maximum_projection(selected)
    foreground = np.where(projected_2d > threshold_otsu(projected_2d))
    limits = BoundingBox(
        ymin=max(foreground[0].min() - margin_width, 0),
        ymax=min(foreground[0].max() + margin_width, image.shape[1]),
        xmin=max(foreground[1].min() - margin_width, 0),
        xmax=min(foreground[1].max() + margin_width, image.shape[2]),
    )
    return limits


def crop_to_selection(img, bbox):
    """
    Crop image to selection defined by bounding box.

    Crops a 3D image to specified x and y coordinates.
    Parameters
    ----------
    img : np.ndarray
        3-dimensional image to crop
    bbox : namedtuple
        Tuple defining minimum and maximum coordinates for x and y planes.

    Returns
    -------
    np.ndarray
        3D image cropped to the specified selection.
    """
    return img[:, bbox.ymin : bbox.ymax, bbox.xmin : bbox.xmax]


def count_spots_in_labels(spots, labels):
    """
    Count the number of RNA molecules in specified labels.

    Parameters
    ----------
    spots : np.ndarray
        Coordinates in original image where RNA molecules were detected.
    labels : np.ndarray
        Integer array of same shape as `img` denoting regions of interest to quantify.
        Each separate region should be uniquely labeled.

    Returns
    -------
    dict
        dictionary containing the number of molecules contained in each labeled region.
    """
    assert spots.shape[1] == len(labels.shape)
    n_labels = len(np.unique(labels)) - 1  # subtract one for background
    counts = {i: 0 for i in range(1, n_labels + 1)}
    for each in spots:
        if len(each) == 3:
            cell_label = labels[each[0], each[1], each[2]]
        else:
            cell_label = labels[each[0], each[1]]
        if cell_label != 0:
            counts[cell_label] += 1
    return counts


def preprocess_image(
    img, smooth_method="gaussian", sigma=7, whitehat=True, selem=None, stretch=99.99
):
    scaled = exposure.rescale_intensity(
        img, in_range=tuple(np.percentile(img, [0, stretch]))
    )
    if smooth_method == "log":
        smooth_func = bf_stack.log_filter
        to_smooth = bf_stack.cast_img_float64(scaled)
    elif smooth_method == "gaussian":
        smooth_func = bf_stack.remove_background_gaussian
        to_smooth = bf_stack.cast_img_uint16(scaled)
    else:
        raise ValueError(f"Unsupported background filter: {smooth_method}")
    if whitehat:
        f = lambda x, s: morphology.white_tophat(smooth_func(x, s), selem)
    else:
        f = lambda x, s: smooth_func(x, s)
    smoothed = np.stack([f(img_slice, sigma) for img_slice in to_smooth])
    return bf_stack.cast_img_float64(np.stack(smoothed))


def count_spots(
    smoothed_signal,
    cell_labels,
    voxel_size_nm,
    dot_radius_nm,
    smooth_method="gaussian",
    decompose_alpha=0.5,
    decompose_beta=1,
    decompose_gamma=5,
    verbose=False,
):

    if verbose:
        spot_radius_px = bf_detection.get_object_radius_pixel(
            voxel_size_nm=voxel_size_nm, object_radius_nm=dot_radius_nm, ndim=3
        )
        logging.info("spot radius (z axis): %0.3f pixels", spot_radius_px[0])
        logging.info("spot radius (yx plan): %0.3f pixels", spot_radius_px[-1])
    spots, threshold = bf_detection.detect_spots(
        smoothed_signal,
        return_threshold=True,
        voxel_size=voxel_size_nm,
        spot_radius=dot_radius_nm,
    )
    if verbose:
        logging.info("%d spots detected...", spots.shape[0])
        logging.info("plotting threshold optimization for spot detection...")
        bf_plot.plot_elbow(
            smoothed_signal,
            voxel_size=voxel_size_nm,
            spot_radius=dot_radius_nm,
        )
    decompose_cast = {
        "gaussian": bf_stack.cast_img_uint16,
        "log": bf_stack.cast_img_float64,
    }
    try:
        (
            spots_post_decomposition,
            dense_regions,
            reference_spot,
        ) = bf_detection.decompose_dense(
            decompose_cast[smooth_method](smoothed_signal),
            spots,
            voxel_size=voxel_size_nm,
            spot_radius=dot_radius_nm,
            alpha=decompose_alpha,  # alpha impacts the number of spots per candidate region
            beta=decompose_beta,  # beta impacts the number of candidate regions to decompose
            gamma=decompose_gamma,  # gamma the filtering step to denoise the image
        )
        logging.info(
            "detected spots before decomposition: %d\n"
            "detected spots after decomposition: %d",
            spots.shape[0],
            spots_post_decomposition.shape[0],
        )
        if verbose:
            print(
                f"detected spots before decomposition: {spots.shape[0]}\n"
                f"detected spots after decomposition: {spots_post_decomposition.shape[0]}\n"
                f"shape of reference spot for decomposition: {reference_spot.shape}"
            )
            bf_plot.plot_reference_spot(reference_spot, rescale=True)
    except RuntimeError:
        logging.warning("decomposition failed, using originally identified spots")
        spots_post_decomposition = spots
    n_labels = len(np.unique(cell_labels)) - 1
    counts = {i: 0 for i in range(1, n_labels + 1)}
    expression_3d = np.zeros_like(smoothed_signal)
    # get slices to account for cropping
    for each in spots_post_decomposition:
        spot_coord = tuple(each)
        cell_label = cell_labels[spot_coord]
        if cell_label != 0:
            counts[cell_label] += 1
    for region in measure.regionprops(cell_labels):
        expression_3d[region.slice][region.image] = counts[region.label]
    return counts, expression_3d


def average_intensity(smoothed_signal, cell_labels):
    n_labels = len(np.unique(cell_labels)) - 1
    intensities = {i: 0 for i in range(1, n_labels + 1)}
    z_normed_smooth = (smoothed_signal - smoothed_signal.mean()) / smoothed_signal.std()
    expression_3d = np.zeros_like(z_normed_smooth)
    for region in measure.regionprops(cell_labels, z_normed_smooth):
        intensities[region.label] = region.mean_intensity
        expression_3d[region.slice][region.image] = region.mean_intensity
    return intensities, expression_3d


def quantify_expression(
    fish_img,
    cell_labels,
    measures=["spots", "intensity"],
    voxel_size_nm=None,
    dot_radius_nm=None,
    whitehat=True,
    whitehat_selem=None,
    smooth_method="gaussian",
    smooth_sigma=1,
    decompose_alpha=0.5,
    decompose_beta=1,
    decompose_gamma=5,
    bits=12,
    crop_image=True,
    verbose=False,
):
    """
    Count the number of molecules in an smFISH image

    Parameters
    ----------
    fish_img : np.ndarray
        Image in which to perform molecule counting
    cell_labels : np.ndarray
        Integer array of same shape as `img` denoting regions of interest to quantify.
        Each separate region should be uniquely labeled.
    measures : list
        Measures to use to quantify expression. Possible values are "spots",
        "intensity", and ["spots", "intensity"]. Default is to measure both
    voxel_size_nm : tuple(float, int), None
        Physical dimensions of each voxel in ZYX order. Required if running spot
        counting.
    dot_radius_nm : tuple(float, int), None
        Physical size of expected dots. Required if running spot
        counting.
    whitehat : bool, optional
        Whether to perform white tophat filtering prior to image de-noising, by default True
    whitehat_selem : [int, np.ndarray], optional
        Structuring element to use for white tophat filtering.
    smooth_method : str, optional
        Method to use for image de-noising. Possible values are "log" and "gaussian" for
        Laplacian of Gaussians and Gaussian background subtraction, respectively. By default "gaussian".
    smooth_sigma : [int, np.ndarray], optional
        Sigma value to use for smoothing function, by default 1
    decompose_alpha : float, optional
        Intensity percentile used to compute the reference spot, between 0 and 1.
        By default 0.5. For more information, see:
        https://big-fish.readthedocs.io/en/stable/detection/dense.html
    decompose_beta : int, optional
        Multiplicative factor for the intensity threshold of a dense region,
        by default 1. For more information, see:
        https://big-fish.readthedocs.io/en/stable/detection/dense.html
    decompose_gamma : int, optional
        Multiplicative factor use to compute a gaussian scale, by default 5.
        For more information, see:
        https://big-fish.readthedocs.io/en/stable/detection/dense.html
    bits : int, optional
        Bit depth of original image, used for scaling image intensities. By default 12.
    crop_image : bool, optional
        Whether to crop signal. Default is True.
    verbose : bool, optional
        Whether to verbosely print results and progress.

    Returns
    -------
    (np.ndarray, dict)
        np.ndarray: positions of all identified mRNA molecules.
        dict: dictionary containing the number of molecules contained in each labeled region.
    """
    if (voxel_size_nm is None or dot_radius_nm is None) and "spots" in measures:
        raise ValueError(
            "Require `voxel_size_nm` and `dot_radius_nm` when performing spot counting."
        )
    if crop_image:
        if verbose:
            logging.info("Cropping image to signal")
        limits = select_signal(fish_img)
        if verbose:
            logging.info(
                "Cropped image to %d x %d",
                limits.ymax - limits.ymin,
                limits.xmax - limits.xmin,
            )
    else:
        # create BoundingBox that selects whole image
        limits = BoundingBox(
            ymin=0, ymax=fish_img.shape[1], xmin=0, xmax=fish_img.shape[2]
        )
    if verbose:
        logging.info("Preprocessing image.")
    cropped_img = skimage.img_as_float64(
        exposure.rescale_intensity(
            crop_to_selection(fish_img, limits),
            in_range=(0, 2 ** bits - 1),
            out_range=(0, 1),
        )
    )
    smoothed = preprocess_image(
        cropped_img, smooth_method, smooth_sigma, whitehat, whitehat_selem, 99.99
    )

    cropped_labels = crop_to_selection(cell_labels, limits)
    quant = dict()
    if "spots" in measures:
        counts, counts_3d = count_spots(
            smoothed,
            cropped_labels,
            voxel_size_nm=voxel_size_nm,
            dot_radius_nm=dot_radius_nm,
            smooth_method=smooth_method,
            decompose_alpha=decompose_alpha,
            decompose_beta=decompose_beta,
            decompose_gamma=decompose_gamma,
            verbose=verbose,
        )
        quant["spots"] = counts
        if crop_image:  # match original shape if cropped
            counts_3d = utils.pad_to_shape(counts_3d, fish_img.shape)
    if "intensity" in measures:
        intensities, intense_3d = average_intensity(smoothed, cropped_labels)
        quant["intensity"] = intensities
        if crop_image:  # match original shape if cropped
            intense_3d = utils.pad_to_shape(intense_3d, fish_img.shape)
    if len(measures) == 2:
        expression_3d = np.stack([counts_3d, intense_3d])
    elif "spots" in measures:
        expression_3d = counts_3d
    else:
        expression_3d = intense_3d
    return (
        quant,
        expression_3d,
    )


def get_quant_measure(method):
    """Get method of quantification"""
    if method == "spots":
        return ["spots"]
    elif method == "intensity":
        return ["intensity"]
    elif method == "both":
        return ["spots", "intensity"]
    else:
        raise ValueError(f"Unrecognized method {method}.")


if __name__ == "__main__":
    import h5py
    import pandas as pd

    try:
        snakemake
    except NameError:
        snakemake = None
    if snakemake is not None:
        logging.basicConfig(filename=snakemake.log[0], level=logging.INFO)
        raw_img, dimensions = utils.read_image_file(
            snakemake.input["image"], as_nm=True
        )
        if dimensions is None and snakemake.input["image"].endswith(".h5"):
            with open(snakemake.params["dimensions"], "r") as handle:
                data = json.load(handle)
                dimensions = np.array([data[c] for c in "zyx"]) * (10 ** 3)
        labels = np.array(h5py.File(snakemake.input["labels"], "r")["image"])
        logging.info("%d labels detected.", len(np.unique(labels) - 1))
        start = int(snakemake.params["z_start"])
        stop = int(snakemake.params["z_end"])
        if stop < 0:
            stop = raw_img.shape[1]
        gene_params = snakemake.params["gene_params"]

        def has_probe_info(name, gene_params):
            if name not in gene_params.keys():
                logging.warning(
                    "No entry for %s found in gene parameters. Not quantifying signal",
                    name,
                )
                return False
            return True

        channels = {
            x: i
            for i, x in enumerate(snakemake.params["channels"].split(";"))
            if has_probe_info(x, gene_params)
        }
        if len(channels) == 0:
            raise ValueError(
                f"No quantification parameters provided for channels: {snakemake.params['channels'].replace(';', ', ')}"
            )
        genes = list(channels.keys())
        fish_exprs = {}
        summarized_images = [None] * len(channels)
        embryo = snakemake.wildcards["embryo"]
        measures = get_quant_measure(snakemake.params["quant_method"])
        for i, gene in enumerate(genes):
            logging.info(f"Quantifying {gene} signal...")
            fish_data = raw_img[channels[gene], start:stop, :, :]
            quant, image = quantify_expression(
                fish_data,
                labels,
                measures=measures,
                voxel_size_nm=dimensions.tolist(),
                dot_radius_nm=gene_params[gene]["radius"],
                whitehat=True,
                smooth_method="gaussian",
                smooth_sigma=7,
                verbose=True,
                bits=12,
                crop_image=snakemake.params["crop_image"],
            )
            for each in measures:
                fish_exprs[f"{gene}_{each}"] = quant[each]
            summarized_images[i] = image
        # write summarized expression images to netcdf using Xarray to keep
        # track of dims
        out_image = np.array(summarized_images)
        xr.DataArray(
            data=out_image,
            coords={"gene": genes, "measure": measures},
            dims=["gene", "measure", "Z", "Y", "X"],
        ).to_netcdf(snakemake.output["image"])
        exprs_df = pd.DataFrame.from_dict(fish_exprs)
        exprs_df.index.name = "label"
        physical_properties = (
            pd.DataFrame(
                measure.regionprops_table(
                    labels,
                    properties={
                        "label",
                        "centroid",
                        "area",
                        "equivalent_diameter",
                    },
                )
            )
            .rename(
                columns={
                    "centroid-0": "Z",
                    "centroid-1": "Y",
                    "centroid-2": "X",
                    "equivalent_diameter": "diameter",
                }
            )
            .set_index("label")
        )
        out = exprs_df.join(physical_properties)
        out["embryo"] = embryo
        out[
            [
                "embryo",
                "area",
                "diameter",
                "Z",
                "Y",
                "X",
            ]
            + [
                f"{gene}_{measure}"
                for (gene, measure) in itertools.product(genes, measures)
            ]
        ].to_csv(snakemake.output["csv"])
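
The main block produces two outputs: a NetCDF cube of per-label expression images with dims (gene, measure, Z, Y, X), and a per-label CSV of quantification values joined with region properties. A quick way to inspect them afterwards might look like the following; the file names are placeholders for the actual snakemake outputs.

import pandas as pd
import xarray as xr

expr = xr.open_dataarray("embryo1_expression.nc")            # placeholder for snakemake.output["image"]
table = pd.read_csv("embryo1_quant.csv", index_col="label")  # placeholder for snakemake.output["csv"]
print(expr.sel(gene=expr.gene.values[0], measure="spots").shape)  # one (Z, Y, X) volume
print(table.head())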
import sys
import os

import h5py
import numpy as np
from skimage import exposure

try:
    from intensipy import Intensify
except ImportError:
    pass

sys.path.append(os.path.dirname(__file__))
import utils


def get_channel_index(channels, channel):
    channel_index = [
        i for i, x in enumerate(channels.split(";")) if x.lower() == channel.lower()
    ][0]
    return channel_index


def preprocess_slice(img, upper_percentile=99.99, new_min=0, new_max=1):
    """Preprocess a Z-slice by scaling and equalizing intensities.

    Parameters
    ----------
    img : np.ndarray
        Image slice to scale and equalize.
    upper_percentile : float, optional
        Upper bound to clip intensities for scaling, by default 99.99.
    new_min : int, optional
        New minimum intensity value, by default 0.
    new_max : int, optional
        New maximum intensity value, by default 1.

    Returns
    -------
    np.ndarray
        Scaled and equalized image slice.
    """
    lb, ub = np.percentile(img, (0, upper_percentile))
    out = exposure.equalize_adapthist(
        exposure.rescale_intensity(img, in_range=(lb, ub), out_range=(new_min, new_max))
    )
    return out


if __name__ == "__main__":
    try:
        snakemake
    except NameError:
        snakemake = None
    if snakemake is not None:
        img, __ = utils.read_image_file(snakemake.input["image"])
        channel = get_channel_index(
            snakemake.params["channels"], snakemake.params["channel_name"]
        )
        z_start = int(snakemake.params["z_start"])
        z_stop = int(snakemake.params["z_end"])
        pmc = img[channel, z_start:z_stop, :, :]
        if snakemake.params["intensipy"]:
            model = Intensify(xy_norm=False, dy=29, dx=29)
            pmc = model.normalize(pmc.astype(float))
        else:
            pmc = np.array(
                [
                    preprocess_slice(x, upper_percentile=100, new_min=0, new_max=1)
                    for x in pmc
                ]
            )
        utils.to_hdf5(pmc, snakemake.output["h5"])
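
For a quick sanity check of the non-Intensipy branch, preprocess_slice can be exercised on synthetic data; the shape, seed, and bit depth below are arbitrary.

import numpy as np

rng = np.random.default_rng(0)
fake_slice = rng.integers(0, 2 ** 12, size=(256, 256)).astype(np.uint16)
scaled = preprocess_slice(fake_slice, upper_percentile=99.99)
print(scaled.min(), scaled.max())  # adaptive-equalized output lies in [0, 1]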

genomic relatedness detection pipeline (v1.8)

import pandas
import numpy
from utils.bcftools import bcftools_query


def interpolate(vcf_path, cm_path, output_path, chrom):

    genetic_map = pandas.read_csv(cm_path, sep=" ")

    pos = list(map(int, bcftools_query(vcf_path, arg="%POS\n", list_samples=False)))
    # interpolate CM from BP
    #chro = genetic_map[genetic_map.chr == int(chrom)]
    # because for chr19 'Genetic_Map(cM)' is 'Genetic_Map.cM.'
    cms = numpy.interp(pos, genetic_map.position.values, genetic_map.iloc[:, -1].values)

    eps = 1e-7
    new_cms = []
    for i in range(0, cms.shape[0]):
        if i == 0:
            new_cms.append(cms[i])
            continue

        if new_cms[-1] >= cms[i]:
            new_cms.append(new_cms[-1] + eps)
        else:
            new_cms.append(cms[i])

    with open(output_path, 'w') as w_map:
        for i, cm in enumerate(new_cms):
            if i > 0 and cm <= new_cms[i-1]:
                raise ValueError(f'cm positions {i-1} and {i} are NOT strictly increasing: {new_cms[:i+1]}')
            w_map.write(f"{i}\t{cm:.8f}\n")


if __name__ == '__main__':
    vcf_path = snakemake.input['vcf']
    cm_path = snakemake.input['cmmap']

    chrom = snakemake.wildcards['chrom']

    output_path = snakemake.output[0]
    interpolate(vcf_path, cm_path, output_path, chrom)
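
The epsilon correction inside interpolate exists because numpy.interp can return identical cM values for nearby base-pair positions, while downstream tools require a strictly increasing genetic map. A small worked example with made-up values:

cms = [0.10, 0.25, 0.25, 0.30]  # e.g. the output of numpy.interp
eps = 1e-7
new_cms = [cms[0]]
for value in cms[1:]:
    new_cms.append(new_cms[-1] + eps if new_cms[-1] >= value else value)
print(new_cms)  # [0.1, 0.25, 0.2500001, 0.3]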
from utils.ibd import merge_germline_segments, segments_to_germline, interpolate_all, read_germline_segments, read_pedsim_segments


if __name__ == '__main__':

    germline_path = snakemake.input['germline']
    map_dir = snakemake.params['cm_dir']
    gap = float(snakemake.params['merge_gap'])
    pedsim_path = snakemake.params['true_ibd']

    use_true_ibd = bool(snakemake.params['use_true_ibd'])
    if not use_true_ibd:
        segments = read_germline_segments(germline_path)
        segments = interpolate_all(segments, map_dir)
        segments = merge_germline_segments(segments, gap=gap)
        segments_to_germline(segments, snakemake.output['ibd'])
    else:
        true_segments = read_pedsim_segments(pedsim_path)
        segments_to_germline(true_segments, snakemake.output['ibd'])
import pandas
import numpy
import os
import logging
from collections import Counter, namedtuple
from utils.ibd import read_king_segments as rks, interpolate_all, Segment


def is_non_zero_file(fpath):
    return os.path.isfile(fpath) and os.path.getsize(fpath) > 0


def _sort_ids(data):
    unsorted_mask = data.id2 < data.id1
    data.loc[unsorted_mask, 'id1'], data.loc[unsorted_mask, 'id2'] = data.loc[unsorted_mask, 'id2'], data.loc[
        unsorted_mask, 'id1']


def read_bucket_dir(bucket_dir):

    total = None
    for file in os.listdir(bucket_dir):
        if not file.endswith('tsv'):
            continue
        path = os.path.join(bucket_dir, file)
        bucket = read_germline(path)
        if total is None:
            total = bucket
        else:
            total = pandas.concat([total, bucket])
    return total


def read_germline(ibd_path):
    germline_names = [
        'fid_iid1',
        'fid_iid2',
        'chrom',
        'start_end_bp',
        'start_end_snp',
        'snp_len',
        'genetic_len',
        'len_units',
        'mismatches',
        'is_homozygous1',
        'is_homozygous2'
    ]

    data = pandas.read_table(ibd_path, header=None, names=germline_names)
    if data.shape[0] == 0:
        return pandas.DataFrame(columns=['id1', 'id2', 'seg_count_germline', 'total_seg_len_germline']).set_index(['id1', 'id2'])

    data.loc[:, 'id1'] = data.fid_iid1.str.split().str[1]
    data.loc[:, 'id2'] = data.fid_iid2.str.split().str[1]
    _sort_ids(data)
    segments_info = data.loc[:, ['id1', 'id2', 'chrom', 'genetic_len']].groupby(by=['id1', 'id2']).agg({'genetic_len': 'sum', 'chrom': 'count'})
    segments_info.rename({'genetic_len': 'total_seg_len_germline', 'chrom': 'seg_count_germline'}, axis=1, inplace=True)
    return segments_info


def map_king_degree(king_degree):
    degree_map = {
        'Dup/MZ': 0,
        'PO': 1,
        'FS': 2,
        '2nd': 2,
        '3rd': 3,
        '4th': 4,
        'NA': numpy.nan
    }
    return [degree_map[kd] for kd in king_degree]


def map_king_relation(king_degree):
    degree_map = {
        'Dup/MZ': 0,
        'PO': 'PO',
        'FS': 'FS',
        '2nd': 2,
        '3rd': 3,
        '4th': 4,
        'NA': numpy.nan
    }
    return [degree_map[kd] for kd in king_degree]


def read_king(king_path):
    # FID1    ID1     FID2    ID2     MaxIBD1 MaxIBD2 IBD1Seg IBD2Seg PropIBD InfType
    try:
        data = pandas.read_table(king_path)
        data.loc[:, 'id1'] = data.FID1.astype(str) + '_' + data.ID1.astype(str)
        data.loc[:, 'id2'] = data.FID2.astype(str) + '_' + data.ID2.astype(str)
        _sort_ids(data)

        data.loc[:, 'king_degree'] = map_king_degree(data.InfType)
        data.loc[:, 'king_relation'] = map_king_relation(data.InfType)
        data.loc[:, 'king_degree'] = data.king_degree.astype(float).astype(pandas.Int32Dtype())
        data.rename({'PropIBD': 'shared_genome_proportion', 'IBD1Seg': 'king_ibd1_prop', 'IBD2Seg': 'king_ibd2_prop'}, axis='columns', inplace=True)
        indexed = data.loc[:, ['id1', 'id2', 'king_degree', 'king_relation', 'king_ibd1_prop', 'king_ibd2_prop', 'shared_genome_proportion']].\
            set_index(['id1', 'id2'])
        return indexed

    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(data=None, index=None,
                                columns=['id1',
                                         'id2',
                                         'king_degree',
                                         'king_relation',
                                         'king_ibd1_prop',
                                         'king_ibd2_prop',
                                         'shared_genome_proportion']).set_index(['id1', 'id2'])


def read_king_segments_chunked(king_segments_path, map_dir):
    total = None
    try:
        for i, chunk in enumerate(pandas.read_table(
                                                    king_segments_path, 
                                                    compression='gzip', 
                                                    dtype={'FID1': str, 'ID1': str, 'FID2': str, 'ID2': str}, 
                                                    chunksize=1e+6)):

            segments = {}
            for _, row in chunk.iterrows():
                id1 = row['FID1'] + '_' + row['ID1']
                id2 = row['FID2'] + '_' + row['ID2']
                seg = Segment(id1, id2, row['Chr'],
                            bp_start=row['StartMB']*1e+6, bp_end=row['StopMB']*1e+6)
                key = tuple(sorted((seg.id1, seg.id2)))
                if key not in segments:
                    segments[key] = [seg]
                else:
                    segments[key].append(seg)

            segments = interpolate_all(segments, map_dir)
            data = pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king'])
            logging.info(f'loaded and interpolated segments from chunk {i} for {len(segments)} pairs')
            rows = []
            for key, segs in segments.items():
                row = {'id1': key[0],
                    'id2': key[1],
                    'total_seg_len_king': sum([s.cm_len for s in segs]),
                    'seg_count_king': len(segs)}
                rows.append(row)

            rows_frame = pandas.DataFrame.from_records(rows, columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king'])
            data = pandas.concat([data, rows_frame], ignore_index=True)

            _sort_ids(data)
            data = data.set_index(['id1', 'id2'])
            total = pandas.concat([total, data]) if total is not None else data

        return total

    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king']).set_index(['id1', 'id2'])


def read_king_segments(king_segments_path, map_dir):
    try:
        segments = rks(king_segments_path)
        segments = interpolate_all(segments, map_dir)
        data = pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king'])
        logging.info(f'loaded and interpolated segments for {len(segments)} pairs')
        for key, segs in segments.items():
            row = {'id1': key[0],
                   'id2': key[1],
                   'total_seg_len_king': sum([s.cm_len for s in segs]),
                   'seg_count_king': len(segs)}
            data = pandas.concat([data, pandas.DataFrame([row])], ignore_index=True)

        _sort_ids(data)
        return data.set_index(['id1', 'id2'])

    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king']).set_index(['id1', 'id2'])


def kinship_to_degree(kinship):
    degrees = []
    # intervals are from KING manual
    # >0.354, [0.177, 0.354], [0.0884, 0.177] and [0.0442, 0.0884]
    for k in kinship:
        if k > 0.354:
            degrees.append(0)
        elif 0.177 < k <= 0.354:
            degrees.append(1)
        elif 0.0884 < k <= 0.177:
            degrees.append(2)
        elif 0.0442 < k <= 0.0884:
            degrees.append(3)
        else:
            degrees.append(None)
    return degrees


def _read_kinship_data(kinship_path):
    try:
        data = pandas.read_table(kinship_path)
        print(kinship_path, data.shape)
        # If no relations were found, king creates file with only header
        if data.shape[0] != 0:
            if 'FID' in data.columns:
                data.loc[:, 'id1'] = data.FID.astype(str) + '_' + data.ID1.astype(str)
                data.loc[:, 'id2'] = data.FID.astype(str) + '_' + data.ID2.astype(str)
            else:
                data.loc[:, 'id1'] = data.FID1.astype(str) + '_' + data.ID1.astype(str)
                data.loc[:, 'id2'] = data.FID2.astype(str) + '_' + data.ID2.astype(str)

            _sort_ids(data)
            data.rename({'Kinship': 'kinship'}, axis=1, inplace=True)
            data.loc[:, 'kinship_degree'] = kinship_to_degree(data.kinship)
            data = data.loc[:, ['id1', 'id2', 'kinship', 'kinship_degree']].set_index(['id1', 'id2'])
        else:
            data = pandas.DataFrame(columns=['id1', 'id2', 'kinship', 'kinship_degree']).set_index(['id1', 'id2'])
        return data
    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(columns=['id1', 'id2', 'kinship', 'kinship_degree']).set_index(['id1', 'id2'])


def read_kinship(kinship_path, kinship0_path):
    # parse within families
    # FID     ID1     ID2     N_SNP   Z0      Phi     HetHet  IBS0    Kinship Error
    within = _read_kinship_data(kinship_path)
    logging.info(f'loaded {within.shape[0]} pairs from within-families kinship estimation results')

    # FID1    ID1     FID2    ID2     N_SNP   HetHet  IBS0    Kinship
    across = _read_kinship_data(kinship0_path)
    logging.info(f'loaded {across.shape[0]} pairs from across-families kinship estimation results')
    return pandas.concat([within, across], axis=0)


def read_ersa(ersa_path):
    # Indv_1     Indv_2      Rel_est1      Rel_est2      d_est     lower_d  upper_d     N_seg     Tot_cM
    data = pandas.read_table(ersa_path, header=0,
                             names=['id1', 'id2', 'rel_est1', 'rel_est2',
                                    'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'seg_count_ersa', 'total_seg_len_ersa'],
                             dtype={'ersa_degree': str, 'seg_count_ersa': str, 'total_seg_len_ersa': str})

    data = data.loc[(data.rel_est1 != 'NA') | (data.rel_est2 != 'NA'), :]
    data.loc[:, 'id1'] = data.id1.str.strip()
    data.loc[:, 'id2'] = data.id2.str.strip()
    _sort_ids(data)
    data.loc[:, 'ersa_degree'] = pandas.to_numeric(data.ersa_degree.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    data.loc[:, 'ersa_lower_bound'] = pandas.to_numeric(data.ersa_lower_bound.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    data.loc[:, 'ersa_upper_bound'] = pandas.to_numeric(data.ersa_upper_bound.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    print('dtypes are: ', data.dtypes)
    data.loc[:, 'seg_count_ersa'] = pandas.to_numeric(data.seg_count_ersa.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    data.loc[:, 'total_seg_len_ersa'] = pandas.to_numeric(data.total_seg_len_ersa.str.strip(), errors='coerce')

    data.loc[data.ersa_lower_bound != 1, 'ersa_lower_bound'] = data.ersa_lower_bound - 1
    data.loc[data.ersa_upper_bound != 1, 'ersa_upper_bound'] = data.ersa_upper_bound - 1

    logging.info(f'read {data.shape[0]} pairs from ersa output')
    logging.info(f'ersa after reading has {pandas.notna(data.ersa_degree).sum()} non-null degree estimates')
    data.loc[:, 'is_niece_aunt'] = [True if 'Niece' in d else False for d in data.rel_est1]
    logging.info(f'we have total of {data.is_niece_aunt.sum()} possible Niece/Aunt relationships')

    return data.loc[data.id1 != data.id2, ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'is_niece_aunt', 'seg_count_ersa', 'total_seg_len_ersa']].\
        set_index(['id1', 'id2'])


def read_ersa2(ersa_path):
    # individual_1    individual_2    est_number_of_shared_ancestors  est_degree_of_relatedness       0.95 CI_2p_lower:4
    # 2p_upper:5     1p_lower:6 1p_upper:7        0p_lower:8        0p_upper:9
    # maxlnl_relatedness      maxlnl_unrelatednes
    data = pandas.read_table(ersa_path, comment='#')

    data = data.loc[data.est_degree_of_relatedness != 'no_sig_rel', :]
    data.rename({'individual_1': 'id1', 'individual_2': 'id2', 'est_number_of_shared_ancestors': 'shared_ancestors'}, axis=1, inplace=True)
    data.loc[:, 'ersa_degree'] = pandas.to_numeric(data['est_degree_of_relatedness'], errors='coerce').\
        astype(pandas.Int32Dtype())

    lower, upper = [], []
    for i, ancestors in enumerate(data.shared_ancestors):
        if ancestors == 0:
            lower.append(data.iloc[i, 8])
            upper.append(data.iloc[i, 9])
        if ancestors == 1:
            lower.append(data.iloc[i, 6])
            upper.append(data.iloc[i, 7])
        if ancestors == 2:
            lower.append(data.iloc[i, 4])
            upper.append(data.iloc[i, 5])

    data.loc[:, 'ersa_lower_bound'] = lower
    data.loc[:, 'ersa_upper_bound'] = upper

    cols = ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'shared_ancestors']
    return data.loc[data.id1 != data.id2, cols].set_index(['id1', 'id2'])


if __name__ == '__main__':

    try:
        snakemake
    except NameError:
        test_dir = '/media_ssd/pipeline_data/TF-CEU-TRIBES-ibis-king-2'
        Snakemake = namedtuple('Snakemake', ['input', 'output', 'params', 'log'])
        snakemake = Snakemake(
            input={'king': f'{test_dir}/king/data.seg',
                   'king_segments': f'{test_dir}/king/data.segments.gz',
                   'kinship': f'{test_dir}/king/data.kin',
                   'kinship0': f'{test_dir}/king/data.kin0',
                   'ersa': f'{test_dir}/ersa/relatives.tsv'},

            output=[f'test_data/merge_king_ersa.tsv'],
            params={'cm_dir': f'{test_dir}/cm'},
            log=['test_data/merge_king_ersa.log']
        )

    logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format='%(levelname)s:%(asctime)s %(message)s')

    king_path = snakemake.input['king']
    king_segments_path = snakemake.input['king_segments']
    # within families
    kinship_path = snakemake.input['kinship']
    # across families
    kinship0_path = snakemake.input['kinship0']
    ersa_path = snakemake.input['ersa']
    map_dir = snakemake.params['cm_dir']
    output_path = snakemake.output[0]

    king = read_king(king_path)
    kinship = read_kinship(kinship_path, kinship0_path)
    king_segments = read_king_segments_chunked(king_segments_path, map_dir)
    ersa = read_ersa(ersa_path)

    relatives = king.merge(kinship, how='outer', left_index=True, right_index=True).\
                     merge(ersa, how='outer', left_index=True, right_index=True).\
                     merge(king_segments, how='outer', left_index=True, right_index=True)

    prefer_ersa_mask = pandas.isnull(relatives.king_degree) | (relatives.king_degree > 3)
    relatives.loc[:, 'final_degree'] = relatives.king_degree
    # if king is unsure or king degree > 3 then we use ersa distant relatives estimation
    relatives.loc[prefer_ersa_mask, 'final_degree'] = relatives.ersa_degree
    logging.info(f'king is null or more than 3: {prefer_ersa_mask.sum()}')
    logging.info(f'ersa is not null: {pandas.notna(relatives.ersa_degree).sum()}')
    print(f'relatives columns are {relatives.columns}')
    if 'total_seg_len_king' in relatives.columns:
        relatives.loc[:, 'total_seg_len'] = relatives.total_seg_len_king*relatives.king_ibd1_prop
        relatives.loc[:, 'total_seg_len_ibd2'] = relatives.total_seg_len_king*relatives.king_ibd2_prop
        relatives.loc[:, 'seg_count'] = relatives.seg_count_king

    relatives.loc[prefer_ersa_mask, 'total_seg_len'] = relatives.total_seg_len_ersa
    relatives.loc[prefer_ersa_mask, 'seg_count'] = relatives.seg_count_ersa
    print('is na: ', pandas.isna(relatives.loc[prefer_ersa_mask, 'total_seg_len_ersa']).sum())
    # approximate calculation: for these distant pairs IBD sharing is small, so estimate the
    # shared genome proportion as half the IBD segment length over ~3440 cM (approximate total autosomal map length)
    relatives.loc[prefer_ersa_mask, 'shared_genome_proportion'] = 0.5*relatives.loc[prefer_ersa_mask, 'total_seg_len'].values / 3440
    relatives.drop(['total_seg_len_king', 'seg_count_king', 'total_seg_len_ersa', 'seg_count_ersa', 'king_ibd1_prop', 'king_ibd2_prop'],
                   axis='columns', inplace=True)

    logging.info(f'final degree not null: {pandas.notna(relatives.final_degree).sum()}')

    relatives.loc[pandas.notna(relatives.final_degree), :].to_csv(output_path, sep='\t')
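To make the degree-preference rule above concrete, here is a minimal, self-contained sketch with invented pair ids and degrees (not part of the pipeline): KING's estimate is kept for close relatives, and ERSA's estimate fills in whenever KING is missing or reports a degree greater than 3.

import pandas

# Hypothetical pairs: KING is trusted up to degree 3, ERSA covers the rest.
toy = pandas.DataFrame(
    {'king_degree': [1, 4, None], 'ersa_degree': [1, 5, 6]},
    index=pandas.MultiIndex.from_tuples([('a', 'b'), ('a', 'c'), ('b', 'c')], names=['id1', 'id2']),
)
prefer_ersa = pandas.isnull(toy.king_degree) | (toy.king_degree > 3)
toy['final_degree'] = toy.king_degree
toy.loc[prefer_ersa, 'final_degree'] = toy.ersa_degree
print(toy)
# final_degree: 1.0 (KING kept), 5.0 and 6.0 (taken from ERSA)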
import pandas
import logging
from utils.bcftools import bcftools_stats
from utils.bcftools import bcftools_query
from utils.bcftools import bcftools_view
from io import StringIO


def find_outliers(psc: pandas.DataFrame, outliers_file_path: str, keep_samples_file_path: str, alpha: float):

    q1 = psc.nNonMissing.quantile(0.25)
    q3 = psc.nNonMissing.quantile(0.75)
    iqr = q3-q1
    lower_bound_mask = psc.nNonMissing < (q1 - alpha*iqr)
    upper_bound_mask = psc.nNonMissing > (q3 + alpha*iqr)
    outliers = psc[lower_bound_mask | upper_bound_mask].copy()
    if outliers.empty:
        outliers['exclusion_reason'] = []
    else:
        outliers.loc[lower_bound_mask, 'exclusion_reason'] = f'Sample was below Q1 - {alpha}*IQR ({q1 - alpha*iqr}) in non-missing genotype count'
        outliers.loc[upper_bound_mask, 'exclusion_reason'] = f'Sample was above Q3 + {alpha}*IQR ({q3 + alpha*iqr}) in non-missing genotype count'
    # the samples to keep are simply those that were not flagged as outliers
    keep_samples = psc[~(lower_bound_mask | upper_bound_mask)]

    outliers_list = list(outliers.sample_id)
    with open(outliers_file_path, 'w') as outliers_file:
        for outlier in outliers_list:
            outliers_file.write(outlier + '\n')

    keep_samples_list = list(keep_samples.sample_id)
    with open(keep_samples_file_path, 'w') as keep_samples_file:
        for sample in keep_samples_list:
            keep_samples_file.write(sample + '\n')

    return outliers


def get_stats(vcf_input, samples_path, psc_path=False, stats_path=''):

    bcftools_query(vcf_input, list_samples=True, save_output=samples_path)
    raw_table = bcftools_stats(vcf_input, samples_path, save_output=psc_path, save_stats=stats_path)
    names = [
        'PSC',
        'id',
        'sample_id',
        'nRefHom',
        'nNonRefHom',
        'nHets',
        'nTransitions',
        'nTransversions',
        'nIndels',
        'average depth',
        'nSingletons',
        'nHapRef',
        'nHapAlt',
        'nMissing'
    ]
    psc = pandas.read_table(StringIO(raw_table), header=None, names=names)

    psc.loc[:, 'nNonMissing'] = psc.nRefHom + psc.nNonRefHom + psc.nHets
    psc.loc[:, 'missing_share'] = psc.nMissing / (psc.nMissing + psc.nNonMissing)
    psc.loc[:, 'alt_hom_share'] = psc.nNonRefHom / psc.nNonMissing
    psc.loc[:, 'het_samples_share'] = psc.nHets / psc.nNonMissing

    return psc



if __name__ == '__main__':

    input_vcf = snakemake.input['vcf']
    stats_file = snakemake.output['stats']
    samples = snakemake.params['samples']
    psc_path = snakemake.params['psc']
    keep_samples = snakemake.params['keep_samples']
    bad_samples_path = snakemake.output['bad_samples']
    report_path = snakemake.output['report']
    outliers_path = snakemake.output['outliers']
    no_outliers_vcf = snakemake.output['no_outliers_vcf']
    missing_samples = float(snakemake.params['missing_samples'])
    alt_hom_samples = float(snakemake.params['alt_hom_samples'])
    het_samples = float(snakemake.params['het_samples'])
    alpha = float(snakemake.params['iqr_alpha'])

    logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format='%(levelname)s:%(asctime)s %(message)s')

    # get initial per-sample stats before outlier filtering (these files are overwritten by the final call below)
    logging.info("Getting initial vcf stats...")
    psc = get_stats(vcf_input=input_vcf,
                    samples_path=samples,
                    psc_path=psc_path,
                    stats_path=stats_file)
    # filter outliers
    outliers = find_outliers(psc, outliers_file_path=outliers_path, keep_samples_file_path=keep_samples, alpha=alpha)
    logging.info(f"Found {len(outliers)} outliers!")
    if not outliers.empty:
        bcftools_view(input_vcf, no_outliers_vcf, keep_samples)
    else:
        # no outliers were found: write an empty placeholder so the declared snakemake
        # output exists, then compute the final stats on the original vcf
        with open(no_outliers_vcf, 'w') as dummy_no_outliers_vcf:
            dummy_no_outliers_vcf.write('')
        no_outliers_vcf = input_vcf

    # get final stats without outliers
    logging.info("Getting final vcf stats...")
    psc = get_stats(vcf_input=no_outliers_vcf,
                    samples_path=samples,
                    psc_path=psc_path,
                    stats_path=stats_file)

    bad_missing_samples_mask = (psc.missing_share >= missing_samples / 100) | (psc.nMissing + psc.nNonMissing == 0)

    bad_alt_hom_samples_mask = (psc.alt_hom_share < alt_hom_samples / 100) | (psc.nNonMissing == 0)

    bad_het_samples_mask = (psc.het_samples_share < het_samples / 100) | (psc.nNonMissing == 0)

    psc.loc[bad_het_samples_mask, 'exclusion_reason'] = f'Sample has < {het_samples}% heterozygous SNPs'
    psc.loc[bad_alt_hom_samples_mask, 'exclusion_reason'] = f'Sample has < {alt_hom_samples}% homozygous alternative SNPs'
    psc.loc[bad_missing_samples_mask, 'exclusion_reason'] = f'Sample has >= {missing_samples}% missing SNPs'
    bad_samples = psc.loc[(bad_alt_hom_samples_mask | bad_het_samples_mask | bad_missing_samples_mask),
                          ['sample_id', 'missing_share', 'alt_hom_share', 'het_samples_share', 'exclusion_reason']]
    psc = pandas.concat([psc, outliers[['sample_id', 'missing_share', 'alt_hom_share', 'het_samples_share', 'exclusion_reason']]], ignore_index=True)

    samples_only = bad_samples.loc[:, ['sample_id']].copy()
    # if the input vcf has sample ids in the form fid_iid, split them; otherwise use iid as fid
    samples_only.loc[:, 'fid'] = [s.split('_')[0] if '_' in s else s for s in samples_only.sample_id]
    samples_only.loc[:, 'iid'] = [s.split('_')[1] if '_' in s else s for s in samples_only.sample_id]

    samples_only.loc[:, ['fid', 'iid']].to_csv(bad_samples_path, sep='\t', header=None, index=False)
    bad_samples.to_csv(report_path, sep='\t')

    log = f'We have total of {psc.shape[0]} samples\n' \
          f'{bad_missing_samples_mask.sum()} samples have >= {missing_samples}% missing share\n' \
          f'{bad_alt_hom_samples_mask.sum()} samples have < {alt_hom_samples}% homozygous alternative variants\n' \
          f'{bad_het_samples_mask.sum()} samples have < {het_samples}% heterozygous variants\n' \
          f'{len(outliers)} samples detected as outliers\n'

    logging.info(log)
    print(log)
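
The outlier detection above is a Tukey-style IQR fence over the per-sample non-missing genotype counts. A minimal sketch of the same rule on invented counts (alpha = 1.5; values are hypothetical):

import pandas

# Hypothetical per-sample non-missing genotype counts; samples outside
# [Q1 - alpha*IQR, Q3 + alpha*IQR] are flagged as outliers.
n_non_missing = pandas.Series([9800, 9900, 10000, 10100, 2000, 10050])
alpha = 1.5
q1, q3 = n_non_missing.quantile(0.25), n_non_missing.quantile(0.75)
iqr = q3 - q1
is_outlier = (n_non_missing < q1 - alpha * iqr) | (n_non_missing > q3 + alpha * iqr)
print(n_non_missing[is_outlier])  # only the sample with 2000 genotypes is flagged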

Metagenome-Assembled Genome Analysis of Gut Microbiota (v2.16.2)

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logging.captureWarnings(True)


def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logging.error(
        "".join(
            [
                "Uncaught exception: ",
                *traceback.format_exception(exc_type, exc_value, exc_traceback),
            ]
        )
    )


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of scripts


from common_report import *

import pandas as pd
import plotly.express as px


from utils.taxonomy import tax2table


def make_plots(bin_table):
    div = {}

    div["input_file"] = bin_table

    # Prepare data
    df = pd.read_table(bin_table)

    if snakemake.config["bin_quality_asesser"].lower() == "busco":
        df["Bin Id"] = df["Input_file"].str.replace(".fasta", "", regex=False)

        logging.info("No taxonomic information available, use busco Dataset")

        lineage_name = "Dataset"
        hover_data = [
            "Scores_archaea_odb10",
            "Scores_bacteria_odb10",
            "Scores_eukaryota_odb10",
        ]
        size_name = None

    elif snakemake.config["bin_quality_asesser"].lower() == "checkm":
        df = df.join(
            tax2table(df["Taxonomy (contained)"], remove_prefix=True).fillna("NA")
        )

        lineage_name = "phylum"
        size_name = "Genome size (Mbp)"
        hover_data = ["genus"]

    elif snakemake.config["bin_quality_asesser"].lower() == "checkm2":
        df["Bin Id"] = df.index

        lineage_name = "Translation_Table_Used"
        hover_data = [
            "Completeness_Model_Used",
            "Coding_Density",
            "Contig_N50",
            "GC_Content",
            "Additional_Notes",
        ]
        size_name = "Genome_Size"
    else:
        raise Exception(
            f"bin_quality_asesser '{snakemake.config['bin_quality_asesser']}' in the config file is not understood"
        )

    df.index = df["Bin Id"]

    div[
        "QualityScore"
    ] = "<p>Quality score is calculated as: Completeness - 5 x Contamination.</p>"

    # 2D plot
    fig = px.scatter(
        data_frame=df,
        y="Completeness",
        x="Contamination",
        color=lineage_name,
        size=size_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    fig.update_yaxes(range=(50, 102))
    fig.update_xaxes(range=(-0.2, 10.1))
    div["2D"] = fig.to_html(**HTML_PARAMS)

    ## By sample
    fig = px.strip(
        data_frame=df,
        y="Quality_score",
        x="Sample",
        color=lineage_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    fig.update_yaxes(range=(50, 102))
    div["bySample"] = fig.to_html(**HTML_PARAMS)

    # By Phylum
    fig = px.strip(
        data_frame=df,
        y="Quality_score",
        x=lineage_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    fig.update_yaxes(range=(50, 102))
    div["byPhylum"] = fig.to_html(**HTML_PARAMS)

    return div


# main


div = make_plots(bin_table=snakemake.input.bin_table)


make_html(
    div=div,
    report_out=snakemake.output.report,
    html_template_file=os.path.join(reports_dir, "template_bin_report.html"),
    wildcards=snakemake.wildcards,
)
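
The report text embedded above defines the quality score as Completeness - 5 x Contamination. A minimal sketch with hypothetical bins showing how that score would be computed (the real pipeline provides the Quality_score column upstream of this script):

import pandas as pd

# Hypothetical bins; Quality_score = Completeness - 5 * Contamination,
# as stated in the report text.
bins = pd.DataFrame(
    {"Completeness": [98.2, 75.0], "Contamination": [0.5, 6.0]},
    index=["bin_1", "bin_2"],
)
bins["Quality_score"] = bins["Completeness"] - 5 * bins["Contamination"]
print(bins)  # bin_1 scores 95.7, bin_2 scores 45.0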
tool / pypi

utils

Utils
=====

.. image:: https://travis-ci.org/haaksmash/pyutils.svg?branch=master
    :target: https://travis-ci.org/haaksmash/pyutils

Sometimes you write a function over and over again; sometimes you look up at the ceiling and ask "why, Guido, why isn't this included in the standard library?" Well, we perhaps can't answer that question. But we can collect those functions into a centralized place!

Provided things
+++++++++++++++

Utils is broken up into broad swathes of functionality, to ease the task of remembering where exactly something lives.

enum
----

Python doesn't have a built-in way to define an enum, so this module provides (what I think) is a pretty clean way to go about them.

.. code-block:: python

    from utils import enum

    class Colors(enum.Enum):
        RED = 0
        GREEN = 1

        # Defining an Enum class allows you to specify a few
        # things about the way it's going to behave.
        class Options:
            frozen = True  # can't change attributes
            stric