Workflow Steps and Code Snippets
225 tagged steps and code snippets matching the keyword "utils"
Single-Cell and Single-Nuclei Data Integration Workflow
import os
import anndata as adata
import argparse
import re
from utils.integrate_scanorama import integrate_scanorama

# define arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input_anndata', dest = 'input_anndata',
                    required = True,
                    help = 'Path to HDF5 file with merged AnnData object to integrate')
parser.add_argument('-o', '--output_anndata', dest = 'output_anndata',
                    required = True,
                    help = 'Path to HDF5 file to save the integrated AnnData object')
parser.add_argument('-b', '--batch_column', dest = 'batch_column',
                    default = 'batch',
                    help = 'The name of the column in `anndata.obs` that indicates the batches for each cell, '
                    ' typically this corresponds to the library id.')
parser.add_argument('--use_hvg', dest = 'use_hvg',
                    action = 'store_true',
                    default = False,
                    help = 'Boolean indicating whether or not to use only highly variable genes for data integration.'
                    'If --use_hvg is used, integration will only consider highly variable genes, and similarly '
                    'the returned integrated object will only contain the highly variable genes.')
parser.add_argument('--corrected_only', dest = 'corrected_only',
                    action = 'store_true',
                    help = 'Boolean indicating to return only the corrected data or all raw data.'
                    ' Default will return all data. To return only corrected data, use --corrected_only.')
parser.add_argument('-s', '--seed', dest = 'seed', type=int,
                    default = None,
                    help = 'Random seed to set for scanorama.')
args = parser.parse_args()

# compile extension regex
file_ext = re.compile(r"\.hdf5$|.h5$", re.IGNORECASE)

# check that input file exists, if it does exist, make sure it's an h5 file
if not os.path.exists(args.input_anndata):
    raise FileExistsError("--input_anndata file not found.")
elif not file_ext.search(args.input_anndata):
    raise ValueError("--input_anndata must end in either .hdf5 or .h5 and contain a merged AnnData object.")

# make sure output file path is h5 file
if not file_ext.search(args.output_anndata):
    raise ValueError("--output_anndata must provide a file path ending in either .hdf5 or .h5.")

# check that output file directory exists and create directory if doesn't exist
integrated_adata_dir = os.path.dirname(args.output_anndata)
if not os.path.isdir(integrated_adata_dir):
    os.makedirs(integrated_adata_dir, exist_ok = True)

# read in merged anndata object
merged_adata = adata.read_h5ad(args.input_anndata)

if args.use_hvg:
    try:
        var_genes = list(merged_adata.uns['variable_genes'])
    except KeyError:
        print("Variable genes cannot be found in anndata object."
              " If using --use_hvg, make sure HVG are stored in adata.uns['variable_genes'].")
        raise
else:
    var_genes = list(merged_adata.var_names)

# integrate anndata with scanorama
scanorama_integrated_adata = integrate_scanorama(merged_adata,
                                                 integrate_genes = var_genes,
                                                 batch_column = args.batch_column,
                                                 seed = args.seed)

# remove raw data and logcounts to minimize space if corrected_only is true
if args.corrected_only:
    print("Removing raw data and log-normalized data. Only corrected data will be returned.")
    scanorama_integrated_adata.X = None
    del scanorama_integrated_adata.layers["logcounts"]

# write anndata to h5
scanorama_integrated_adata.write(filename = args.output_anndata)
import os
import anndata as adata
import argparse
import re
from utils.integrate_scvi import integrate_scvi

# define arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input_anndata', dest = 'input_anndata',
                    required = True,
                    help = 'Path to HDF5 file with merged AnnData object to integrate')
parser.add_argument('-o', '--output_anndata', dest = 'output_anndata',
                    required = True,
                    help = 'Path to HDF5 file to save the integrated AnnData object')
parser.add_argument('-b', '--batch_column', dest = 'batch_column',
                    default = 'batch',
                    help = 'The name of the column in `anndata.obs` that indicates the batches for each cell, '
                    ' typically this corresponds to the library id.')
parser.add_argument('--categorical_covariates', dest = 'categorical_covariates',
                    default = None,
                    type = str,
                    help = 'A comma-separated list of columns containing additional categorical data to be'
                    ' included as a covariate. Default is None.')
parser.add_argument('--continuous_covariates', dest = 'continuous_covariates',
                    default = "subsets_mito_percent",
                    type = str,
                    help = 'A comma-separated list of columns containing additional continuous data to be'
                    ' included as a covariate. Default is "subsets_mito_percent".')
parser.add_argument('--num_latent', dest = 'num_latent',
                    type = int,
                    default = 20,
                    help = 'Number of dimensions to return from the latent space.')
parser.add_argument('--use_hvg', dest = 'use_hvg',
                    action = 'store_true',
                    default = False,
                    help = 'Boolean indicating whether or not to use only highly variable genes for data integration.'
                    'If --use_hvg is used, the returned integrated object will only contain the highly variable genes.')
parser.add_argument('--corrected_only', dest = 'corrected_only',
                    action = 'store_true',
                    help = 'Boolean indicating to return only the corrected data or all raw data.'
                    ' Default will return all data. To return only corrected data, use --corrected_only.')
parser.add_argument('-s', '--seed', dest = 'seed', type=int,
                    default = None,
                    help = 'Random seed to set for scVI.')
args = parser.parse_args()

# compile extension regex
file_ext = re.compile(r"\.hdf5$|.h5$", re.IGNORECASE)

# check that input file exists, if it does exist, make sure it's an h5 file
if not os.path.exists(args.input_anndata):
    raise FileExistsError("--input_anndata file not found.")
elif not file_ext.search(args.input_anndata):
    raise ValueError("--input_anndata must end in either .hdf5 or .h5 and contain a merged AnnData object.")

# make sure output file path is h5 file
if not file_ext.search(args.output_anndata):
    raise ValueError("--output_anndata must provide a file path ending in either .hdf5 or .h5.")

# check that output file directory exists and create directory if doesn't exist
integrated_adata_dir = os.path.dirname(args.output_anndata)
os.makedirs(integrated_adata_dir, exist_ok = True)

# read in merged anndata object
merged_adata = adata.read_h5ad(args.input_anndata)

if args.use_hvg:
    try:
        var_genes = list(merged_adata.uns['variable_genes'])
    except KeyError:
        print("Variable genes cannot be found in anndata object."
              " If using --use_hvg, make sure HVG are stored in adata.uns['variable_genes'].")
        raise
else:
    var_genes = list(merged_adata.var_names)

# split covariates from comma separated strings into lists
if args.categorical_covariates:
    categorical_covariates = [covariate.strip() for covariate in args.categorical_covariates.split(',')]
else:
    categorical_covariates = None

if args.continuous_covariates:
    if args.continuous_covariates == "None":
        continuous_covariates = None
    else:
        continuous_covariates = [covariate.strip() for covariate in args.continuous_covariates.split(',')]
else:
    continuous_covariates = None

# integrate anndata with scvi
scvi_integrated_adata = integrate_scvi(merged_adata,
                                       integrate_genes = var_genes,
                                       batch_column = args.batch_column,
                                       categorical_covariate_columns = categorical_covariates,
                                       continuous_covariate_columns = continuous_covariates,
                                       num_latent = args.num_latent,
                                       seed = args.seed)

# remove raw data and logcounts to minimize space if corrected_only is true
if args.corrected_only:
    print("Removing raw data and log-normalized data. Only corrected data will be returned.")
    scvi_integrated_adata.X = None
    del scvi_integrated_adata.layers["logcounts"]

# write anndata to h5
scvi_integrated_adata.write(filename = args.output_anndata)
TYTUS - UBCS statistics calculation tool
from utils import get_chrom_sizes
from consts import QUALITY_WINDOW_LENGTH, MAX_NUMBER_OF_MISMACHES
from Bio.Seq import Seq

alignment_file = snakemake.input['alignment_file']
snd_file = snakemake.output['snd_file']
chrom_sizes_file = snakemake.input['query_chrom_sizes_file']
chrom_sizes = get_chrom_sizes(chrom_sizes_file)
query_name = snakemake.wildcards['query']
target_name = snakemake.wildcards['target']
ffrom = snakemake.params['ffrom']
snp_file = snakemake.input['snp_file']


def get_snps(snp_file):
    snps = {}
    with open(snp_file) as f:
        for line in f:
            line = line.split()
            start_coord = int(line[1]) + 1
            end_coord = int(line[2]) + 1
            for i in range(start_coord, end_coord):
                base = line[4]
                snps[i] = (base, line)
    return snps


def get_axt_entry(f, ffrom):
    header = f.readline()
    if not header:
        return None
    header = header.split()
    strand = header[7]
    if ffrom == 'target':
        t_chrom = header[1]
        t_start = int(header[2])
        q_chrom = header[4]
        q_start = int(header[5])
        q_end = int(header[6])
        target_al_seq = f.readline().rstrip().upper()
        query_al_seq = f.readline().rstrip().upper()
    else:
        t_chrom = header[4]
        t_start = int(header[5])
        q_chrom = header[1]
        q_start = int(header[2])
        query_al_seq = f.readline().rstrip().upper()
        target_al_seq = f.readline().rstrip().upper()
    f.readline()
    return t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq


def add_snds_from_axt_entry(t_chrom, t_start, q_chrom, q_start, strand,
                            target_al_seq, query_al_seq, snd_bed_entries):
    t_count = 0
    q_count = 0
    for i, (t_chr, q_chr) in enumerate(zip(target_al_seq, query_al_seq)):
        if t_chr != '-':
            t_count += 1
        if q_chr != '-':
            q_count += 1
        if t_chr == q_chr:
            continue
        t_window = target_al_seq[i - QUALITY_WINDOW_LENGTH: i + QUALITY_WINDOW_LENGTH + 1]
        q_window = query_al_seq[i - QUALITY_WINDOW_LENGTH: i + QUALITY_WINDOW_LENGTH + 1]
        if len(t_window) != 2 * QUALITY_WINDOW_LENGTH + 1:
            continue
        #print('enough place')
        if '-' in t_window + q_window:
            continue
        #print('without deletions and insertions')
        if len([1 for t_win_chr, q_win_chr in zip(t_window, q_window)
                if t_win_chr != q_win_chr]) > MAX_NUMBER_OF_MISMACHES:
            continue
        #print('number of mismatches above max threshold')
        t_subst_start = t_start + t_count - 2
        if strand == '-' and ffrom == 'query':
            t_subst_start = chrom_sizes[t_chrom] - (t_start + t_count - 1)
            t_window = str(Seq(t_window).reverse_complement())
            q_window = str(Seq(q_window).reverse_complement())
        t_coord_start = str(t_subst_start - QUALITY_WINDOW_LENGTH)
        t_coord_end = str(t_subst_start + QUALITY_WINDOW_LENGTH + 1)
        t_coords = t_chrom + ':' + t_coord_start + '-' + t_coord_end
        q_subst_start = q_start + q_count - 2
        if strand == '-' and ffrom == 'target':
            q_subst_start = chrom_sizes[q_chrom] - (q_start + q_count - 1)
        q_coord_start = str(q_subst_start - QUALITY_WINDOW_LENGTH)
        q_coord_end = str(q_subst_start + QUALITY_WINDOW_LENGTH + 1)
        q_coords = q_chrom + ':' + q_coord_start + '-' + q_coord_end
        t_snp = (int(t_coord_start) + QUALITY_WINDOW_LENGTH + 1) in snps
        #if t_output_window[5] != snps[int(t_coord_start) + 6][0]:
        #    print(t_output_window, t_output_window[5], snps[int(t_coord_start) + 6]
        name = ('SND_ID:' + str(len(snd_bed_entries)),
                'QUERY:' + query_name,
                'TARGET:' + target_name,
                'TARGET_COORDS:' + t_coords,
                'QUERY_COORDS:' + q_coords,
                'CHANGE:' + t_window + '>' + q_window,
                'FROM:' + ffrom,
                'SNP:' + str(t_snp),
                'STRAND:' + strand)
        name = ';'.join(name)
        snd_bed_entry = '\t'.join([t_chrom, t_coord_start, t_coord_end, name, '0', strand])
        snd_bed_entries[int(t_coord_start)] = snd_bed_entry


def get_snd_bed_entries(alignment_file, snps):
    with open(alignment_file) as f:
        snd_bed_entries = {}
        while True:
            axt_entry = get_axt_entry(f, ffrom)
            if axt_entry is None:
                break
            t_chrom, t_start, q_chrom, q_start, strand, target_al_seq, query_al_seq = axt_entry
            add_snds_from_axt_entry(t_chrom, t_start, q_chrom, q_start, strand,
                                    target_al_seq, query_al_seq, snd_bed_entries)
    return snd_bed_entries


def save_snd_bed_entries(snd_bed_entries_file, snd_bed_entries):
    with open(snd_bed_entries_file, 'w') as f_out:
        for t_coord_start in sorted(snd_bed_entries):
            snd_bed_entry = snd_bed_entries[t_coord_start]
            f_out.write(snd_bed_entry)
            f_out.write('\n')


snps = get_snps(snp_file)
snd_bed_entries = get_snd_bed_entries(alignment_file, snps)
save_snd_bed_entries(snd_file, snd_bed_entries)
Workflow for detecting PMCs in confocal images (v0.1.0)
import json
import os
import sys
from collections import namedtuple
import logging
import itertools

import numpy as np
import xarray as xr
import skimage
from skimage import exposure, io, morphology, measure
from skimage.filters import threshold_otsu
import bigfish.detection as bf_detection
import bigfish.stack as bf_stack
import bigfish.plot as bf_plot

BoundingBox = namedtuple("BoundingBox", ["ymin", "ymax", "xmin", "xmax"])

try:
    sys.path.append(os.path.dirname(__file__))
except NameError:
    pass
import utils


def normalize_zstack(z_stack, bits):
    """Normalize z-slices in a 3D image using contrast stretching

    Parameters
    ----------
    z_stack : numpy.ndarray
        3 dimensional confocal FISH image.
    bits : int
        Bit depth of image.

    Returns
    -------
    numpy.ndarray
        Z-corrected image with each slice minimum and maximum matched
    """
    out = np.array(
        [
            exposure.rescale_intensity(
                x, in_range=(0, 2 ** bits - 1), out_range=(z_stack.min(), z_stack.max())
            )
            for x in z_stack
        ]
    )
    return skimage.img_as_uint(exposure.rescale_intensity(out))


def read_bit_img(img_file, bits=12):
    """Read an image and return as a 16-bit image."""
    img = exposure.rescale_intensity(
        io.imread(img_file),
        in_range=(0, 2 ** (bits) - 1)
        # out_range=(0, )
    )
    return skimage.img_as_uint(img)


def select_signal(image, p_in_focus=0.75, margin_width=10):
    """
    Generate bounding box of FISH image to select on areas where signal is present.

    Parameters
    ----------
    image : np.ndarray
        3D FISH image
    p_in_focus : float, optional
        Percent of in-focus slices to retain for 2D projection, by default 0.75.
    margin_width : int, optional
        Number of pixels to pad selection by. Default is 10.

    Returns
    -------
    namedtuple
        minimum and maximum coordinate values of the bounding box in the xy plane
    """
    image = image.astype(np.uint16)
    focus = bf_stack.compute_focus(image)
    selected = bf_stack.in_focus_selection(image, focus, p_in_focus)
    projected_2d = bf_stack.maximum_projection(selected)
    foreground = np.where(projected_2d > threshold_otsu(projected_2d))
    limits = BoundingBox(
        ymin=max(foreground[0].min() - margin_width, 0),
        ymax=min(foreground[0].max() + margin_width, image.shape[1]),
        xmin=max(foreground[1].min() - margin_width, 0),
        xmax=min(foreground[1].max() + margin_width, image.shape[2]),
    )
    return limits


def crop_to_selection(img, bbox):
    """
    Crop image to selection defined by bounding box.

    Crops a 3D image to specified x and y coordinates.

    Parameters
    ----------
    img : np.ndarray
        3Dimensional image to crop
    bbox : namedtuple
        Tuple defining minimum and maximum coordinates for x and y planes.

    Returns
    -------
    np.ndarray
        3D image cropped to the specified selection.
    """
    return img[:, bbox.ymin : bbox.ymax, bbox.xmin : bbox.xmax]


def count_spots_in_labels(spots, labels):
    """
    Count the number of RNA molecules in specified labels.

    Parameters
    ----------
    spots : np.ndarray
        Coordinates in original image where RNA molecules were detected.
    labels : np.ndarray
        Integer array of same shape as `img` denoting regions of interest to
        quantify. Each separate region should be uniquely labeled.

    Returns
    -------
    dict
        dictionary containing the number of molecules contained in each labeled region.
    """
    assert spots.shape[1] == len(labels.shape)
    n_labels = len(np.unique(labels)) - 1  # subtract one for background
    counts = {i: 0 for i in range(1, n_labels + 1)}
    for each in spots:
        if len(each) == 3:
            cell_label = labels[each[0], each[1], each[2]]
        else:
            cell_label = labels[each[0], each[1]]
        if cell_label != 0:
            counts[cell_label] += 1
    return counts


def preprocess_image(
    img, smooth_method="gaussian", sigma=7, whitehat=True, selem=None, stretch=99.99
):
    scaled = exposure.rescale_intensity(
        img, in_range=tuple(np.percentile(img, [0, stretch]))
    )
    if smooth_method == "log":
        smooth_func = bf_stack.log_filter
        to_smooth = bf_stack.cast_img_float64(scaled)
    elif smooth_method == "gaussian":
        smooth_func = bf_stack.remove_background_gaussian
        to_smooth = bf_stack.cast_img_uint16(scaled)
    else:
        raise ValueError(f"Unsupported background filter: {smooth_method}")
    if whitehat:
        f = lambda x, s: morphology.white_tophat(smooth_func(x, s), selem)
    else:
        f = lambda x, s: smooth_func(x, s)
    smoothed = np.stack([f(img_slice, sigma) for img_slice in to_smooth])
    return bf_stack.cast_img_float64(np.stack(smoothed))


def count_spots(
    smoothed_signal,
    cell_labels,
    voxel_size_nm,
    dot_radius_nm,
    smooth_method="gaussian",
    decompose_alpha=0.5,
    decompose_beta=1,
    decompose_gamma=5,
    verbose=False,
):
    if verbose:
        spot_radius_px = bf_detection.get_object_radius_pixel(
            voxel_size_nm=voxel_size_nm, object_radius_nm=dot_radius_nm, ndim=3
        )
        logging.info("spot radius (z axis): %0.3f pixels", spot_radius_px[0])
        logging.info("spot radius (yx plan): %0.3f pixels", spot_radius_px[-1])
    spots, threshold = bf_detection.detect_spots(
        smoothed_signal,
        return_threshold=True,
        voxel_size=voxel_size_nm,
        spot_radius=dot_radius_nm,
    )
    if verbose:
        logging.info("%d spots detected...", spots.shape[0])
        logging.info("plotting threshold optimization for spot detection...")
        bf_plot.plot_elbow(
            smoothed_signal,
            voxel_size=voxel_size_nm,
            spot_radius=dot_radius_nm,
        )
    decompose_cast = {
        "gaussian": bf_stack.cast_img_uint16,
        "log": bf_stack.cast_img_float64,
    }
    try:
        (
            spots_post_decomposition,
            dense_regions,
            reference_spot,
        ) = bf_detection.decompose_dense(
            decompose_cast[smooth_method](smoothed_signal),
            spots,
            voxel_size=voxel_size_nm,
            spot_radius=dot_radius_nm,
            alpha=decompose_alpha,  # alpha impacts the number of spots per candidate region
            beta=decompose_beta,  # beta impacts the number of candidate regions to decompose
            gamma=decompose_gamma,  # gamma the filtering step to denoise the image
        )
        logging.info(
            "detected spots before decomposition: %d\n"
            "detected spots after decomposition: %d",
            spots.shape[0],
            spots_post_decomposition.shape[0],
        )
        if verbose:
            print(
                f"detected spots before decomposition: {spots.shape[0]}\n"
                f"detected spots after decomposition: {spots_post_decomposition.shape[0]}\n"
                f"shape of reference spot for decomposition: {reference_spot.shape}"
            )
            bf_plot.plot_reference_spot(reference_spot, rescale=True)
    except RuntimeError:
        logging.warning("decomposition failed, using originally identified spots")
        spots_post_decomposition = spots
    n_labels = len(np.unique(cell_labels)) - 1
    counts = {i: 0 for i in range(1, n_labels + 1)}
    expression_3d = np.zeros_like(smoothed_signal)
    # get slices to account for cropping
    for each in spots_post_decomposition:
        spot_coord = tuple(each)
        cell_label = cell_labels[spot_coord]
        if cell_label != 0:
            counts[cell_label] += 1
    for region in measure.regionprops(cell_labels):
        expression_3d[region.slice][region.image] = counts[region.label]
    return counts, expression_3d


def average_intensity(smoothed_signal, cell_labels):
    n_labels = len(np.unique(cell_labels)) - 1
    intensities = {i: 0 for i in range(1, n_labels + 1)}
    z_normed_smooth = (smoothed_signal - smoothed_signal.mean()) / smoothed_signal.std()
    expression_3d = np.zeros_like(z_normed_smooth)
    for region in measure.regionprops(cell_labels, z_normed_smooth):
        intensities[region.label] = region.mean_intensity
        expression_3d[region.slice][region.image] = region.mean_intensity
    return intensities, expression_3d


def quantify_expression(
    fish_img,
    cell_labels,
    measures=["spots", "intensity"],
    voxel_size_nm=None,
    dot_radius_nm=None,
    whitehat=True,
    whitehat_selem=None,
    smooth_method="gaussian",
    smooth_sigma=1,
    decompose_alpha=0.5,
    decompose_beta=1,
    decompose_gamma=5,
    bits=12,
    crop_image=True,
    verbose=False,
):
    """
    Count the number of molecules in an smFISH image

    Parameters
    ----------
    fish_img : np.ndarray
        Image in which to perform molecule counting
    cell_labels : np.ndarray
        Integer array of same shape as `img` denoting regions of interest to
        quantify. Each separate region should be uniquely labeled.
    measures : list
        Measures to use to quantify expression. Possible values are "spots",
        "intensity", and ["spots", "intensity"]. Default is to measure both
    voxel_size_nm : tuple(float, int), None
        Physical dimensions of each voxel in ZYX order. Required if running spot counting.
    dot_radius_nm : tuple(float, int), None
        Physical size of expected dots. Required if running spot counting.
    whitehat : bool, optional
        Whether to perform white tophat filtering prior to image de-noising, by default True
    whitehat_selem : [int, np.ndarray], optional
        Structuring element to use for white tophat filtering.
    smooth_method : str, optional
        Method to use for image de-noising. Possible values are "log" and
        "gaussian" for Laplacian of Gaussians and Gaussian background
        subtraction, respectively. By default "gaussian".
    smooth_sigma : [int, np.ndarray], optional
        Sigma value to use for smoothing function, by default 1
    decompose_alpha : float, optional
        Intensity percentile used to compute the reference spot, between 0 and 1.
        By default 0.5. For more information, see:
        https://big-fish.readthedocs.io/en/stable/detection/dense.html
    decompose_beta : int, optional
        Multiplicative factor for the intensity threshold of a dense region,
        by default 1. For more information, see:
        https://big-fish.readthedocs.io/en/stable/detection/dense.html
    decompose_gamma : int, optional
        Multiplicative factor use to compute a gaussian scale, by default 5.
        For more information, see:
        https://big-fish.readthedocs.io/en/stable/detection/dense.html
    bits : int, optional
        Bit depth of original image. Used for scaling image while maintaining ob
    crop_image : bool, optional
        Whether to crop signal. Default is True.
    verbose : bool, optional
        Whether to verbosely print results and progress.

    Returns
    -------
    (np.ndarray, dict)
        np.ndarray: positions of all identified mRNA molecules.
        dict: dictionary containing the number of molecules contained in each labeled region.
    """
    if (voxel_size_nm is None or dot_radius_nm is None) and "spots" in measures:
        raise ValueError(
            "Require `voxel_size_nm` and `dot_radius_nm` when performing spot counting."
        )
    if crop_image:
        if verbose:
            logging.info("Cropping image to signal")
        limits = select_signal(fish_img)
        if verbose:
            logging.info(
                "Cropped image to %d x %d",
                limits.ymax - limits.ymin,
                limits.xmax - limits.xmin,
            )
    else:
        # create BoundingBox that selects whole image
        limits = BoundingBox(
            ymin=0, ymax=fish_img.shape[1], xmin=0, xmax=fish_img.shape[2]
        )
    if verbose:
        logging.info("Preprocessing image.")
    cropped_img = skimage.img_as_float64(
        exposure.rescale_intensity(
            crop_to_selection(fish_img, limits),
            in_range=(0, 2 ** bits - 1),
            out_range=(0, 1),
        )
    )
    smoothed = preprocess_image(
        cropped_img, smooth_method, smooth_sigma, whitehat, whitehat_selem, 99.99
    )
    cropped_labels = crop_to_selection(cell_labels, limits)
    quant = dict()
    if "spots" in measures:
        counts, counts_3d = count_spots(
            smoothed,
            cropped_labels,
            voxel_size_nm=voxel_size_nm,
            dot_radius_nm=dot_radius_nm,
            smooth_method=smooth_method,
            decompose_alpha=decompose_alpha,
            decompose_beta=decompose_beta,
            decompose_gamma=decompose_gamma,
            verbose=verbose,
        )
        quant["spots"] = counts
        if crop_image:
            # match original shape if cropped
            counts_3d = utils.pad_to_shape(counts_3d, fish_img.shape)
    if "intensity" in measures:
        intensities, intense_3d = average_intensity(smoothed, cropped_labels)
        quant["intensity"] = intensities
        if crop_image:
            # match original shape if cropped
            intense_3d = utils.pad_to_shape(intense_3d, fish_img.shape)
    if len(measures) == 2:
        expression_3d = np.stack([counts_3d, intense_3d])
    return (
        quant,
        expression_3d,
    )


def get_quant_measure(method):
    """Get method of quantification"""
    if method == "spots":
        return ["spots"]
    elif method == "intensity":
        return ["intensity"]
    elif method == "both":
        return ["spots", "intensity"]
    else:
        raise ValueError(f"Unrecognized method {method}.")


if __name__ == "__main__":
    import h5py
    import pandas as pd

    try:
        snakemake
    except NameError:
        snakemake = None
    if snakemake is not None:
        logging.basicConfig(filename=snakemake.log[0], level=logging.INFO)
        raw_img, dimensions = utils.read_image_file(
            snakemake.input["image"], as_nm=True
        )
        if dimensions is None and snakemake.input["image"].endswith(".h5"):
            with open(snakemake.params["dimensions"], "r") as handle:
                data = json.load(handle)
            dimensions = np.array([data[c] for c in "zyx"]) * (10 ** 3)
        labels = np.array(h5py.File(snakemake.input["labels"], "r")["image"])
        logging.info("%d labels detected.", len(np.unique(labels)) - 1)
        start = int(snakemake.params["z_start"])
        stop = int(snakemake.params["z_end"])
        if stop < 0:
            stop = raw_img.shape[1]
        gene_params = snakemake.params["gene_params"]

        def has_probe_info(name, gene_params):
            if name not in gene_params.keys():
                logging.warning(
                    "No entry for %s found in gene parameters. Not quantifying signal",
                    name,
                )
                return False
            return True

        channels = {
            x: i
            for i, x in enumerate(snakemake.params["channels"].split(";"))
            if has_probe_info(x, gene_params)
        }
        if len(channels) == 0:
            raise ValueError(
                f"No quantification parameters provided for channels: {snakemake.params['channels'].replace(';', ', ')}"
            )
        genes = list(channels.keys())
        fish_exprs = {}
        summarized_images = [None] * len(channels)
        embryo = snakemake.wildcards["embryo"]
        measures = get_quant_measure(snakemake.params["quant_method"])
        for i, gene in enumerate(genes):
            logging.info(f"Quantifying {gene} signal...")
            fish_data = raw_img[channels[gene], start:stop, :, :]
            quant, image = quantify_expression(
                fish_data,
                labels,
                measures=measures,
                voxel_size_nm=dimensions.tolist(),
                dot_radius_nm=gene_params[gene]["radius"],
                whitehat=True,
                smooth_method="gaussian",
                smooth_sigma=7,
                verbose=True,
                bits=12,
                crop_image=snakemake.params["crop_image"],
            )
            for each in measures:
                fish_exprs[f"{gene}_{each}"] = quant[each]
            summarized_images[i] = image
        # write summarized expression images to netcdf using Xarray to keep
        # track of dims
        out_image = np.array(summarized_images)
        xr.DataArray(
            data=out_image,
            coords={"gene": genes, "measure": measures},
            dims=["gene", "measure", "Z", "Y", "X"],
        ).to_netcdf(snakemake.output["image"])
        exprs_df = pd.DataFrame.from_dict(fish_exprs)
        exprs_df.index.name = "label"
        physical_properties = (
            pd.DataFrame(
                measure.regionprops_table(
                    labels,
                    properties={
                        "label",
                        "centroid",
                        "area",
                        "equivalent_diameter",
                    },
                )
            )
            .rename(
                columns={
                    "centroid-0": "Z",
                    "centroid-1": "Y",
                    "centroid-2": "X",
                    "equivalent_diameter": "diameter",
                }
            )
            .set_index("label")
        )
        out = exprs_df.join(physical_properties)
        out["embryo"] = embryo
        out[
            [
                "embryo",
                "area",
                "diameter",
                "Z",
                "Y",
                "X",
            ]
            + [
                f"{gene}_{measure}"
                for (gene, measure) in itertools.product(genes, measures)
            ]
        ].to_csv(snakemake.output["csv"])
import sys
import os

import h5py
import numpy as np
from skimage import exposure

try:
    from intensipy import Intensify
except ImportError:
    pass

sys.path.append(os.path.dirname(__file__))
import utils


def get_channel_index(channels, channel):
    channel_index = [
        i for i, x in enumerate(channels.split(";")) if x.lower() == channel.lower()
    ][0]
    return channel_index


def preprocess_slice(img, upper_percentile=99.99, new_min=0, new_max=1):
    """Preprocess a Z-slice by scaling and equalizing intensities.

    Parameters
    ----------
    img : np.ndarray
        Image slice to scale and equalize.
    upper_percentile : float, optional
        Upper bound to clip intensities for scaling, by default 99.99.
    new_min : int, optional
        New minimum intensity value, by default 0.
    new_max : int, optional
        New maximum intensity value, by default 1.

    Returns
    -------
    np.ndarray
        Scaled and equalized image slice.
    """
    lb, ub = np.percentile(img, (0, upper_percentile))
    out = exposure.equalize_adapthist(
        exposure.rescale_intensity(img, in_range=(lb, ub), out_range=(new_min, new_max))
    )
    return out


if __name__ == "__main__":
    try:
        snakemake
    except NameError:
        snakemake = None
    if snakemake is not None:
        img, __ = utils.read_image_file(snakemake.input["image"])
        channel = get_channel_index(
            snakemake.params["channels"], snakemake.params["channel_name"]
        )
        z_start = int(snakemake.params["z_start"])
        z_stop = int(snakemake.params["z_end"])
        pmc = img[channel, z_start:z_stop, :, :]
        if snakemake.params["intensipy"]:
            model = Intensify(xy_norm=False, dy=29, dx=29)
            pmc = model.normalize(pmc.astype(float))
        else:
            pmc = np.array(
                [
                    preprocess_slice(x, upper_percentile=100, new_min=0, new_max=1)
                    for x in pmc
                ]
            )
        utils.to_hdf5(pmc, snakemake.output["h5"])
genomic relatedness detection pipeline (v1.8)
import pandas
import numpy

from utils.bcftools import bcftools_query


def interpolate(vcf_path, cm_path, output_path, chrom):
    genetic_map = pandas.read_csv(cm_path, sep=" ")
    pos = list(map(int, bcftools_query(vcf_path, arg="%POS\n", list_samples=False)))

    # interpolate CM from BP
    #chro = genetic_map[genetic_map.chr == int(chrom)]
    # because for chr19 'Genetic_Map(cM)' is 'Genetic_Map.cM.'
    cms = numpy.interp(pos, genetic_map.position.values, genetic_map.iloc[:, -1].values)

    eps = 1e-7
    new_cms = []
    for i in range(0, cms.shape[0]):
        if i == 0:
            new_cms.append(cms[i])
            continue
        if new_cms[-1] >= cms[i]:
            new_cms.append(new_cms[-1] + eps)
        else:
            new_cms.append(cms[i])

    with open(output_path, 'w') as w_map:
        for i, cm in enumerate(new_cms):
            if i > 0 and cm <= new_cms[i-1]:
                raise ValueError(f'cm positions {i-1} and {i} are NOT strictly increasing: {new_cms[:i+1]}')
            w_map.write(f"{i}\t{cm:.8f}\n")


if __name__ == '__main__':
    vcf_path = snakemake.input['vcf']
    cm_path = snakemake.input['cmmap']
    chrom = snakemake.wildcards['chrom']
    output_path = snakemake.output[0]

    interpolate(vcf_path, cm_path, output_path, chrom)
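The script above forces the interpolated centimorgan positions to be strictly increasing by bumping any tie or decrease just above the previous value. The following self-contained sketch replicates only that loop on invented values to show the effect (the cM numbers are made up for illustration and are not taken from the workflow):

# Standalone illustration of the eps-based monotonicity fix used above.
cms = [0.5, 1.0, 1.0, 0.9, 1.2]  # hypothetical interpolated cM values
eps = 1e-7

new_cms = []
for i in range(0, len(cms)):
    if i == 0:
        new_cms.append(cms[i])
        continue
    if new_cms[-1] >= cms[i]:
        # a tie or decrease is nudged just above the previous value
        new_cms.append(new_cms[-1] + eps)
    else:
        new_cms.append(cms[i])

print(new_cms)  # approximately [0.5, 1.0, 1.0000001, 1.0000002, 1.2]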
from utils.ibd import merge_germline_segments, segments_to_germline, interpolate_all, read_germline_segments, read_pedsim_segments

if __name__ == '__main__':
    germline_path = snakemake.input['germline']
    map_dir = snakemake.params['cm_dir']
    gap = float(snakemake.params['merge_gap'])
    pedsim_path = snakemake.params['true_ibd']
    use_true_ibd = bool(snakemake.params['use_true_ibd'])

    if not use_true_ibd:
        segments = read_germline_segments(germline_path)
        segments = interpolate_all(segments, map_dir)
        segments = merge_germline_segments(segments, gap=gap)
        segments_to_germline(segments, snakemake.output['ibd'])
    else:
        true_segments = read_pedsim_segments(pedsim_path)
        segments_to_germline(true_segments, snakemake.output['ibd'])
import pandas
import numpy
import os
import logging
from collections import Counter, namedtuple

from utils.ibd import read_king_segments as rks, interpolate_all, Segment


def is_non_zero_file(fpath):
    return os.path.isfile(fpath) and os.path.getsize(fpath) > 0


def _sort_ids(data):
    unsorted_mask = data.id2 < data.id1
    data.loc[unsorted_mask, 'id1'], data.loc[unsorted_mask, 'id2'] = \
        data.loc[unsorted_mask, 'id2'], data.loc[unsorted_mask, 'id1']


def read_bucket_dir(bucket_dir):
    total = None
    for file in os.listdir(bucket_dir):
        if not file.endswith('tsv'):
            continue
        path = os.path.join(bucket_dir, file)
        bucket = read_germline(path)
        if total is None:
            total = bucket
        else:
            total = total.append(bucket)
    return total


def read_germline(ibd_path):
    germline_names = [
        'fid_iid1',
        'fid_iid2',
        'chrom',
        'start_end_bp',
        'start_end_snp',
        'snp_len',
        'genetic_len',
        'len_units',
        'mismatches',
        'is_homozygous1',
        'is_homozygous2'
    ]
    data = pandas.read_table(ibd_path, header=None, names=germline_names)
    if data.shape[0] == 0:
        return pandas.DataFrame(columns=['id1', 'id2', 'seg_count_germline', 'total_seg_len_germline']).set_index(['id1', 'id2'])

    data.loc[:, 'id1'] = data.fid_iid1.str.split().str[1]
    data.loc[:, 'id2'] = data.fid_iid2.str.split().str[1]
    _sort_ids(data)
    segments_info = data.loc[:, ['id1', 'id2', 'chrom', 'genetic_len']].groupby(by=['id1', 'id2']).agg({'genetic_len': 'sum', 'chrom': 'count'})
    segments_info.rename({'genetic_len': 'total_seg_len_germline', 'chrom': 'seg_count_germline'}, axis=1, inplace=True)
    return segments_info


def map_king_degree(king_degree):
    degree_map = {
        'Dup/MZ': 0,
        'PO': 1,
        'FS': 2,
        '2nd': 2,
        '3rd': 3,
        '4th': 4,
        'NA': numpy.nan
    }
    return [degree_map[kd] for kd in king_degree]


def map_king_relation(king_degree):
    degree_map = {
        'Dup/MZ': 0,
        'PO': 'PO',
        'FS': 'FS',
        '2nd': 2,
        '3rd': 3,
        '4th': 4,
        'NA': numpy.nan
    }
    return [degree_map[kd] for kd in king_degree]


def read_king(king_path):
    # FID1 ID1 FID2 ID2 MaxIBD1 MaxIBD2 IBD1Seg IBD2Seg PropIBD InfType
    try:
        data = pandas.read_table(king_path)
        data.loc[:, 'id1'] = data.FID1.astype(str) + '_' + data.ID1.astype(str)
        data.loc[:, 'id2'] = data.FID2.astype(str) + '_' + data.ID2.astype(str)
        _sort_ids(data)
        data.loc[:, 'king_degree'] = map_king_degree(data.InfType)
        data.loc[:, 'king_relation'] = map_king_relation(data.InfType)
        data.loc[:, 'king_degree'] = data.king_degree.astype(float).astype(pandas.Int32Dtype())
        data.rename({'PropIBD': 'shared_genome_proportion',
                     'IBD1Seg': 'king_ibd1_prop',
                     'IBD2Seg': 'king_ibd2_prop'}, axis='columns', inplace=True)
        indexed = data.loc[:, ['id1', 'id2', 'king_degree', 'king_relation',
                               'king_ibd1_prop', 'king_ibd2_prop', 'shared_genome_proportion']].\
            set_index(['id1', 'id2'])
        return indexed
    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(data=None, index=None,
                                columns=['id1', 'id2', 'king_degree', 'king_relation',
                                         'king_ibd1_prop', 'king_ibd2_prop',
                                         'shared_genome_proportion']).set_index(['id1', 'id2'])


def read_king_segments_chunked(king_segments_path, map_dir):
    total = None
    try:
        for i, chunk in enumerate(pandas.read_table(
                king_segments_path,
                compression='gzip',
                dtype={'FID1': str, 'ID1': str, 'FID2': str, 'ID2': str},
                chunksize=1e+6)):
            segments = {}
            for i, row in chunk.iterrows():
                id1 = row['FID1'] + '_' + row['ID1']
                id2 = row['FID2'] + '_' + row['ID2']
                seg = Segment(id1, id2, row['Chr'],
                              bp_start=row['StartMB']*1e+6,
                              bp_end=row['StopMB']*1e+6)
                key = tuple(sorted((seg.id1, seg.id2)))
                if key not in segments:
                    segments[key] = [seg]
                else:
                    segments[key].append(seg)
            segments = interpolate_all(segments, map_dir)
            data = pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king'])
            logging.info(f'loaded and interpolated segments from chunk {i} for {len(segments)} pairs')
            rows = []
            for key, segs in segments.items():
                row = {'id1': key[0],
                       'id2': key[1],
                       'total_seg_len_king': sum([s.cm_len for s in segs]),
                       'seg_count_king': len(segs)}
                rows.append(row)
            rows_frame = pandas.DataFrame.from_records(rows, columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king'])
            data = pandas.concat([data, rows_frame], ignore_index=True)
            _sort_ids(data)
            data = data.set_index(['id1', 'id2'])
            total = total.append(data) if total is not None else data
        return total
    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king']).set_index(['id1', 'id2'])


def read_king_segments(king_segments_path, map_dir):
    try:
        segments = rks(king_segments_path)
        segments = interpolate_all(segments, map_dir)
        data = pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king'])
        logging.info(f'loaded and interpolated segments for {len(segments)} pairs')
        for key, segs in segments.items():
            row = {'id1': key[0],
                   'id2': key[1],
                   'total_seg_len_king': sum([s.cm_len for s in segs]),
                   'seg_count_king': len(segs)}
            data = data.append(row, ignore_index=True)
        _sort_ids(data)
        return data.set_index(['id1', 'id2'])
    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(columns=['id1', 'id2', 'total_seg_len_king', 'seg_count_king']).set_index(['id1', 'id2'])


def kinship_to_degree(kinship):
    degrees = []
    # intervals are from KING manual
    # >0.354, [0.177, 0.354], [0.0884, 0.177] and [0.0442, 0.0884]
    for k in kinship:
        if k > 0.354:
            degrees.append(0)
        elif 0.177 < k <= 0.354:
            degrees.append(1)
        elif 0.0884 < k <= 0.177:
            degrees.append(2)
        elif 0.0442 < k <= 0.0884:
            degrees.append(3)
        else:
            degrees.append(None)
    return degrees


def _read_kinship_data(kinship_path):
    try:
        data = pandas.read_table(kinship_path)
        print(kinship_path, data.shape)
        # If no relations were found, king creates file with only header
        if data.shape[0] != 0:
            if 'FID' in data.columns:
                data.loc[:, 'id1'] = data.FID.astype(str) + '_' + data.ID1.astype(str)
                data.loc[:, 'id2'] = data.FID.astype(str) + '_' + data.ID2.astype(str)
            else:
                data.loc[:, 'id1'] = data.FID1.astype(str) + '_' + data.ID1.astype(str)
                data.loc[:, 'id2'] = data.FID2.astype(str) + '_' + data.ID2.astype(str)
            _sort_ids(data)
            data.rename({'Kinship': 'kinship'}, axis=1, inplace=True)
            data.loc[:, 'kinship_degree'] = kinship_to_degree(data.kinship)
            data = data.loc[:, ['id1', 'id2', 'kinship', 'kinship_degree']].set_index(['id1', 'id2'])
        else:
            data = pandas.DataFrame(columns=['id1', 'id2', 'kinship', 'kinship_degree']).set_index(['id1', 'id2'])
        return data
    except pandas.errors.EmptyDataError:
        return pandas.DataFrame(columns=['id1', 'id2', 'kinship', 'kinship_degree']).set_index(['id1', 'id2'])


def read_kinship(kinship_path, kinship0_path):
    # parse within families
    # FID ID1 ID2 N_SNP Z0 Phi HetHet IBS0 Kinship Error
    within = _read_kinship_data(kinship_path)
    logging.info(f'loaded {within.shape[0]} pairs from within-families kinship estimation results')
    # FID1 ID1 FID2 ID2 N_SNP HetHet IBS0 Kinship
    across = _read_kinship_data(kinship0_path)
    logging.info(f'loaded {across.shape[0]} pairs from across-families kinship estimation results')
    return pandas.concat([within, across], axis=0)


def read_ersa(ersa_path):
    # Indv_1 Indv_2 Rel_est1 Rel_est2 d_est lower_d upper_d N_seg Tot_cM
    data = pandas.read_table(ersa_path, header=0,
                             names=['id1', 'id2', 'rel_est1', 'rel_est2',
                                    'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound',
                                    'seg_count_ersa', 'total_seg_len_ersa'],
                             dtype={'ersa_degree': str, 'seg_count_ersa': str, 'total_seg_len_ersa': str})

    data = data.loc[(data.rel_est1 != 'NA') | (data.rel_est2 != 'NA'), :]
    data.loc[:, 'id1'] = data.id1.str.strip()
    data.loc[:, 'id2'] = data.id2.str.strip()
    _sort_ids(data)
    data.loc[:, 'ersa_degree'] = pandas.to_numeric(data.ersa_degree.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    data.loc[:, 'ersa_lower_bound'] = pandas.to_numeric(data.ersa_lower_bound.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    data.loc[:, 'ersa_upper_bound'] = pandas.to_numeric(data.ersa_upper_bound.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    print('dtypes are: ', data.dtypes)
    data.loc[:, 'seg_count_ersa'] = pandas.to_numeric(data.seg_count_ersa.str.strip(), errors='coerce').astype(
        pandas.Int32Dtype())
    data.loc[:, 'total_seg_len_ersa'] = pandas.to_numeric(data.total_seg_len_ersa.str.strip(), errors='coerce')
    data.loc[data.ersa_lower_bound != 1, 'ersa_lower_bound'] = data.ersa_lower_bound - 1
    data.loc[data.ersa_upper_bound != 1, 'ersa_upper_bound'] = data.ersa_upper_bound - 1
    logging.info(f'read {data.shape[0]} pairs from ersa output')
    logging.info(f'ersa after reading has {pandas.notna(data.ersa_degree).sum()}')
    data.loc[:, 'is_niece_aunt'] = [True if 'Niece' in d else False for d in data.rel_est1]
    logging.info(f'we have total of {data.is_niece_aunt.sum()} possible Niece/Aunt relationships')
    return data.loc[data.id1 != data.id2, ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound',
                                           'is_niece_aunt', 'seg_count_ersa', 'total_seg_len_ersa']].\
        set_index(['id1', 'id2'])


def read_ersa2(ersa_path):
    # individual_1 individual_2 est_number_of_shared_ancestors est_degree_of_relatedness 0.95 CI_2p_lower:4
    # 2p_upper:5 1p_lower:6 1p_upper:7 0p_lower:8 0p_upper:9
    # maxlnl_relatedness maxlnl_unrelatednes
    data = pandas.read_table(ersa_path, comment='#')
    data = data.loc[data.est_degree_of_relatedness != 'no_sig_rel', :]
    data.rename({'individual_1': 'id1', 'individual_2': 'id2',
                 'est_number_of_shared_ancestors': 'shared_ancestors'}, axis=1, inplace=True)
    data.loc[:, 'ersa_degree'] = pandas.to_numeric(data['est_degree_of_relatedness'], errors='coerce').\
        astype(pandas.Int32Dtype())
    lower, upper = [], []
    for i, ancestors in enumerate(data.shared_ancestors):
        if ancestors == 0:
            lower.append(data.iloc[i, 8])
            upper.append(data.iloc[i, 9])
        if ancestors == 1:
            lower.append(data.iloc[i, 6])
            upper.append(data.iloc[i, 7])
        if ancestors == 2:
            lower.append(data.iloc[i, 4])
            upper.append(data.iloc[i, 5])
    data.loc[:, 'ersa_lower_bound'] = lower
    data.loc[:, 'ersa_upper_bound'] = upper
    cols = ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'shared_ancestors']
    return data.loc[data.id1 != data.id2, cols].set_index(['id1', 'id2'])


if __name__ == '__main__':
    try:
        snakemake
    except NameError:
        test_dir = '/media_ssd/pipeline_data/TF-CEU-TRIBES-ibis-king-2'
        Snakemake = namedtuple('Snakemake', ['input', 'output', 'params', 'log'])
        snakemake = Snakemake(
            input={'king': f'{test_dir}/king/data.seg',
                   'king_segments': f'{test_dir}/king/data.segments.gz',
                   'kinship': f'{test_dir}/king/data.kin',
                   'kinship0': f'{test_dir}/king/data.kin0',
                   'ersa': f'{test_dir}/ersa/relatives.tsv'},
            output=[f'test_data/merge_king_ersa.tsv'],
            params={'cm_dir': f'{test_dir}/cm'},
            log=['test_data/merge_king_ersa.log']
        )

    logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG,
                        format='%(levelname)s:%(asctime)s %(message)s')

    king_path = snakemake.input['king']
    king_segments_path = snakemake.input['king_segments']
    # within families
    kinship_path = snakemake.input['kinship']
    # across families
    kinship0_path = snakemake.input['kinship0']
    ersa_path = snakemake.input['ersa']
    map_dir = snakemake.params['cm_dir']
    output_path = snakemake.output[0]

    king = read_king(king_path)
    kinship = read_kinship(kinship_path, kinship0_path)
    king_segments = read_king_segments_chunked(king_segments_path, map_dir)
    ersa = read_ersa(ersa_path)

    relatives = king.merge(kinship, how='outer', left_index=True, right_index=True).\
        merge(ersa, how='outer', left_index=True, right_index=True).\
        merge(king_segments, how='outer', left_index=True, right_index=True)

    prefer_ersa_mask = pandas.isnull(relatives.king_degree) | (relatives.king_degree > 3)
    relatives.loc[:, 'final_degree'] = relatives.king_degree
    # if king is unsure or king degree > 3 then we use ersa distant relatives estimation
    relatives.loc[prefer_ersa_mask, 'final_degree'] = relatives.ersa_degree
    logging.info(f'king is null or more than 3: {prefer_ersa_mask.sum()}')
    logging.info(f'ersa is not null: {pandas.notna(relatives.ersa_degree).sum()}')
    print(f'relatives columns are {relatives.columns}')
    if 'total_seg_len_king' in relatives.columns:
        relatives.loc[:, 'total_seg_len'] = relatives.total_seg_len_king*relatives.king_ibd1_prop
        relatives.loc[:, 'total_seg_len_ibd2'] = relatives.total_seg_len_king*relatives.king_ibd2_prop
        relatives.loc[:, 'seg_count'] = relatives.seg_count_king
        relatives.loc[prefer_ersa_mask, 'total_seg_len'] = relatives.total_seg_len_ersa
        relatives.loc[prefer_ersa_mask, 'seg_count'] = relatives.seg_count_ersa
        print('is na: ', pandas.isna(relatives.loc[prefer_ersa_mask, 'total_seg_len_ersa']).sum())
        # approximate calculations, IBD share is really small in this case
        relatives.loc[prefer_ersa_mask, 'shared_genome_proportion'] = 0.5*relatives.loc[prefer_ersa_mask, 'total_seg_len'].values / 3440
        relatives.drop(['total_seg_len_king', 'seg_count_king', 'total_seg_len_ersa', 'seg_count_ersa',
                        'king_ibd1_prop', 'king_ibd2_prop'], axis='columns', inplace=True)

    logging.info(f'final degree not null: {pandas.notna(relatives.final_degree).sum()}')
    relatives.loc[pandas.notna(relatives.final_degree), :].to_csv(output_path, sep='\t')
import pandas
import logging

from utils.bcftools import bcftools_stats
from utils.bcftools import bcftools_query
from utils.bcftools import bcftools_view
from io import StringIO


def find_outliers(psc: pandas.DataFrame, outliers_file_path: str, keep_samples_file_path: str, alpha: float):
    q1 = psc.nNonMissing.quantile(0.25)
    q3 = psc.nNonMissing.quantile(0.75)
    iqr = q3 - q1
    lower_bound_mask = psc.nNonMissing < (q1 - alpha*iqr)
    upper_bound_mask = psc.nNonMissing > (q3 + alpha*iqr)
    outliers = psc[lower_bound_mask | upper_bound_mask]
    if outliers.empty:
        outliers['exclusion_reason'] = []
    else:
        outliers.loc[lower_bound_mask, 'exclusion_reason'] = f'Sample was below 1st quantile - IQR*{alpha} ({q1 - alpha*iqr}) by Missing Samples'
        outliers.loc[upper_bound_mask, 'exclusion_reason'] = f'Sample was above 3rd quantile + IQR*{alpha} ({q3 + alpha*iqr}) by Missing Samples'
    keep_samples = pandas.concat([psc, outliers, outliers]).drop_duplicates(keep=False)

    outliers_list = list(outliers.sample_id)
    with open(outliers_file_path, 'w') as outliers_file:
        for outlier in outliers_list:
            outliers_file.write(outlier + '\n')

    keep_samples_list = list(keep_samples.sample_id)
    with open(keep_samples_file_path, 'w') as keep_samples_file:
        for sample in keep_samples_list:
            keep_samples_file.write(sample + '\n')

    return outliers


def get_stats(vcf_input, samples_path, psc_path=False, stats_path=''):
    bcftools_query(vcf_input, list_samples=True, save_output=samples_path)
    raw_table = bcftools_stats(vcf_input, samples_path, save_output=psc_path, save_stats=stats_path)
    names = [
        'PSC', 'id', 'sample_id', 'nRefHom', 'nNonRefHom', 'nHets',
        'nTransitions', 'nTransversions', 'nIndels', 'average depth',
        'nSingletons', 'nHapRef', 'nHapAlt', 'nMissing'
    ]
    psc = pandas.read_table(StringIO(raw_table), header=None, names=names)
    psc.loc[:, 'nNonMissing'] = psc.nRefHom + psc.nNonRefHom + psc.nHets
    psc.loc[:, 'missing_share'] = psc.nMissing / (psc.nMissing + psc.nNonMissing)
    psc.loc[:, 'alt_hom_share'] = psc.nNonRefHom / psc.nNonMissing
    psc.loc[:, 'het_samples_share'] = psc.nHets / psc.nNonMissing
    return psc


if __name__ == '__main__':
    input_vcf = snakemake.input['vcf']
    stats_file = snakemake.output['stats']
    samples = snakemake.params['samples']
    psc_path = snakemake.params['psc']
    keep_samples = snakemake.params['keep_samples']
    bad_samples_path = snakemake.output['bad_samples']
    report_path = snakemake.output['report']
    outliers = snakemake.output['outliers']
    no_outliers_vcf = snakemake.output['no_outliers_vcf']

    missing_samples = float(snakemake.params['missing_samples'])
    alt_hom_samples = float(snakemake.params['alt_hom_samples'])
    het_samples = float(snakemake.params['het_samples'])
    alpha = float(snakemake.params['iqr_alpha'])

    logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG,
                        format='%(levelname)s:%(asctime)s %(message)s')

    # get initial stats, that we do not save
    logging.info("Getting initial vcf stats...")
    psc = get_stats(vcf_input=input_vcf, samples_path=samples, psc_path=psc_path, stats_path=stats_file)

    # filter outliers
    outliers = find_outliers(psc, outliers_file_path=outliers, keep_samples_file_path=keep_samples, alpha=alpha)
    logging.info(f"Found {len(outliers)} outliers!")
    if not outliers.empty:
        bcftools_view(input_vcf, no_outliers_vcf, keep_samples)
    else:
        with open(no_outliers_vcf, 'w') as dummy_no_outliers_vcf:
            dummy_no_outliers_vcf.write('')
        no_outliers_vcf = input_vcf

    # get final stats without outliers
    logging.info("Getting final vcf stats...")
    psc = get_stats(vcf_input=no_outliers_vcf, samples_path=samples, psc_path=psc_path, stats_path=stats_file)

    bad_missing_samples_mask = (psc.missing_share >= missing_samples / 100) | (psc.nMissing + psc.nNonMissing == 0)
    bad_alt_hom_samples_mask = (psc.alt_hom_share < alt_hom_samples / 100) | (psc.nNonMissing == 0)
    bad_het_samples_mask = (psc.het_samples_share < het_samples / 100) | (psc.nNonMissing == 0)

    psc.loc[bad_het_samples_mask, 'exclusion_reason'] = f'Sample has <= {het_samples}% heterozygous SNPs'
    psc.loc[bad_alt_hom_samples_mask, 'exclusion_reason'] = f'Sample has <= {alt_hom_samples}% homozygous alternative SNPs'
    psc.loc[bad_missing_samples_mask, 'exclusion_reason'] = f'Sample has >= {missing_samples}% missing SNPs'

    bad_samples = psc.loc[(bad_alt_hom_samples_mask | bad_het_samples_mask | bad_missing_samples_mask),
                          ['sample_id', 'missing_share', 'alt_hom_share', 'het_samples_share', 'exclusion_reason']]
    psc = pandas.concat([psc, outliers[['sample_id', 'missing_share', 'alt_hom_share', 'het_samples_share', 'exclusion_reason']]], ignore_index=True)

    samples_only = bad_samples.loc[:, ['sample_id']].copy()
    # if input vcf file has iids in form fid_iid we split it, else we just assign fid equal to iid
    samples_only.loc[:, 'fid'] = [s.split('_')[0] if '_' in s else s for s in samples_only.sample_id]
    samples_only.loc[:, 'iid'] = [s.split('_')[1] if '_' in s else s for s in samples_only.sample_id]
    samples_only.loc[:, ['fid', 'iid']].to_csv(bad_samples_path, sep='\t', header=None, index=False)
    bad_samples.to_csv(report_path, sep='\t')

    log = f'We have total of {psc.shape[0]} samples\n' \
          f'{bad_missing_samples_mask.sum()} samples have >= {missing_samples}% missing share\n' \
          f'{bad_alt_hom_samples_mask.sum()} samples have <= {alt_hom_samples}% homozygous alternative variants\n' \
          f'{bad_het_samples_mask.sum()} samples have <= {het_samples}% heterozygous variants\n' \
          f'{len(outliers)} samples detected as outliers\n'
    logging.info(log)
    print(log)
Metagenome-Assembled Genome Analysis of Gut Microbiota (v2.16.2)
import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logging.captureWarnings(True)


def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logging.error(
        "".join(
            [
                "Uncaught exception: ",
                *traceback.format_exception(exc_type, exc_value, exc_traceback),
            ]
        )
    )


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of scripts

from common_report import *

import pandas as pd
import plotly.express as px

from utils.taxonomy import tax2table


def make_plots(bin_table):
    div = {}
    div["input_file"] = bin_table

    # Prepare data
    df = pd.read_table(bin_table)

    if snakemake.config["bin_quality_asesser"].lower() == "busco":
        df["Bin Id"] = df["Input_file"].str.replace(".fasta", "", regex=False)
        logging.info("No taxonomic information available, use busco Dataset")
        lineage_name = "Dataset"
        hover_data = [
            "Scores_archaea_odb10",
            "Scores_bacteria_odb10",
            "Scores_eukaryota_odb10",
        ]
        size_name = None
    elif snakemake.config["bin_quality_asesser"].lower() == "checkm":
        df = df.join(
            tax2table(df["Taxonomy (contained)"], remove_prefix=True).fillna("NA")
        )
        lineage_name = "phylum"
        size_name = "Genome size (Mbp)"
        hover_data = ["genus"]
    elif snakemake.config["bin_quality_asesser"].lower() == "checkm2":
        df["Bin Id"] = df.index
        lineage_name = "Translation_Table_Used"
        hover_data = [
            "Completeness_Model_Used",
            "Coding_Density",
            "Contig_N50",
            "GC_Content",
            "Additional_Notes",
        ]
        size_name = "Genome_Size"
    else:
        raise Exception(f"bin_quality_asesser in the config file not understood")

    df.index = df["Bin Id"]

    div[
        "QualityScore"
    ] = "<p>Quality score is calculated as: Completeness - 5 x Contamination.</p>"

    # 2D plot
    fig = px.scatter(
        data_frame=df,
        y="Completeness",
        x="Contamination",
        color=lineage_name,
        size=size_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    fig.update_yaxes(range=(50, 102))
    fig.update_xaxes(range=(-0.2, 10.1))
    div["2D"] = fig.to_html(**HTML_PARAMS)

    ## By sample
    fig = px.strip(
        data_frame=df,
        y="Quality_score",
        x="Sample",
        color=lineage_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    fig.update_yaxes(range=(50, 102))
    div["bySample"] = fig.to_html(**HTML_PARAMS)

    # By Phylum
    fig = px.strip(
        data_frame=df,
        y="Quality_score",
        x=lineage_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    fig.update_yaxes(range=(50, 102))
    div["byPhylum"] = fig.to_html(**HTML_PARAMS)

    return div


# main
div = make_plots(bin_table=snakemake.input.bin_table)

make_html(
    div=div,
    report_out=snakemake.output.report,
    html_template_file=os.path.join(reports_dir, "template_bin_report.html"),
    wildcards=snakemake.wildcards,
)
utils
Utils
=====

.. image:: https://travis-ci.org/haaksmash/pyutils.svg?branch=master
    :target: https://travis-ci.org/haaksmash/pyutils

Sometimes you write a function over and over again; sometimes you look up at
the ceiling and ask "why, Guido, why isn't this included in the standard
library?"

Well, we perhaps can't answer that question. But we can collect those
functions into a centralized place!

Provided things
+++++++++++++++

Utils is broken up into broad swathes of functionality, to ease the task of
remembering where exactly something lives.

enum
----

Python doesn't have a built-in way to define an enum, so this module provides
(what I think is) a pretty clean way to go about them.

.. code-block:: python

    from utils import enum

    class Colors(enum.Enum):
        RED = 0
        GREEN = 1

        # Defining an Enum class allows you to specify a few
        # things about the way it's going to behave.
        class Options:
            frozen = True # can't change attributes
            strict = True
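The README excerpt ends mid-example above. The snippet below is a speculative usage sketch based only on the class shown; attribute access to members and inequality between distinct members are assumptions drawn from the definition, not behaviour documented in this excerpt.

.. code-block:: python

    # Hypothetical usage of the Colors enum defined above. Attribute access and
    # inequality of distinct members are assumed here, not taken from the docs.
    from utils import enum

    class Colors(enum.Enum):
        RED = 0
        GREEN = 1

    red = Colors.RED              # members are exposed as class attributes
    print(red != Colors.GREEN)    # distinct members are expected to compare unequal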