Workflow Steps and Code Snippets

372 tagged steps and code snippets that match the keyword JSON

Snakemake workflow for building an ANTs template (based on antsMultivariateTemplateConstruction2.sh)

import nibabel as nib
import numpy as np
import json


img = nib.load(snakemake.input[0])

print('config:')
print(snakemake.config)


hdr = img.header
shape = np.array(hdr.get_data_shape()).astype('float').tolist()
zooms = np.array(hdr.get_zooms()).astype('float').tolist()

affine = img.affine
# world coordinates of voxel (0, 0, 0)
origin = affine @ np.array([0, 0, 0, 1])
origin = origin[0:3].astype('float').tolist()

template_dict = dict()

#add extras from config file
template_dict.update(snakemake.config['template_description_extras'])

# add shape, zooms, and origin for this resolution
res_key = '{res:02d}'.format(res=snakemake.config['resolution_index'])
template_dict['res'] = {res_key: {'origin': origin, 'shape': shape, 'zooms': zooms}}


with open(snakemake.output[0],'w') as outfile:
    json.dump(template_dict,outfile,indent=2)
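
For context, this script is driven by the snakemake object injected at runtime, so it only runs via a rule's script: directive. A minimal sketch of such a rule, assuming hypothetical paths and a config that defines template_description_extras and resolution_index:

rule template_description:
    input:
        "results/template/template.nii.gz"  # hypothetical template image
    output:
        "results/template/template_description.json"
    script:
        "scripts/create_template_description.py"  # the script shown above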

Neuroimaging Data Processing Workflow

import numpy as np
import nibabel as nib
import json

# load up the tissue probability maps, warped from the template
tissue_prob_vol = dict()

for label,nii in zip(snakemake.config['tissue_labels'], snakemake.input.tissue_priors):
    print(label)
    print(nii)
    tissue_prob_vol[label] = nib.load(nii).get_fdata()


# load up the k-class tissue segmentation
tissue_k_seg = nib.load(snakemake.input.seg_channels_4d)
print(tissue_k_seg.shape)

sim_prior_k = np.zeros([len(snakemake.config['tissue_labels']),tissue_k_seg.shape[3]])

#for each prior, need to find the channel that best fits
for i,label in enumerate(snakemake.config['tissue_labels']):
    for k in range(tissue_k_seg.shape[3]):

        print(f'Computing overlap of {label} prior and channel {k}... ')
        #compute intersection over union
        s1 = tissue_prob_vol[label] >0.5
        s2 = tissue_k_seg.slicer[:,:,:,k].get_fdata() >0.5
        sim_prior_k[i,k] = np.sum(np.logical_and(s1,s2).flat) / np.sum(np.logical_or(s1,s2).flat) 


print('Overlap table:')
print(sim_prior_k)

label_to_k_dict = dict()

for i,label in enumerate(snakemake.config['tissue_labels']):
    label_to_k_dict[label] = int(np.argmax(sim_prior_k[i,:]))
    #write nii to file
    print('writing image at channel {} to output file: {}'.format(label_to_k_dict[label], \
                                                    snakemake.output.tissue_segs[i]))
    nib.save(tissue_k_seg.slicer[:,:,:,label_to_k_dict[label]],\
                    snakemake.output.tissue_segs[i])


with open(snakemake.output.mapping_json, 'w') as outfile:
    json.dump(label_to_k_dict, outfile,indent=4)
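
The mapping JSON records which channel of the 4D segmentation was matched to each tissue prior. Assuming the common GM/WM/CSF labels for illustration (the actual labels come from snakemake.config['tissue_labels']), the output might look like:

{
    "GM": 2,
    "WM": 0,
    "CSF": 1
}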

Workflow for processing data from the ISMRM Diffusion Study: Best Practices in Image Preprocessing

run:
    import json
    with open(input.json) as f:
        json_dict = json.load(f)
    json_dict['PhaseEncodingDirection'] = 'j'
    with open(output.json, 'w') as f:
        json.dump(json_dict, f, indent=4)
run:
    import json
    with open(input.json) as f:
        json_dict = json.load(f)
    json_dict['PhaseEncodingDirection'] = 'j-'
    with open(output.json, 'w') as f:
        json.dump(json_dict, f, indent=4)
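
The two run: blocks above differ only in the value written to PhaseEncodingDirection ('j' versus 'j-'). A hedged sketch of how they could be collapsed into a single rule, using a hypothetical {dir} wildcard and path layout:

rule set_phase_encoding_direction:
    input:
        json='raw/{dir}.json'
    output:
        json='fixed/{dir}.json'
    params:
        # hypothetical mapping from acquisition direction to BIDS PhaseEncodingDirection
        ped=lambda wildcards: {'AP': 'j-', 'PA': 'j'}[wildcards.dir]
    run:
        import json
        with open(input.json) as f:
            json_dict = json.load(f)
        json_dict['PhaseEncodingDirection'] = params.ped
        with open(output.json, 'w') as f:
            json.dump(json_dict, f, indent=4)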

Template building for the Cam-CAN dataset

import nibabel as nib
import numpy as np
import json


img = nib.load(snakemake.input[0])


hdr = img.header
shape = np.array(hdr.get_data_shape()).astype('float').tolist()
zooms = np.array(hdr.get_zooms()).astype('float').tolist()

affine = img.affine
# world coordinates of voxel (0, 0, 0)
origin = affine @ np.array([0, 0, 0, 1])
origin = origin[0:3].astype('float').tolist()

template_dict = dict()

#add extras from config file
template_dict.update(snakemake.config['template_description_extras'])

# add shape, zooms, and origin for this resolution
res_key = '{res:02d}'.format(res=snakemake.config['resolution_index'])
template_dict['res'] = {res_key: {'origin': origin, 'shape': shape, 'zooms': zooms}}

# add cohorts to the json, with the full list of subjects for each cohort
cohort_dict = dict()
for cohort in snakemake.params.cohorts:
    cohort_dict[cohort] = { 'participants': snakemake.params.subjects[cohort] } 

template_dict.update( { 'cohort': cohort_dict } )


with open(snakemake.output[0],'w') as outfile:
    json.dump(template_dict,outfile,indent=2)
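
The resulting template description combines the resolution metadata with the cohort listing. With illustrative values (the cohort name, subject IDs, and geometry are made up here), it might look like:

{
    "res": {
        "01": {
            "origin": [-96.0, -132.0, -78.0],
            "shape": [193.0, 229.0, 193.0],
            "zooms": [1.0, 1.0, 1.0]
        }
    },
    "cohort": {
        "young": {
            "participants": ["sub-001", "sub-002"]
        }
    }
}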

nnU-Net training workflow for fetal BOLD MRI

import json

#load template json
with open(snakemake.input.template_json) as f:
    dataset = json.load(f)

dataset['training'] = [
    {'image': img, 'label': lbl}
    for img, lbl in zip(snakemake.params.training_imgs_nosuffix, snakemake.input.training_lbls)
]

dataset['numTraining'] = len(dataset['training'])

#write modified json
with open(snakemake.output.dataset_json, 'w') as f:
    json.dump(dataset, f, indent=4)
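
Assuming illustrative case names (the real image list comes from snakemake.params.training_imgs_nosuffix), the modified dataset.json gains entries of this shape, with the rest of the metadata carried over from the template:

{
    "training": [
        {"image": "./imagesTr/case_0000", "label": "./labelsTr/case_0000.nii.gz"},
        {"image": "./imagesTr/case_0001", "label": "./labelsTr/case_0001.nii.gz"}
    ],
    "numTraining": 2
}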

Snakemake workflow for pre-processing DWI data to re-train using b=1000 HCP data

import json
import nibabel as nib
import numpy as np
import sys

with open(snakemake.input.shells) as f:
    shells_dict = json.load(f)

#get bval parameter:
bval = snakemake.params.bval

#input dwi
dwi_nib = nib.load(snakemake.input.dwi)
print(dwi_nib.shape)

#create output shape
newshape = np.array(dwi_nib.shape[:3])

avg_shell = np.zeros(newshape.astype(int))


indices = shells_dict['shell_to_vol'][bval] 

if len(dwi_nib.shape) == 3 and len(indices) == 1 and indices[0] == 0:
    # we have a 3D volume (e.g. b0 only), so just grab it
    avg_shell = dwi_nib.get_fdata()[:, :, :]
elif len(dwi_nib.shape) == 4:
    # otherwise, pick out the indices and average them
    avg_shell = np.mean(dwi_nib.get_fdata()[:, :, :, indices], 3)
else:
    # neither case applies, so the shell indices do not match the volumes
    sys.exit('unable to map shell indices to volumes to compute the average shell')

#now save as image
avg_shell_nii = nib.Nifti1Image(avg_shell, affine=dwi_nib.affine )
avg_shell_nii.to_filename(snakemake.output[0])
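
The shells_dict consumed above is produced by the script that follows. For a hypothetical acquisition with one b=0 volume and three b=1000 volumes, the shells JSON would look like:

{
    "shells": [0, 1000],
    "vol_to_shell": [0, 1000, 1000, 1000],
    "shell_to_vol": {
        "0": [0],
        "1000": [1, 2, 3]
    }
}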
import numpy as np
import json

#load bvals
bvals = np.loadtxt(snakemake.input[0])


# if there is just a single bval (e.g. a reverse phase-encoding b0), then set the shells manually
if bvals.size == 1:

    out_dict = dict()
    shells = [np.around(bvals, -2).astype('int').tolist()]
    out_dict['shells'] = shells
    out_dict['vol_to_shell'] = shells
    out_dict['shell_to_vol'] = {str(shells[0]): [0]}  # the single volume is at index 0

#otherwise try to find shells
else:

    shells = []

    # a histogram is used to find the number of shells, with anywhere from 10 to 100 bins;
    #  we want the highest number of bins that doesn't split up the shells,
    #  so that the bin centers can then be used as the shell b-values

    for nbins in np.arange(10, 100, 5):

        counts, bin_edges = np.histogram(bvals, bins=nbins)

        bin_lhs = bin_edges[:-1]
        bin_rhs = bin_edges[1:]
        bin_centers = (bin_lhs + bin_rhs) / 2

        shells.append(bin_centers[np.where(counts > 0)])


    #get number of shells for each bin-width choice
    nshells = [len(s) for s in shells]

    print('nshells')
    print(nshells)

    #use the highest number of bins that still produces the minimal number of shells
    min_nshells = np.min(nshells)
    possible_shells = np.where(nshells == min_nshells)[0]
    chosen_shells = shells[possible_shells[-1]]

    #round to nearest 100
    chosen_shells = np.around(chosen_shells,-2).astype('int')

    print('chosen_shells')
    print(chosen_shells)

    #write to file
    #np.savetxt(snakemake.output[0],chosen_shells,fmt='%d')



    # assign each bval to a shell by minimum distance
    rep_shells = np.tile(chosen_shells, [len(bvals), 1])
    rep_bvals = np.tile(bvals, [len(chosen_shells), 1]).T

    print(rep_shells)
    print(rep_bvals)

    # absolute difference between each bval and each shell
    diff = np.abs(rep_bvals - rep_shells)

    # index of the closest shell for each volume
    closest_shell = np.argmin(diff, 1)

    # map each volume to the b-value of its closest shell
    vol_to_shell = chosen_shells[closest_shell]

    # get the list of volume indices belonging to each shell
    shell_to_vol = dict()
    for shell in chosen_shells.tolist():
        shell_to_vol[shell] = np.where(vol_to_shell == shell)[0].tolist()

    out_dict = dict()
    out_dict['shells'] = chosen_shells.tolist()
    out_dict['vol_to_shell'] = vol_to_shell.tolist()
    out_dict['shell_to_vol'] = shell_to_vol


#write to json, shells and their indices
with open(snakemake.output[0], 'w') as outfile:
    json.dump(out_dict, outfile,indent=4)

Simulation Workflow for Human Allele Frequency Change Study (v1.0.2)

import demes
import os
import demesdraw

sc = snakemake.wildcards['sc']

class Scenario:
    pulse_times = [
        150,
        130,
        110,
        90,
        70,
        50,
        30,
        10,
    ]

    def __init__(
        self,
        name: str,
        N_anc: int,
        pop_sizes: list[int],
        pulses: list[list],
        path: str
    ):
        assert len(pulses[0]) == N_anc
        assert len(pop_sizes) == (N_anc + 1)
        assert len(pulses) == len(self.pulse_times)
        self.name = name
        self.N_anc = N_anc
        self.pop_sizes = pop_sizes
        self.pulses = pulses
        self.file_prefix = os.path.join(path, 'scenario_' + name)
        self.plot = f"{self.file_prefix}.svg"

    def build(self):
        b = demes.Builder(
            description=self.name,
            time_units="generations",
            generation_time=1,
        )
        b.add_deme(
            "Pop0",
            description="Ancestral 1",
            epochs=[dict(end_time=0, start_size=self.pop_sizes[0])],
        )
        start = 1500
        for i in range(1, self.N_anc):
            b.add_deme(
                f"Pop{i}",
                description=f"Ancestral {i + 1}",
                ancestors=["Pop0"],
                start_time=start,
                epochs=[dict(end_time=0, start_size=self.pop_sizes[i])],
            )
            start -= 200
        b.add_deme(
            f"Pop{self.N_anc}",
            description="Admixed",
            ancestors=["Pop0"],
            start_time=200,
            epochs=[dict(end_time=0, start_size=self.pop_sizes[self.N_anc])],
        )
        for t, p in zip(self.pulse_times, self.pulses):
            b.add_pulse(
                sources=[f"Pop{i}" for i in range(self.N_anc)],
                dest=f"Pop{self.N_anc}",
                proportions=p,
                time=t,
            )
        self.graph = b.resolve()
        demes.dump(self.graph, self.file_prefix + '.yaml')
        demes.dump(self.graph, self.file_prefix + '.json', format='json', simplified=False)
        ax = demesdraw.tubes(self.graph, log_time=True)
        ax.figure.savefig(self.plot)


# ensure the 0s in the pulse proportions are floats!

sc_dict = dict()
# Scenario 2NGF (No Gene Flow)
sc_dict['2NGF'] = Scenario( 
    name="2NGF",
    N_anc=2,
    pop_sizes=[10_000, 10_000, 10_000],
    pulses=[
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 2A
sc_dict['2A'] = Scenario(
    name="2A",
    N_anc=2,
    pop_sizes=[10_000, 10_000, 10_000],
    pulses=[
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
        [.0, .0],
        [.0, .0],
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 2B
sc_dict['2B'] = Scenario(
    name="2B",
    N_anc=2,
    pop_sizes=[10_000, 10_000, 10_000],
    pulses=[
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
        [.2, .0],
        [.2, .0],
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 2C
sc_dict['2C'] = Scenario(
    name="2C",
    N_anc=2,
    pop_sizes=[10_000, 1_000, 5_000],
    pulses=[
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
        [.2, .0],
        [.2, .0],
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 3A
sc_dict['3A'] = Scenario(
    name="3A",
    N_anc=3,
    pop_sizes=[10_000, 10_000, 10_000, 10_000],
    pulses=[
        [.0, .2, .0],
        [.0, .2, .0],
        [.0, .2, .0],
        [.2, .0, .0],
        [.0, .0, .0],
        [.0, .0, .2],
        [.0, .0, .2],
        [.0, .0, .2],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 3B
sc_dict['3B'] = Scenario(
    name="3B",
    N_anc=3,
    pop_sizes=[5_000, 1_000, 10_000, 5_000],
    pulses=[
        [.0, .2, .0],
        [.0, .0, .2],
        [.0, .2, .0],
        [.2, .0, .0],
        [.2, .0, .0],
        [.0, .0, .2],
        [.0, .2, .0],
        [.0, .0, .2],
    ],
    path=snakemake.params['outdir'],
)


sc_dict[sc].build()
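
Once built, the dumped model can be loaded back for downstream simulation steps. A minimal sketch, assuming an illustrative path to one of the YAML files written above:

import demes

# load a resolved demographic model written by the workflow
graph = demes.load('scenario_2A.yaml')
print(graph.description)                    # '2A'
print([deme.name for deme in graph.demes])  # ['Pop0', 'Pop1', 'Pop2']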

Assembly pipeline for 10x Chromium genomes of Mytilus species (v1.0)

run:
    import os
    import subprocess
    import json
    if os.path.exists(output.fasta):
        os.remove(output.fasta)
    # Get clusters for Mollusca taxid=6447
    url = 'http://www.orthodb.org/search?level=6447&limit=50000'
    cmd = f'wget "{url}" -O {output.clustids}'
    subprocess.run(cmd, shell=True, check=True, stderr=subprocess.DEVNULL)
    # Loop for each cluster
    with open(output.clustids, 'r') as fr:
        clusters = json.load(fr)
    for C_id in clusters['data']:
        url = f'http://www.orthodb.org/fasta?id={C_id}&species=all'
        cmd = f'wget "{url}" -O - >> {output.fasta}'
        subprocess.run(cmd, shell=True, check=True, stderr=subprocess.DEVNULL)
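
Shelling out to wget works, but the same fetch can also be done in-process. A hedged sketch using requests against the same OrthoDB endpoints (the output filename is illustrative):

import requests

# fetch cluster ids for Mollusca (taxid 6447)
resp = requests.get('http://www.orthodb.org/search', params={'level': 6447, 'limit': 50000})
clusters = resp.json()

# append each cluster's sequences to a single fasta
with open('mollusca_orthodb.fasta', 'w') as fw:
    for c_id in clusters['data']:
        fasta = requests.get('http://www.orthodb.org/fasta', params={'id': c_id, 'species': 'all'})
        fw.write(fasta.text)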
Format / EDAM

JSON

JavaScript Object Notation; a lightweight, text-based format for representing tree-structured data as key-value pairs.
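
A minimal round-trip in Python's standard library illustrates the format:

import json

data = {'res': {'01': {'zooms': [1.0, 1.0, 1.0]}}}  # nested key-value data
text = json.dumps(data, indent=2)  # serialize to JSON text
assert json.loads(text) == data    # parse it back unchanged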