Workflow Steps and Code Snippets

372 tagged steps and code snippets that match the keyword JSON

Snakemake workflow for building an ANTs template (based on antsMultivariateTemplateConstruction2.sh)

import nibabel as nib
import numpy as np
import json


img = nib.load(snakemake.input[0])

print('config:')
print(snakemake.config)


hdr = img.header
shape = np.array(hdr.get_data_shape()).astype('float').tolist()
zooms = np.array(hdr.get_zooms()).astype('float').tolist()

affine = img.affine
# world coordinates of voxel (0, 0, 0)
origin = affine @ np.array([0, 0, 0, 1])
origin = origin[0:3].astype('float').tolist()

template_dict = dict()

#add extras from config file
template_dict.update(snakemake.config['template_description_extras'])

# add shape, zooms, and origin for this resolution
res_key = '{res:02d}'.format(res=snakemake.config['resolution_index'])
template_dict['res'] = {res_key: {'origin': origin, 'shape': shape, 'zooms': zooms}}


with open(snakemake.output[0],'w') as outfile:
    json.dump(template_dict,outfile,indent=2)
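
For context, this script is driven by the snakemake object injected at runtime, so it only runs via a rule's script: directive. A minimal sketch of such a rule, assuming hypothetical paths and a config that defines template_description_extras and resolution_index:

rule template_description:
    input:
        "results/template/template.nii.gz"  # hypothetical template image
    output:
        "results/template/template_description.json"
    script:
        "scripts/create_template_description.py"  # the script shown above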

Neuroimaging Data Processing Workflow

import numpy as np
import nibabel as nib
import json

# load up the tissue probability maps, warped from the template
tissue_prob_vol = dict()

for label,nii in zip(snakemake.config['tissue_labels'], snakemake.input.tissue_priors):
    print(label)
    print(nii)
    tissue_prob_vol[label] = nib.load(nii).get_fdata()


# load up the k-class tissue segmentation
tissue_k_seg = nib.load(snakemake.input.seg_channels_4d)
print(tissue_k_seg.shape)

sim_prior_k = np.zeros([len(snakemake.config['tissue_labels']),tissue_k_seg.shape[3]])

#for each prior, need to find the channel that best fits
for i,label in enumerate(snakemake.config['tissue_labels']):
    for k in range(tissue_k_seg.shape[3]):

        print(f'Computing overlap of {label} prior and channel {k}... ')
        #compute intersection over union
        s1 = tissue_prob_vol[label] >0.5
        s2 = tissue_k_seg.slicer[:,:,:,k].get_fdata() >0.5
        sim_prior_k[i,k] = np.sum(np.logical_and(s1,s2).flat) / np.sum(np.logical_or(s1,s2).flat) 


print('Overlap table:')
print(sim_prior_k)

label_to_k_dict = dict()

for i,label in enumerate(snakemake.config['tissue_labels']):
    label_to_k_dict[label] = int(np.argmax(sim_prior_k[i,:]))
    #write nii to file
    print('writing image at channel {} to output file: {}'.format(label_to_k_dict[label], \
                                                    snakemake.output.tissue_segs[i]))
    nib.save(tissue_k_seg.slicer[:,:,:,label_to_k_dict[label]],\
                    snakemake.output.tissue_segs[i])


with open(snakemake.output.mapping_json, 'w') as outfile:
    json.dump(label_to_k_dict, outfile,indent=4)
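
The mapping JSON records which channel of the 4D segmentation was matched to each tissue prior. Assuming the common GM/WM/CSF labels for illustration (the actual labels come from snakemake.config['tissue_labels']), the output might look like:

{
    "GM": 2,
    "WM": 0,
    "CSF": 1
}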

Workflow for processing data from the ISMRM Diffusion Study: Best Practices in Image Preprocessing

run:
    import json
    with open(input.json) as f:
        json_dict = json.load(f)
    json_dict['PhaseEncodingDirection'] = 'j'
    with open(output.json, 'w') as f:
        json.dump(json_dict, f, indent=4)
run:
    import json
    with open(input.json) as f:
        json_dict = json.load(f)
    json_dict['PhaseEncodingDirection'] = 'j-'
    with open(output.json, 'w') as f:
        json.dump(json_dict, f, indent=4)
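
The two run: blocks above differ only in the value written to PhaseEncodingDirection ('j' versus 'j-'). A hedged sketch of how they could be collapsed into a single rule, using a hypothetical {dir} wildcard and path layout:

rule set_phase_encoding_direction:
    input:
        json='raw/{dir}.json'
    output:
        json='fixed/{dir}.json'
    params:
        # hypothetical mapping from acquisition direction to BIDS PhaseEncodingDirection
        ped=lambda wildcards: {'AP': 'j-', 'PA': 'j'}[wildcards.dir]
    run:
        import json
        with open(input.json) as f:
            json_dict = json.load(f)
        json_dict['PhaseEncodingDirection'] = params.ped
        with open(output.json, 'w') as f:
            json.dump(json_dict, f, indent=4)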

Template building for the Cam-CAN dataset

import nibabel as nib
import numpy as np
import json


img = nib.load(snakemake.input[0])


hdr = img.header
shape = np.array(hdr.get_data_shape()).astype('float').tolist()
zooms = np.array(hdr.get_zooms()).astype('float').tolist()

affine = img.affine
# world coordinates of voxel (0, 0, 0)
origin = affine @ np.array([0, 0, 0, 1])
origin = origin[0:3].astype('float').tolist()

template_dict = dict()

#add extras from config file
template_dict.update(snakemake.config['template_description_extras'])

# add shape, zooms, and origin for this resolution
res_key = '{res:02d}'.format(res=snakemake.config['resolution_index'])
template_dict['res'] = {res_key: {'origin': origin, 'shape': shape, 'zooms': zooms}}

# add cohorts to the json, with the full list of subjects for each cohort
cohort_dict = dict()
for cohort in snakemake.params.cohorts:
    cohort_dict[cohort] = { 'participants': snakemake.params.subjects[cohort] } 

template_dict.update( { 'cohort': cohort_dict } )


with open(snakemake.output[0],'w') as outfile:
    json.dump(template_dict,outfile,indent=2)
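
The resulting template description combines the resolution metadata with the cohort listing. With illustrative values (the cohort name, subject IDs, and geometry are made up here), it might look like:

{
    "res": {
        "01": {
            "origin": [-96.0, -132.0, -78.0],
            "shape": [193.0, 229.0, 193.0],
            "zooms": [1.0, 1.0, 1.0]
        }
    },
    "cohort": {
        "young": {
            "participants": ["sub-001", "sub-002"]
        }
    }
}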

nnU-Net training workflow for fetal BOLD MRI

import json

#load template json
with open(snakemake.input.template_json) as f:
    dataset = json.load(f)

dataset['training'] = [
    {'image': img, 'label': lbl}
    for img, lbl in zip(snakemake.params.training_imgs_nosuffix, snakemake.input.training_lbls)
]

dataset['numTraining'] = len(dataset['training'])

#write modified json
with open(snakemake.output.dataset_json, 'w') as f:
    json.dump(dataset, f, indent=4)
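
Assuming illustrative case names (the real image list comes from snakemake.params.training_imgs_nosuffix), the modified dataset.json gains entries of this shape, with the rest of the metadata carried over from the template:

{
    "training": [
        {"image": "./imagesTr/case_0000", "label": "./labelsTr/case_0000.nii.gz"},
        {"image": "./imagesTr/case_0001", "label": "./labelsTr/case_0001.nii.gz"}
    ],
    "numTraining": 2
}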

Snakemake workflow for pre-processing DWI data to re-train using b=1000 HCP data

import json
import nibabel as nib
import numpy as np
import sys

with open(snakemake.input.shells) as f:
    shells_dict = json.load(f)

#get bval parameter:
bval = snakemake.params.bval

#input dwi
dwi_nib = nib.load(snakemake.input.dwi)
print(dwi_nib.shape)

#create output shape
newshape = np.array(dwi_nib.shape[:3])

avg_shell = np.zeros(newshape.astype(int))


indices = shells_dict['shell_to_vol'][bval] 

if len(dwi_nib.shape) == 3 and len(indices) == 1 and indices[0] == 0:
    # we have a 3D volume (e.g. b0 only), so just grab it
    avg_shell = dwi_nib.get_fdata()[:, :, :]
elif len(dwi_nib.shape) == 4:
    # otherwise, pick out the indices and average them
    avg_shell = np.mean(dwi_nib.get_fdata()[:, :, :, indices], 3)
else:
    # neither case applies, so the shell indices do not match the volumes
    sys.exit('unable to map shell indices to volumes to compute the average shell')

#now save as image
avg_shell_nii = nib.Nifti1Image(avg_shell, affine=dwi_nib.affine )
avg_shell_nii.to_filename(snakemake.output[0])
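
The shells_dict consumed above is produced by the script that follows. For a hypothetical acquisition with one b=0 volume and three b=1000 volumes, the shells JSON would look like:

{
    "shells": [0, 1000],
    "vol_to_shell": [0, 1000, 1000, 1000],
    "shell_to_vol": {
        "0": [0],
        "1000": [1, 2, 3]
    }
}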
import numpy as np
import json

#load bvals
bvals = np.loadtxt(snakemake.input[0])


# if there is just a single bval (e.g. a reverse phase-encoding b0), then set the shells manually
if bvals.size == 1:

    out_dict = dict()
    shells = [np.around(bvals, -2).astype('int').tolist()]
    out_dict['shells'] = shells
    out_dict['vol_to_shell'] = shells
    out_dict['shell_to_vol'] = {str(shells[0]): [0]}  # the single volume is at index 0

#otherwise try to find shells
else:

    shells = []

    # a histogram is used to find the number of shells, with anywhere from 10 to 100 bins;
    #  we want the highest number of bins that doesn't split up the shells,
    #  so that the bin centers can then be used as the shell b-values

    for nbins in np.arange(10, 100, 5):

        counts, bin_edges = np.histogram(bvals, bins=nbins)

        bin_lhs = bin_edges[:-1]
        bin_rhs = bin_edges[1:]
        bin_centers = (bin_lhs + bin_rhs) / 2

        shells.append(bin_centers[np.where(counts > 0)])


    #get number of shells for each bin-width choice
    nshells = [len(s) for s in shells]

    print('nshells')
    print(nshells)

    #use the highest number of bins that still produces the minimal number of shells
    min_nshells = np.min(nshells)
    possible_shells = np.where(nshells == min_nshells)[0]
    chosen_shells = shells[possible_shells[-1]]

    #round to nearest 100
    chosen_shells = np.around(chosen_shells,-2).astype('int')

    print('chosen_shells')
    print(chosen_shells)

    #write to file
    #np.savetxt(snakemake.output[0],chosen_shells,fmt='%d')



    # assign each bval to a shell by minimum distance
    rep_shells = np.tile(chosen_shells, [len(bvals), 1])
    rep_bvals = np.tile(bvals, [len(chosen_shells), 1]).T

    print(rep_shells)
    print(rep_bvals)

    # absolute difference between each bval and each shell
    diff = np.abs(rep_bvals - rep_shells)

    # index of the closest shell for each volume
    closest_shell = np.argmin(diff, 1)

    # map each volume to the b-value of its closest shell
    vol_to_shell = chosen_shells[closest_shell]

    # get the list of volume indices belonging to each shell
    shell_to_vol = dict()
    for shell in chosen_shells.tolist():
        shell_to_vol[shell] = np.where(vol_to_shell == shell)[0].tolist()

    out_dict = dict()
    out_dict['shells'] = chosen_shells.tolist()
    out_dict['vol_to_shell'] = vol_to_shell.tolist()
    out_dict['shell_to_vol'] = shell_to_vol


#write to json, shells and their indices
with open(snakemake.output[0], 'w') as outfile:
    json.dump(out_dict, outfile,indent=4)

Simulation Workflow for Human Allele Frequency Change Study (v1.0.2)

import demes
import os
import demesdraw

sc = snakemake.wildcards['sc']

class Scenario:
    pulse_times = [
        150,
        130,
        110,
        90,
        70,
        50,
        30,
        10,
    ]

    def __init__(
        self,
        name: str,
        N_anc: int,
        pop_sizes: list[int],
        pulses: list[list],
        path: str
    ):
        assert len(pulses[0]) == N_anc
        assert len(pop_sizes) == (N_anc + 1)
        assert len(pulses) == len(self.pulse_times)
        self.name = name
        self.N_anc = N_anc
        self.pop_sizes = pop_sizes
        self.pulses = pulses
        self.file_prefix = os.path.join(path, 'scenario_' + name)
        self.plot = f"{self.file_prefix}.svg"

    def build(self):
        b = demes.Builder(
            description=self.name,
            time_units="generations",
            generation_time=1,
        )
        b.add_deme(
            "Pop0",
            description="Ancestral 1",
            epochs=[dict(end_time=0, start_size=self.pop_sizes[0])],
        )
        start = 1500
        for i in range(1, self.N_anc):
            b.add_deme(
                f"Pop{i}",
                description=f"Ancestral {i + 1}",
                ancestors=["Pop0"],
                start_time=start,
                epochs=[dict(end_time=0, start_size=self.pop_sizes[i])],
            )
            start -= 200
        b.add_deme(
            f"Pop{self.N_anc}",
            description="Admixed",
            ancestors=["Pop0"],
            start_time=200,
            epochs=[dict(end_time=0, start_size=self.pop_sizes[self.N_anc])],
        )
        for t, p in zip(self.pulse_times, self.pulses):
            b.add_pulse(
                sources=[f"Pop{i}" for i in range(self.N_anc)],
                dest=f"Pop{self.N_anc}",
                proportions=p,
                time=t,
            )
        self.graph = b.resolve()
        demes.dump(self.graph, self.file_prefix + '.yaml')
        demes.dump(self.graph, self.file_prefix + '.json', format='json', simplified=False)
        ax = demesdraw.tubes(self.graph, log_time=True)
        ax.figure.savefig(self.plot)


# ensure the 0s in the pulse proportions are floats!

sc_dict = dict()
# Scenario 2NGF (No Gene Flow)
sc_dict['2NGF'] = Scenario( 
    name="2NGF",
    N_anc=2,
    pop_sizes=[10_000, 10_000, 10_000],
    pulses=[
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
        [.0, .0],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 2A
sc_dict['2A'] = Scenario(
    name="2A",
    N_anc=2,
    pop_sizes=[10_000, 10_000, 10_000],
    pulses=[
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
        [.0, .0],
        [.0, .0],
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 2B
sc_dict['2B'] = Scenario(
    name="2B",
    N_anc=2,
    pop_sizes=[10_000, 10_000, 10_000],
    pulses=[
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
        [.2, .0],
        [.2, .0],
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 2C
sc_dict['2C'] = Scenario(
    name="2C",
    N_anc=2,
    pop_sizes=[10_000, 1_000, 5_000],
    pulses=[
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
        [.2, .0],
        [.2, .0],
        [.0, 0.2],
        [.0, 0.2],
        [.0, 0.2],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 3A
sc_dict['3A'] = Scenario(
    name="3A",
    N_anc=3,
    pop_sizes=[10_000, 10_000, 10_000, 10_000],
    pulses=[
        [.0, .2, .0],
        [.0, .2, .0],
        [.0, .2, .0],
        [.2, .0, .0],
        [.0, .0, .0],
        [.0, .0, .2],
        [.0, .0, .2],
        [.0, .0, .2],
    ],
    path=snakemake.params['outdir'],
)

# Scenario 3B
sc_dict['3B'] = Scenario(
    name="3B",
    N_anc=3,
    pop_sizes=[5_000, 1_000, 10_000, 5_000],
    pulses=[
        [.0, .2, .0],
        [.0, .0, .2],
        [.0, .2, .0],
        [.2, .0, .0],
        [.2, .0, .0],
        [.0, .0, .2],
        [.0, .2, .0],
        [.0, .0, .2],
    ],
    path=snakemake.params['outdir'],
)


sc_dict[sc].build()
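
Once built, the dumped model can be loaded back for downstream simulation steps. A minimal sketch, assuming an illustrative path to one of the YAML files written above:

import demes

# load a resolved demographic model written by the workflow
graph = demes.load('scenario_2A.yaml')
print(graph.description)                    # '2A'
print([deme.name for deme in graph.demes])  # ['Pop0', 'Pop1', 'Pop2']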

Assembly pipeline for 10x Chromium genomes of Mytilus species (v1.0)

run:
    import os
    import subprocess
    import json
    if os.path.exists(output.fasta):
        os.remove(output.fasta)
    # Get clusters for Mollusca taxid=6447
    url = 'http://www.orthodb.org/search?level=6447&limit=50000'
    cmd = f'wget "{url}" -O {output.clustids}'
    subprocess.run(cmd, shell=True, check=True, stderr=subprocess.DEVNULL)
    # Loop for each cluster
    with open(output.clustids, 'r') as fr:
        clusters = json.load(fr)
    for C_id in clusters['data']:
        url = f'http://www.orthodb.org/fasta?id={C_id}&species=all'
        cmd = f'wget "{url}" -O - >> {output.fasta}'
        subprocess.run(cmd, shell=True, check=True, stderr=subprocess.DEVNULL)
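
Shelling out to wget works, but the same fetch can also be done in-process. A hedged sketch using requests against the same OrthoDB endpoints (the output filename is illustrative):

import requests

# fetch cluster ids for Mollusca (taxid 6447)
resp = requests.get('http://www.orthodb.org/search', params={'level': 6447, 'limit': 50000})
clusters = resp.json()

# append each cluster's sequences to a single fasta
with open('mollusca_orthodb.fasta', 'w') as fw:
    for c_id in clusters['data']:
        fasta = requests.get('http://www.orthodb.org/fasta', params={'id': c_id, 'species': 'all'})
        fw.write(fasta.text)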
Format / EDAM

JSON

JavaScript Object Notation; a lightweight, text-based format for representing tree-structured data as key-value pairs.
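
A minimal round-trip in Python's standard library illustrates the format:

import json

data = {'res': {'01': {'zooms': [1.0, 1.0, 1.0]}}}  # nested key-value data
text = json.dumps(data, indent=2)  # serialize to JSON text
assert json.loads(text) == data    # parse it back unchanged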