Pipeline for the manuscript 'Accurate detection of shared genetic architecture from GWAS summary statistics in the small-sample context'


This repository contains a Snakemake pipeline that reproduces the results in the manuscript named above.

Running the pipeline to obtain the manuscript results
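
The pipeline is orchestrated with Snakemake, so the manuscript results are regenerated by asking Snakemake to build the relevant target files. A minimal sketch of such an invocation is shown below; the `all` target name, the use of conda environments, and the core count are illustrative assumptions rather than values taken from this repository.

# Preview the jobs Snakemake would schedule (dry run); 'all' is a hypothetical target rule
snakemake -n all

# Run the pipeline; --use-conda and the core count are assumptions, adjust to your setup
snakemake --use-conda --cores 8 all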

Running real data analyses

The file workfl

Code Snippets

run:
    if wildcards.chr == 'chrX':
        shell("wget -O resources/1000g/{wildcards.chr}.vcf.gz http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.{wildcards.chr}.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz")
    elif wildcards.chr == 'chrY':
        shell("wget -O resources/1000g/{wildcards.chr}.vcf.gz http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chrY.phase3_integrated_v2b.20130502.genotypes.vcf.gz")
    else:
        shell("wget -O resources/1000g/{wildcards.chr}.vcf.gz http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.{wildcards.chr}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz")
shell:
    """
    wget -O resources/1000g/integrated_call_samples_v3.20130502.ALL.panel.txt http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel
    wget -O resources/1000g/integrated_call_samples_v3.20200731.ALL.ped.txt http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20200731.ALL.ped
    """
shell:
    "plink2 --memory {resources.mem_mb} --threads {threads} --vcf {input} --make-bed --out {params.out} --set-all-var-ids @:#:\$r:\$a --max-alleles 2 --new-id-max-allele-len 20 'truncate'"
script:
  "../scripts/1000g/get_euro_fam_file.R"
shell:
  "plink2 --memory {resources.mem_mb} --threads {threads} --bfile resources/1000g/{wildcards.chr} --keep resources/1000g/euro.fam --make-bed --silent --out resources/1000g/euro/{wildcards.chr}_euro"
shell:
    "plink2 --memory {resources.mem_mb} --threads {threads} --bfile resources/1000g/euro/{wildcards.chr}_euro --geno 0.1 --mind 0.1 --maf 0.005 --hwe 1e-50 --rm-dup 'force-first' --make-bed --silent --out resources/1000g/euro/qc/{wildcards.chr}_qc"
script: "../scripts/1000g/make_subset_ranges.R"
script: "../scripts/1000g/make_subset_ranges.R"
shell:
  "plink2 --memory {resources.mem_mb} --threads {threads} --bfile {params.bfile} --extract {input.range_file} --make-bed --out {params.out}"
shell:
  "plink --memory {resources.mem_mb} --threads {threads} --bfile {params.bfile} --indep-pairwise {wildcards.window} {wildcards.step} {params.r2} --out {params.prune_out}"
shell:
  "for x in {input}; do cat $x >>{output}; done"
shell:
    "for x in {input}; do cat $x >>{output}; done"
shell:
    "plink2 --memory {resources.mem_mb} --threads {threads} --bfile {params.input_stem} --snps-only --make-bed --silent --out {params.output_stem}"
shell:
    "workflow/scripts/gps_cpp/build/apps/computeGpsCLI -i {input} -a {params.a_colname} -b {params.b_colname} -c {wildcards.effect_blocks_A} -d {wildcards.effect_blocks_B} -p 0 -f naive -n {threads} -o {output.result_file} -j {output.timing_file}"
shell:
    "workflow/scripts/gps_cpp/build/apps/computeGpsCLI -i {input} -a {params.a_colname} -b {params.b_colname} -c {wildcards.effect_blocks_A} -d {wildcards.effect_blocks_B} -p {wildcards.no_of_pert_iterations} -f {wildcards.algorithm} -n {threads} -o {output.result_file} -j {output.timing_file}"
shell:
    """
    for x in {input.naive}; do
        head -n 1 $x | sed 's/^ \(.*\)s wall,.*$/\1/' >>{output.naive}
    done

    for x in {input.pp}; do
        head -n 1 $x | sed 's/^ \(.*\)s wall,.*$/\1/' >>{output.pp}
    done

    for x in {input.lw}; do
        head -n 1 $x | sed 's/^ \(.*\)s wall,.*$/\1/' >>{output.lw}
    done
    """
shell:
    "workflow/scripts/gps_cpp/build/apps/permuteTraitsCLI -i {input} -o {output} -a {params.a_colname} -b {params.b_colname} -c {threads} -n {wildcards.draws} -p {params.no_of_pert_iterations}"
shell:
    """
    touch {output}
    """
shell:
  "Rscript workflow/scripts/ukbb/fit_gev_to_increasing_n.R -a {wildcards.trait_A} -b {wildcards.trait_B} -p {input} -n 500 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 -o {output}"
SnakeMake From line 6 of gps/gof.smk
shell:
    """
    echo -e 'trait_A\ttrait_B\tn\tloc\tloc.sd\tscale\tscale.sd\tshape\tshape.sd' >> {output}
    for x in {input}; do
    cat <(tail -n +2 $x) >> {output}
    done
    """
SnakeMake From line 14 of gps/gof.smk
shell:
    "Rscript workflow/scripts/ukbb/plot_gev_estimates_for_increasing_n.R -f {input} -o {output}"
SnakeMake From line 27 of gps/gof.smk
shell:
  "Rscript workflow/scripts/ukbb/plot_gev_estimates_for_increasing_no_snps.R -i {params.fit_file_string} -n 10000 50000 100000 200000 300000 400000 -o {output}"
SnakeMake From line 41 of gps/gof.smk
shell:
  "Rscript workflow/scripts/ukbb/plot_null_dists_to_compare.R -f {input.fit_file} -p {input.perm_file} --exp1_null {output.exp1_null} --gev_null {output.gev_null} --exp1_gev_combined {output.exp1_gev_combined}"
SnakeMake From line 56 of gps/gof.smk
shell:
  "Rscript workflow/scripts/ukbb/plot_gev_gof_plots.R -f {input.fit_file} -p {input.perm_file} -l {wildcards.trait_A}-{wildcards.trait_B} -o {output}"
SnakeMake From line 69 of gps/gof.smk
shell:
    """
    echo -e "Trait_A\tTrait_B\tGPS" >{output}

    for x in {input}; do
        tail -n 1 $x >>{output}
    done
    """
SnakeMake From line 81 of gps/gof.smk
shell:
    "workflow/scripts/gps_cpp/build/apps/fitAndEvaluateEcdfsCLI -i {input.sum_stats_file} -a {wildcards.trait_A} -b {wildcards.trait_B} -n {threads} -o {output} -f pp"
script: "../../scripts/gps/annotate_intermediate_gps_output.R"
shell:
    "workflow/scripts/gps/plot_gps_denom_heatmap.R -i {input} -o {output}"
script: "../../scripts/gps/compile_top_maximands_for_ukbb_traits.R"
script:
    "../../scripts/gps/simulate_correlated_p_values.R"
shell:
    "workflow/scripts/gps_cpp/build/apps/computeGpsCLI -i {input} -a {params.a_colname} -b {params.b_colname} -c {params.a_colname} -d {params.b_colname} -n {threads} -f pp -o {output}"
shell:
    "workflow/scripts/gps_cpp/build/apps/permuteTraitsCLI -i {input} -o {output} -a {params.a_colname} -b {params.b_colname} -c {threads} -n {wildcards.draws}"
script:
    "../../scripts/compute_li_gps_pvalue.R"
script:
    "../../scripts/fit_gev_and_compute_gps_pvalue.R"
run:
    # Parse rho, zmean, zsd and the replicate number from each GPS-GEV result path,
    # then read the GPS statistic and GEV-based p-value from the file's second line
    gev = []

    for x in input.gev:
        print(x.split('/'))
        rho, zmean_zsd, rep = x.split('/')[2:5]

        rho = float(rho.replace('_', '.'))
        zmean = int(zmean_zsd.split('_')[0])
        zsd = int(zmean_zsd.split('_')[1])
        rep = int(rep)

        with open(x, 'r') as infile:
            line = infile.readline()
            line = infile.readline()
            gps = line.split()[0]
            pvalue = line.split()[8]

        gev.append(
            {
                'rho' : rho,
                'zmean' : zmean,
                'zsd' : zsd,
                'rep' : rep,
                'gps' : gps,
                'pvalue' : pvalue,
                'stat' : 'GPS-GEV'
            }
         )

    gev_daf = pd.DataFrame(gev)

    li = []

    for x in input.li:
        rho, zmean_zsd, rep = x.split('/')[2:5]

        rho = float(rho.replace('_', '.'))
        zmean = int(zmean_zsd.split('_')[0])
        zsd = int(zmean_zsd.split('_')[1])
        rep = int(rep)

        with open(x, 'r') as infile:
            line = infile.readline()
            line = infile.readline()
            gps = line.split()[0]
            pvalue = line.split()[1]

            li.append(
                {
                    'rho' : rho,
                    'zmean' : zmean,
                    'zsd' : zsd,
                    'rep' : rep,
                    'gps' : gps,
                    'pvalue' : pvalue,
                    'stat' : 'GPS-Exp'
                }
            )

    li_daf = pd.DataFrame(li)

    pd.concat([gev_daf, li_daf]).to_csv(output[0], index = False, sep = '\t')
script:
    "../../scripts/gps/plot_fig_s10.R"
shell:
    "workflow/scripts/gps_cpp/build/apps/computeGpsCLI -i {input.sum_stats_file} -a {params.a_colname} -b {params.b_colname} -c {wildcards.effect_blocks_A} -d {wildcards.effect_blocks_B} -n {threads} -f pp -o {output}"
shell:
    "workflow/scripts/gps_cpp/build/apps/permuteTraitsCLI -i {input.sum_stats_file} -o {output} -a {params.a_colname} -b {params.b_colname} -c {threads} -n {wildcards.draws}"
script:
    "../../scripts/compute_li_gps_pvalue.R"
script:
    "../../scripts/fit_gev_and_compute_gps_pvalue.R"
script: "../../scripts/simgwas/compute_hoeffdings.R"
shell:
    "workflow/scripts/gps_cpp/build/apps/computeGpsCLI -i {input.sum_stats_file} -a {params.a_colname} -b {params.b_colname} -c {wildcards.effect_blocks_A} -d {wildcards.effect_blocks_B} -n {threads} -f pp -o {output}"
shell:
    "workflow/scripts/gps_cpp/build/apps/permuteTraitsCLI -i {input.sum_stats_file} -o {output} -a {params.a_colname} -b {params.b_colname} -c {threads} -n {wildcards.draws}"
script:
    "../../scripts/compute_li_gps_pvalue.R"
script:
    "../../scripts/fit_gev_and_compute_gps_pvalue.R"
shell:
    """
    Rscript workflow/scripts/compute_hoeffdings.R -i {input.sum_stats_file} -a {params.a_colname} -b {params.b_colname} -o {output} -nt 1
    sed -i 's/{params.a_colname}/{wildcards.effect_blocks_A}_{wildcards.shared_effect_blocks}/' {output}
    sed -i 's/{params.b_colname}/{wildcards.effect_blocks_B}_{wildcards.shared_effect_blocks}/' {output}
    """
shell:
    "workflow/scripts/gps_cpp/build/apps/fitAndEvaluateEcdfsCLI -i {input.sum_stats_file} -a {params.a_colname} -b {params.b_colname} -n {threads} -o {output}"
shell:
  "workflow/scripts/gps_cpp/build/apps/computeGpsCLI -i {input.sum_stats_file} -a {wildcards.trait_A} -b {wildcards.trait_B} -c {wildcards.trait_A} -d {wildcards.trait_B} -n {threads} -f pp -o {output}"
shell:
  "workflow/scripts/gps_cpp/build/apps/permuteTraitsCLI -i {input.sum_stats_file} -o {output} -a {wildcards.trait_A} -b {wildcards.trait_B} -c {threads} -n {wildcards.draws}"
script:
  "../../scripts/fit_gev_and_compute_gps_pvalue.R"
script:
    "../../scripts/compute_li_gps_pvalue.R"
shell:
    """
    python2 $ldsc/munge_sumstats.py --sumstats {input} --N-con-col ncontrols --N-cas-col ncases --snp id --out {params.output_filename} --signed-sumstats {params.signed_sumstats_col} --p {params.pvalue_col} --a1 a1 --a2 a0 --frq EUR;
    """
run:
    block_files = get_randomised_block_files_for_pair(wildcards)

    # Probably too clever by half
    a_block_files = block_files[-(2*params.no_of_blocks_in_genome):-params.no_of_blocks_in_genome]
    b_block_files = block_files[-params.no_of_blocks_in_genome:]

    file_m = re.compile(params.file_regex)

    a_dicts = []

    for x in a_block_files:
        print(x)

        m = file_m.match(x)

        a_dicts.append(
            {
                "chr" : m.group("chr"),
                "block" : m.group("block_no"),
                "effect" : m.group("effect"),
                "no_cvs" : cv_per_block_dict[m.group("effect")]
            }
        )

    block_a_daf = pd.DataFrame(a_dicts)

    block_a_daf.sort_values(by = ['chr', 'block', 'effect', 'no_cvs'], inplace = True)

    b_dicts = []

    for x in b_block_files:
        m = file_m.match(x)

        b_dicts.append(
            {
                "chr" : m.group("chr"),
                "block" : m.group("block_no"),
                "effect" : m.group("effect"),
                "no_cvs" : cv_per_block_dict[m.group("effect")]
            }
        )

    block_b_daf = pd.DataFrame(b_dicts)

    block_b_daf.sort_values(by = ['chr', 'block', 'effect', 'no_cvs'], inplace = True)

    block_a_daf.to_csv(output.a_block_file, index = False, sep = '\t')

    block_b_daf.to_csv(output.b_block_file, index = False, sep = '\t')
script: "../../scripts/ldsc/calculate_theoretical_rg_randomised_blocks.R"
shell:
    """
    python2 $ldsc/munge_sumstats.py --sumstats {input} --N-con-col ncontrols --N-cas-col ncases --snp id --out {params.output_filename} --signed-sumstats {params.signed_sumstats_col} --p {params.pvalue_col} --a1 a1 --a2 a0 --frq EUR;
    """
run:
    block_files = get_randomised_block_files_for_pair(wildcards)

    # Probably too clever by half
    a_block_files = block_files[-(2*params.no_of_blocks_in_genome):-params.no_of_blocks_in_genome]
    b_block_files = block_files[-params.no_of_blocks_in_genome:]

    file_m = re.compile("results/simgwas/simulated_sum_stats/block_sum_stats/(?P<no_reps>\d+)_reps/(?P<effect>\w+)(/(?P<cv>\d+)_cv)?/(?P<ncases>\d+)_(?P<ncontrols>\d+)/chr(?P<chr>\d+)/block_(?P<block_no>\d+)_seed_(?P<seed>\d+)_sum_stats\.tsv\.gz")

    a_dicts = []

    for x in a_block_files:
        print(x)

        m = file_m.match(x)

        a_dicts.append(
            {
                "chr" : m.group("chr"),
                "block" : m.group("block_no"),
                "effect" : m.group("effect"),
                "no_cvs" : cv_per_block_dict[m.group("effect")]
            }
        )

    block_a_daf = pd.DataFrame(a_dicts)

    block_a_daf.sort_values(by = ['chr', 'block', 'effect', 'no_cvs'], inplace = True)

    b_dicts = []

    for x in b_block_files:
        m = file_m.match(x)

        b_dicts.append(
            {
                "chr" : m.group("chr"),
                "block" : m.group("block_no"),
                "effect" : m.group("effect"),
                "no_cvs" : cv_per_block_dict[m.group("effect")]
            }
        )

    block_b_daf = pd.DataFrame(b_dicts)

    block_b_daf.sort_values(by = ['chr', 'block', 'effect', 'no_cvs'], inplace = True)

    block_a_daf.to_csv(output.a_block_file, index = False, sep = '\t')

    block_b_daf.to_csv(output.b_block_file, index = False, sep = '\t')
script: "../../scripts/ldsc/calculate_theoretical_rg_randomised_blocks.R"
shell:
    "wget -O {output} https://storage.googleapis.com/broad-alkesgroup-public/LDSCORE/eur_w_ld_chr.tar.bz2"
SnakeMake From line 11 of ldsc/ldsc.smk
shell:
    "tar -xjf {input} -C {params.output_root}"
SnakeMake From line 24 of ldsc/ldsc.smk
script: "../../scripts/ldsc/write_out_ld_score_snps.R"
SnakeMake From line 34 of ldsc/ldsc.smk
script: "../../scripts/ldsc/preprocess_ukbb_sum_stats_for_ldsc.R"
shell:
    """
    python $ldsc/munge_sumstats.py --sumstats {input} --N-col {params.n_col} --snp {params.id_col} --out {params.output_filename} --signed-sumstats {params.signed_sumstats_col} --p {params.pvalue_col} --a1 a1 --a2 a2 --frq EUR;
    """
run:
    # The permutation file holds one permuted GPS value per line after its header;
    # each simulation result file holds its GPS statistic in the first field of its
    # second line. Read both and write them out side by side.
    perm_gps = []

    sim_gps = []

    with open(input.perm_file, 'r') as perm_file:
        line = perm_file.readline()
        for i in range(len(input.gps_files)):
            line = perm_file.readline()
            perm_gps.append(float(line.strip()))

    for x in input.gps_files:
        with open(x, 'r') as gps_file:
            line = gps_file.readline()
            line = gps_file.readline()
            sim_gps.append(float(line.split('\t')[0]))

    pd.DataFrame({'sim_gps': sim_gps, 'perm_gps': perm_gps}).to_csv(output[0], sep = '\t', index = False)
run:
    # Compile GEV-null ('gev') and exponential-null ('exp') GPS p-values across the
    # r2 values used for pruning, tagging each row with its r2 and null distribution
    daf = pd.DataFrame(columns = ['ncases.A', 'ncontrols.A', 'ncases.B', 'ncontrols.B', 'blocks.A',
                                  'blocks.B', 'shared_blocks', 'tag_pair', 'seed', 'gps', 'pval'])

    for x in r2_values:
        gps_daf = compile_gps_results_into_daf(get_test_files("results/simgwas/simulation_parameters.tsv", reps = 400, filetype = 'gps', subset = f"a_blocks == \'{wildcards.blocks}\' & shared_blocks == \'{wildcards.shared_blocks}\' & ncases_A == {wildcards.ncases}", r2 = x))
        gps_daf.drop(columns = ['n', 'loc', 'loc.sd', 'scale', 'scale.sd', 'shape', 'shape.sd'], inplace = True)
        gps_daf = gps_daf.assign(r2 = x, dist = 'gev')

        li_gps_daf = compile_li_gps_results_into_daf(get_test_files("results/simgwas/simulation_parameters.tsv", reps = 400, filetype = 'li_gps', subset = f"a_blocks == \'{wildcards.blocks}\' & shared_blocks == \'{wildcards.shared_blocks}\' & ncases_A == {wildcards.ncases}", r2 = x))
        li_gps_daf = li_gps_daf.assign(r2 = x, dist = 'exp')

        daf = pd.concat([daf, gps_daf, li_gps_daf])

    daf.to_csv(output[0], sep = '\t', index = False)
from scipy.stats import chi2
import os
import pandas as pd
import re
from numpy import nan

def get_all_block_files(sample_sizes, effect):

    block_files = []

    seed_label = f"{effect}_seed"

    for row in block_daf.itertuples():
        for sample_size in sample_sizes:
            if effect == 'tiny':
                block_files.append(f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{effect}/2_cv/{sample_size[0]}_{sample_size[1]}/chr{row.chr}/block_{row.block}_seed_{getattr(row, seed_label)}_sum_stats.tsv.gz")
            else:
                block_files.append(f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{effect}/1_cv/{sample_size[0]}_{sample_size[1]}/chr{row.chr}/block_{row.block}_seed_{getattr(row, seed_label)}_sum_stats.tsv.gz")


    return block_files

def get_all_simulation_done_files(simulation_pars_file, reps, subset = None):
    daf = pd.read_csv(simulation_pars_file, sep = '\t')

    if subset:
        daf = daf.query('a_blocks == @subset & b_blocks == @subset')

    return [f"results/simgwas/done/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}.done" for row in daf.itertuples()]

def get_all_randomised_block_files(simulation_pars_file, reps):
    daf = pd.read_csv(simulation_pars_file, sep = '\t')

    a_block_files =  [f"results/simgwas/simulated_sum_stats/whole_genome_sum_stats/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/seed_{row.seed}_sum_stats_A_tags_{row.tag_A}-{row.tag_B}_files.txt" for row in daf.itertuples()]
    b_block_files = [f"results/simgwas/simulated_sum_stats/whole_genome_sum_stats/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/seed_{row.seed}_sum_stats_B_tags_{row.tag_A}-{row.tag_B}_files.txt" for row in daf.itertuples()]

    return a_block_files+b_block_files

def get_test_files(simulation_pars_file, reps, filetype, subset = None, draws = "3000", window = "1000kb", step = 50, r2 = 0.2):
    daf = pd.read_csv(simulation_pars_file, sep = '\t')

    str_r2 = str(r2).replace('.', '_')

    if subset:
        daf = daf.query(subset)

    if filetype == 'ldsc':
        files = [f"results/ldsc/simgwas/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/rg/fixed_h2_free_rg_intercept/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}.log" for row in daf.itertuples()]
    elif filetype == 'sumher':
        files = [f"results/ldak/ldak-thin/simgwas/{reps}_reps/randomised/rg/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}.cors" for row in daf.itertuples()]
    elif filetype == 'hoeffdings':
        files = [f"results/hoeffdings/simgwas/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/window_{window}_step_{step}_r2_{str_r2}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}_hoeffdings.tsv" for row in daf.itertuples()]
    elif filetype == 'gps':
        files = [f"results/gps/simgwas/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/window_{window}_step_{step}_r2_{str_r2}/{draws}_permutations/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}_gps_pvalue.tsv" for row in daf.itertuples()]
    elif filetype == 'theo':
        files = [f"results/ldsc/simgwas/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/theoretical_rg/seed_{row.seed}_{row.tag_A}-{row.tag_B}_theo_rg.tsv" for row in daf.itertuples()]
    elif filetype == 'li_gps':
        files = [f"results/gps/simgwas/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/window_{window}_step_{step}_r2_{str_r2}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}_li_gps_pvalue.tsv" for row in daf.itertuples()]
    elif filetype == 'mean_stat':
        files = [f"results/gps/simgwas/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/window_{window}_step_{step}_r2_{str_r2}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}/mean_stat/{draws}_permutations/pp_pert_1_pvalue.tsv" for row in daf.itertuples()]
    elif filetype == 'done':
        files = [f"results/simgwas/done/{reps}_reps/randomised/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}.done" for row in daf.itertuples()]
    else:
        raise Exception(f"Invalid filetype specified: {filetype}")

    return files

def compile_sumher_results_into_daf(input_files):
    d = []

    for x in input_files:

        m = re.match(r"results/ldak/ldak-thin/simgwas/(?P<no_reps>\d+)_reps/randomised/rg(/chr\d+)?/(?P<ncases_A>\d+)_(?P<ncontrols_A>\d+)_(?P<ncases_B>\d+)_(?P<ncontrols_B>\d+)/(?P<a_blocks>[\w-]+)_(?P<b_blocks>[\w-]+)_(?P<shared_blocks>[\w-]+)/seed_(?P<seed>\w+)_tags_(?P<tag_a>\d+)-(?P<tag_b>\d+)\.cors", x)

        try:
            with open(x, 'r') as infile:

                line = infile.readline()

                while re.match("^Her1_All", line) is None:
                    line = infile.readline()

                h2_A, h2_A_se = re.match("Her1_All (-?\d+\.\d+|-?nan) (\d+\.\d+|nan)", line).groups()

                line = infile.readline()

                h2_B, h2_B_se = re.match("Her2_All (-?\d+\.\d+|-?nan) (\d+\.\d+|nan)", line).groups()

                line = infile.readline()

                cov, cov_se = re.match("Coher_All (-?\d+\.\d+|-?nan) (\d+\.\d+|nan)", line).groups()

                line = infile.readline()

                rg, rg_se = re.match("Cor_All (-?\d+\.\d+|-?nan) (\d+\.\d+|nan)", line).groups()

            rg_z = float(rg)/float(rg_se)

            rg_p = chi2.sf(rg_z**2, df = 1, loc = 0, scale = 1)

            d.append(
                {
                    'ncases.A' : m.group('ncases_A'),
                    'ncontrols.A' : m.group('ncontrols_A'),
                    'ncases.B' : m.group('ncases_B'),
                    'ncontrols.B' : m.group('ncontrols_B'),
                    'blocks.A' : m.group('a_blocks'),
                    'blocks.B' : m.group('b_blocks'),
                    'shared_blocks' : m.group('shared_blocks'),
                    'tag_pair' : f"{m.group('tag_a')}-{m.group('tag_b')}",
                    'seed' : f"{m.group('seed')}",
                    'h2.A' : float(h2_A),
                    'h2.A.se' : float(h2_A_se),
                    'h2.B' : float(h2_B),
                    'h2.B.se' : float(h2_B_se),
                    'gcov' : float(cov),
                    'gcov.se' : float(cov_se),
                    'rg' : float(rg),
                    'rg.se' : float(rg_se),
                    'rg.z' : float(rg_z),
                    'rg.p' : float(rg_p)
                }
            )
        except FileNotFoundError:
            continue

    return pd.DataFrame(d)

def compile_ldsc_results_into_daf(input_files):
    d = []

    h2_regex = r"Total Liability scale h2: (.+)\s+\((.+)\)"
    int_regex = r"Intercept: (.+)\s+\((.+)\)"

    gcov_regex = r"Total Liability scale gencov: (.+)\s+\((.+)\)"
    gcov_zprod_regex = r"Mean z1\*z2: (.+)"

    for x in input_files:

        m = re.match(r"results/ldsc/simgwas/(?P<no_reps>\d+)_reps/randomised/(chr\d+/)?(?P<ncases_A>\d+)_(?P<ncontrols_A>\d+)_(?P<ncases_B>\d+)_(?P<ncontrols_B>\d+)/(?P<a_blocks>[\w-]+)_(?P<b_blocks>[\w-]+)_(?P<shared_blocks>[\w-]+)/rg/fixed_h2_free_rg_intercept/seed_(?P<seed>\w+)_tags_(?P<tag_a>\d+)-(?P<tag_b>\d+)\.log", x)

        try:
            with open(x, 'r') as infile:
                line = infile.readline()

                # TODO fix these for the null case
                while re.match(h2_regex, line) is None and re.match('ERROR', line) is None:
                    line = infile.readline()

                if re.match('ERROR', line):
                    d.append(
                        {
                            'ncases.A' : m.group('ncases_A'),
                            'ncontrols.A' : m.group('ncontrols_A'),
                            'ncases.B' : m.group('ncases_B'),
                            'ncontrols.B' : m.group('ncontrols_B'),
                            #'odds_ratio.A': odds_ratios_A,
                            #'odds_ratio.B': odds_ratios_B,
                            'blocks.A' : m.group('a_blocks'),
                            'blocks.B' : m.group('b_blocks'),
                            'shared_blocks' : m.group('shared_blocks'),
                            'tag_pair' : f"{m.group('tag_a')}-{m.group('tag_b')}",
                            'seed' : f"{m.group('seed')}",
                            'h2.A' : nan,
                            'h2.A.se' : nan,
                            'h2.B' : nan,
                            'h2.B.se' : nan,
                            'gcov' : nan,
                            'gcov.se' : nan,
                            'rg' : nan,
                            'rg.se' : nan,
                            'rg.z' : nan,
                            'rg.p' : nan
                        }
                    )

                else:
                    h2_match_A = re.match(h2_regex, line)
                    h2_A = float(h2_match_A.group(1))
                    h2_A_se = float(h2_match_A.group(2))

                    line = infile.readline()
                    line = infile.readline()
                    line = infile.readline()

                    h2_int_A_match = re.match(int_regex, line)

                    if h2_int_A_match:
                        h2_int_A = float(h2_int_A_match.group(1))
                        h2_int_A_se = float(h2_int_A_match.group(2))
                    elif 'constrained to 1.' in line:
                        h2_int_A = 1.0
                        h2_int_A_se = nan
                    else:
                        raise Exception("No match for h2_B int_regex")

                    while re.match(h2_regex, line) is None:
                        line = infile.readline()

                    h2_match_B = re.match(h2_regex, line)
                    h2_B = float(h2_match_B.group(1))
                    h2_B_se = float(h2_match_B.group(2))

                    line = infile.readline()
                    line = infile.readline()
                    line = infile.readline()

                    h2_int_B_match = re.match(int_regex, line)

                    if h2_int_B_match:
                        h2_int_B = float(h2_int_B_match.group(1))
                        h2_int_B_se = float(h2_int_B_match.group(2))
                    elif 'constrained to 1.' in line:
                        h2_int_B = 1.0
                        h2_int_B_se = nan
                    else:
                        raise Exception("No match for h2_B int_regex")

                    while re.match(gcov_regex, line) is None:
                        line = infile.readline()

                    gcov_match = re.match(gcov_regex, line)
                    gcov = float(gcov_match.group(1))
                    gcov_se = float(gcov_match.group(2))

                    line = infile.readline()

                    gcov_zprod_match = re.match(gcov_zprod_regex, line)
                    gcov_zprod = float(gcov_zprod_match.group(1))

                    line = infile.readline()

                    gcov_int_match = re.match(int_regex, line)

                    if gcov_int_match:
                        gcov_int = float(gcov_int_match.group(1))
                        gcov_int_se = float(gcov_int_match.group(2))
                    elif 'constrained to 0.' in line:
                        gcov_int = 0.0
                        gcov_int_se = nan
                    else:
                        raise Exception("No match for gcov_int_regex")

                    line = infile.readline()

                    while re.match("^p1\s", line) is None:
                        line = infile.readline()

                    line = infile.readline()

                    rg, rg_se, rg_z, rg_p = [float(z) if z != 'NA' else nan for z in line.split()[2:6]]

                d.append(
                    {
                        'ncases.A' : m.group('ncases_A'),
                        'ncontrols.A' : m.group('ncontrols_A'),
                        'ncases.B' : m.group('ncases_B'),
                        'ncontrols.B' : m.group('ncontrols_B'),
                        #'odds_ratio.A': odds_ratios_A,
                        #'odds_ratio.B': odds_ratios_B,
                        'blocks.A' : m.group('a_blocks'),
                        'blocks.B' : m.group('b_blocks'),
                        'shared_blocks' : m.group('shared_blocks'),
                        'tag_pair' : f"{m.group('tag_a')}-{m.group('tag_b')}",
                        'seed' : f"{m.group('seed')}",
                        'h2.A' : h2_A,
                        'h2.A.se' : h2_A_se,
                        'h2.B' : h2_B,
                        'h2.B.se' : h2_B_se,
                        'gcov' : gcov,
                        'gcov.se' : gcov_se,
                        'rg' : rg,
                        'rg.se' : rg_se,
                        'rg.z' : rg_z,
                        'rg.p' : rg_p
                    }
                )
        except FileNotFoundError:
            continue
        except AttributeError:
            print(x)


    return pd.DataFrame(d)

def compile_hoeffdings_results_into_daf(input_files):
    d = []

    for x in input_files:
        m = re.match(r"results/hoeffdings/simgwas/(?P<no_reps>\d+)_reps/randomised(/chr\d+)?/(?P<ncases_A>\d+)_(?P<ncontrols_A>\d+)_(?P<ncases_B>\d+)_(?P<ncontrols_B>\d+)/(?P<a_blocks>[\w-]+)_(?P<b_blocks>[\w-]+)_(?P<shared_blocks>[\w-]+)/window_(?P<window>\d+kb)_step_(?P<step>\d+)_r2_(?P<r2>\d+_\d+)/seed_(?P<seed>\w+)_tags_(?P<tag_a>\d+)-(?P<tag_b>\d+)_hoeffdings\.tsv", x)

        try:
            with open(x, 'r') as infile:
                lines = [x.strip() for x in infile.readlines()]

            # NB: The 'n' here is no. of SNPs, not no. of permutations as with GPS
            _, _, n, Dn, scaled, pval = lines[1].split('\t')

            d.append(
                {
                    'ncases.A' : m.group('ncases_A'),
                    'ncontrols.A' : m.group('ncontrols_A'),
                    'ncases.B' : m.group('ncases_B'),
                    'ncontrols.B' : m.group('ncontrols_B'),
                    'blocks.A' : m.group('a_blocks'),
                    'blocks.B' : m.group('b_blocks'),
                    'shared_blocks' : m.group('shared_blocks'),
                    'tag_pair' : f"{m.group('tag_a')}-{m.group('tag_b')}",
                    'seed' : f"{m.group('seed')}",
                    'window': m.group('window'),
                    'step': m.group('step'),
                    'r2': float(m.group('r2').replace('_', '.')),
                    'hoeff.p' : pval
                }
            )

        except ValueError:
            _, _, n, Dn, scaled = lines[1].split('\t')
            pval = nan

            d.append(
                {
                    'ncases.A' : m.group('ncases_A'),
                    'ncontrols.A' : m.group('ncontrols_A'),
                    'ncases.B' : m.group('ncases_B'),
                    'ncontrols.B' : m.group('ncontrols_B'),
                    'blocks.A' : m.group('a_blocks'),
                    'blocks.B' : m.group('b_blocks'),
                    'shared_blocks' : m.group('shared_blocks'),
                    'tag_pair' : f"{m.group('tag_a')}-{m.group('tag_b')}",
                    'seed' : f"{m.group('seed')}",
                    'window': m.group('window'),
                    'step': m.group('step'),
                    'r2': float(m.group('r2').replace('_', '.')),
                    'hoeff.p' : pval
                }
            )
            continue
        except FileNotFoundError:
            continue

    return pd.DataFrame(d)

def compile_gps_results_into_daf(input_files):
    d = []

    for x in input_files:

        m = re.match(r"results/gps/simgwas/(?P<no_reps>\d+)_reps/randomised/(chr\d+/)?(?P<ncases_A>\d+)_(?P<ncontrols_A>\d+)_(?P<ncases_B>\d+)_(?P<ncontrols_B>\d+)/(?P<a_blocks>[\w-]+)_(?P<b_blocks>[\w-]+)_(?P<shared_blocks>[\w-]+)/window_(?P<window>\d+kb)_step_(?P<step>\d+)_r2_(?P<r2>\d+_\d+)/(?P<draws>\d+)_permutations/seed_(?P<seed>\w+)_tags_(?P<tag_a>\d+)-(?P<tag_b>\d+)_gps_pvalue\.tsv", x)

        try:
            with open(x, 'r') as infile:
                lines = [x.strip() for x in infile.readlines()]

            gps, n, loc, loc_sd, scale, scale_sd, shape, shape_sd, pval = lines[1].split('\t')

            d.append(
                {
                    'ncases.A' : m.group('ncases_A'),
                    'ncontrols.A' : m.group('ncontrols_A'),
                    'ncases.B' : m.group('ncases_B'),
                    'ncontrols.B' : m.group('ncontrols_B'),
                    'blocks.A' : m.group('a_blocks'),
                    'blocks.B' : m.group('b_blocks'),
                    'shared_blocks' : m.group('shared_blocks'),
                    'tag_pair' : f"{m.group('tag_a')}-{m.group('tag_b')}",
                    'seed' : f"{m.group('seed')}",
                    'window': m.group('window'),
                    'step': m.group('step'),
                    'r2': float(m.group('r2').replace('_', '.')),
                    'gps' : gps,
                    'n' : n,
                    'loc' : loc,
                    'loc.sd' : loc_sd,
                    'scale' : scale,
                    'scale.sd' : scale_sd,
                    'shape' : shape,
                    'shape.sd' : shape_sd,
                    'pval' : pval,
                }
            )
        except FileNotFoundError:
            continue

    return pd.DataFrame(d)

def compile_li_gps_results_into_daf(input_files):
    d = []

    for x in input_files:

        m = re.match(r"results/gps/simgwas/(?P<no_reps>\d+)_reps/randomised/(chr\d+/)?(?P<ncases_A>\d+)_(?P<ncontrols_A>\d+)_(?P<ncases_B>\d+)_(?P<ncontrols_B>\d+)/(?P<a_blocks>[\w-]+)_(?P<b_blocks>[\w-]+)_(?P<shared_blocks>[\w-]+)/window_(?P<window>\d+kb)_step_(?P<step>\d+)_r2_(?P<r2>\d+_\d+)/seed_(?P<seed>\w+)_tags_(?P<tag_a>\d+)-(?P<tag_b>\d+)_li_gps_pvalue\.tsv", x)

        try:
            with open(x, 'r') as infile:
                lines = [x.strip() for x in infile.readlines()]

            gps, pval = lines[1].split('\t')

            d.append(
                {
                    'ncases.A' : m.group('ncases_A'),
                    'ncontrols.A' : m.group('ncontrols_A'),
                    'ncases.B' : m.group('ncases_B'),
                    'ncontrols.B' : m.group('ncontrols_B'),
                    'blocks.A' : m.group('a_blocks'),
                    'blocks.B' : m.group('b_blocks'),
                    'shared_blocks' : m.group('shared_blocks'),
                    'tag_pair' : f"{m.group('tag_a')}-{m.group('tag_b')}",
                    'seed' : f"{m.group('seed')}",
                    'window': m.group('window'),
                    'step': m.group('step'),
                    'r2': float(m.group('r2').replace('_', '.')),
                    'gps' : gps,
                    'pval' : pval
                }
            )
        except FileNotFoundError:
            continue

    return pd.DataFrame(d)

def compile_theo_results_into_daf(input_files):
    d = []

    for x in input_files:

        m = re.match(r"results/ldsc/simgwas/(?P<no_reps>\d+)_reps/randomised/(chr\d+/)?(?P<ncases_A>\d+)_(?P<ncontrols_A>\d+)_(?P<ncases_B>\d+)_(?P<ncontrols_B>\d+)/(?P<a_blocks>[\w-]+)_(?P<b_blocks>[\w-]+)_(?P<shared_blocks>[\w-]+)/theoretical_rg/seed_(?P<seed>\w+)_(?P<tag_a>\d+)-(?P<tag_b>\d+)_theo_rg\.tsv", x)

        with open(x, 'r') as infile:
            lines = [x.strip() for x in infile.readlines()]

        _, _, _, _, _, h2_theo_obs_A, h2_theo_obs_B, h2_theo_liab_A, h2_theo_liab_B, V_A_A, V_A_B, C_A_AB, r_A_AB  = lines[1].split('\t')

        d.append(
            {
                'ncases.A' : m.group('ncases_A'),
                'ncontrols.A' : m.group('ncontrols_A'),
                'ncases.B' : m.group('ncases_B'),
                'ncontrols.B' : m.group('ncontrols_B'),
                'blocks.A' : m.group('a_blocks'),
                'blocks.B' : m.group('b_blocks'),
                'shared_blocks' : m.group('shared_blocks'),
                'tag_pair' : f"{m.group('tag_a')}-{m.group('tag_b')}",
                'seed' : f"{m.group('seed')}",
                "h2.theo.obs.A" : float(h2_theo_obs_A),
                "h2.theo.obs.B" : float(h2_theo_obs_B),
                "h2.theo.liab.A" : float(h2_theo_liab_A),
                "h2.theo.liab.B" : float(h2_theo_liab_B),
                "V_A.A" : float(V_A_A),
                "V_A.B" : float(V_A_B),
                "C_A.AB" : float(C_A_AB),
                "r_A.AB" : float(r_A_AB)
            }
        )

    return pd.DataFrame(d)
shell: "touch {output}"
shell: "touch {output}"
shell: "touch {output}"
shell: "touch {output}"
shell: "touch {output}"
shell: "touch {output}"
run:
    ldsc_daf = compile_ldsc_results_into_daf(input.ldsc_files)
    ldsc_daf.to_csv(output.ldsc_out, sep = '\t', index = False)

    sumher_daf = compile_sumher_results_into_daf(input.sumher_files)
    sumher_daf.to_csv(output.sumher_out, sep = '\t', index = False)

    hoeffdings_daf = compile_hoeffdings_results_into_daf(input.hoeffdings_files)
    hoeffdings_daf.to_csv(output.hoeffdings_out, sep = '\t', index = False)

    gps_daf = compile_gps_results_into_daf(input.gps_files)
    gps_daf.to_csv(output.gps_out, sep = '\t', index = False)

    li_gps_daf = compile_li_gps_results_into_daf(input.li_gps_files)
    li_gps_daf.to_csv(output.li_gps_out, sep = '\t', index = False)

    theo_daf = compile_theo_results_into_daf(input.theo_files)
    theo_daf.to_csv(output[0], sep = '\t', index = False)

    shell("touch {output.done_out}")
shell: "touch {output}"
run:
    ldsc_daf = compile_ldsc_results_into_daf(input.ldsc_files)
    ldsc_daf.to_csv(output.ldsc_out, sep = '\t', index = False)

    sumher_daf = compile_sumher_results_into_daf(input.sumher_files)
    sumher_daf.to_csv(output.sumher_out, sep = '\t', index = False)

    hoeffdings_daf = compile_hoeffdings_results_into_daf(input.hoeffdings_files)
    hoeffdings_daf.to_csv(output.hoeffdings_out, sep = '\t', index = False)

    gps_daf = compile_gps_results_into_daf(input.gps_files)
    gps_daf.to_csv(output.gps_out, sep = '\t', index = False)

    li_gps_daf = compile_li_gps_results_into_daf(input.li_gps_files)
    li_gps_daf.to_csv(output.li_gps_out, sep = '\t', index = False)

    theo_daf = compile_theo_results_into_daf(input.theo_files)
    theo_daf.to_csv(output[0], sep = '\t', index = False)

    shell("touch {output.done_out}")
def get_simulation_runtime(wildcards, attempt):
    return 10 + attempt*5

def get_null_block_files(wildcards):
    effect_block_files = get_effect_block_files(wildcards)

    null_block_files_to_omit = [re.sub('tiny|small|medium|large|vlarge|huge|intermediate|random_\d+-\d+', 'null', x) for x in effect_block_files]

    null_block_files = [f"results/simgwas/simulated_sum_stats/block_sum_stats/{{no_reps}}_reps/null/{{ncases}}_{{ncontrols}}/chr{row.chr}/block_{row.block}_seed_{row.null_seed}_sum_stats.tsv.gz" for row in block_daf.itertuples()]

    for x in null_block_files_to_omit:
        if x in null_block_files:
            null_block_files.remove(x)

    return null_block_files

def get_effect_block_files(wildcards):
    if wildcards.effect_blocks == 'null':
        return []

    block_file_format = "results/simgwas/simulated_sum_stats/block_sum_stats/{no_reps}_reps/%s/{ncases}_{ncontrols}/chr%d/block_%d_seed_%d_sum_stats.tsv.gz"

    effect_block_files = []

    for x in wildcards.effect_blocks.split('/')[-1].split('+'):
        block_match = re.match('^(\d+)-(.+)', x)

        chrom = int(block_match.group(1))

        if ':' in block_match.group(2):
            range_match = re.match('([tsmlvhi]|r\d+-\d+-)(\d+):(\d+)', block_match.group(2))

            # random effect
            # TODO no handling of seed in this atm
            if 'r' in range_match.group(1):
                effect = 'random_' + range_match.group(1)[1:-1]

                effect_block_files += [block_file_format % (chrom, effect, y) for y in range(int(range_match.group(2)), int(range_match.group(3))+1) if y in block_dict[chrom]]
            # non-random effect
            else:
                effect = effect_size_dict[range_match.group(1)]

                seed_col = effect + "_seed"

                for block in range(int(range_match.group(2)), int(range_match.group(3))+1):
                    if block in block_daf.query('chr == @chrom')['block']:

                        seed = block_daf.query('chr == @chrom & block == @block')[seed_col].values[0]

                        effect_block_files.append(block_file_format % (effect, chrom, block, seed))

        else:
            singleton_match = re.match('([tsmlvhi]|r\d+-\d+-)(\d+)', x)

            block = int(singleton_match.group(2))

            # random effect
            # TODO no handling of seed in this atm
            if 'r' in x:
                effect = 'random_' + singleton_match.group(1)[1:-1]

                if int(singleton_match.group(2)) in block_dict[chrom]:
                    effect_block_files += [block_file_format % (chrom, effect, int(singleton_match.group(2)))]
            # non-random effect
            else:
                effect = effect_size_dict[singleton_match.group(1)]

                seed_col = effect + "_seed"

                seed = block_daf.query('chr == @chrom & block == @block')[seed_col].values[0]

                if block in block_daf.query('chr == @chrom')['block']:
                    effect_block_files.append(block_file_format % (effect, chrom, block, seed))

    return effect_block_files
import pandas as pd

def get_one_chrom_simulation_done_files(simulation_pars_file, reps, subset = None):
    daf = pd.read_csv(simulation_pars_file, sep = '\t')

    if subset:
        daf = daf.query(subset)

    #daf.assign(r2 = lambda dataframe: dataframe['r2'].map(lambda r2: float(r2.replace('_', '.'))))
    daf['r2'] = daf['r2'].apply(lambda x: str(x).replace('.', '_'))

    return [f"results/simgwas/done/{reps}_reps/randomised/{row.chr}/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/window_{row.window}_step_{row.step}_r2_{row.r2}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}.done" for row in daf.itertuples()]

def get_one_chrom_test_files(simulation_pars_file, reps, filetype, subset = None, draws = None):
    daf = pd.read_csv(simulation_pars_file, sep = '\t')

    if subset:
        daf = daf.query('a_blocks == @subset & b_blocks == @subset')

    daf['r2'] = daf['r2'].apply(lambda x: str(x).replace('.', '_'))

    if filetype == 'ldsc':
        files = [f"results/ldsc/simgwas/{reps}_reps/randomised/{row.chr}/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/rg/fixed_h2_free_rg_intercept/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}.log" for row in daf.itertuples()]
    elif filetype == 'sumher':
        files = [f"results/ldak/ldak-thin/simgwas/{reps}_reps/randomised/rg/{row.chr}/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}.cors" for row in daf.itertuples()]
    elif filetype == 'hoeffdings':
        files = [f"results/hoeffdings/simgwas/{reps}_reps/randomised/{row.chr}/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/window_{row.window}_step_{row.step}_r2_{row.r2}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}_hoeffdings.tsv" for row in daf.itertuples()]
    elif filetype == 'gps':
        files = [f"results/gps/simgwas/{reps}_reps/randomised/{row.chr}/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/window_{row.window}_step_{row.step}_r2_{row.r2}/{draws}_permutations/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}_gps_pvalue.tsv" for row in daf.itertuples()]
    elif filetype == 'theo':
        files = [f"results/ldsc/simgwas/{reps}_reps/randomised/{row.chr}/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/theoretical_rg/seed_{row.seed}_{row.tag_A}-{row.tag_B}_theo_rg.tsv" for row in daf.itertuples()]
    elif filetype == 'li_gps':
        files = [f"results/gps/simgwas/{reps}_reps/randomised/{row.chr}/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/window_{row.window}_step_{row.step}_r2_{row.r2}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}_li_gps_pvalue.tsv" for row in daf.itertuples()]
    elif filetype == 'done':
        files = [f"results/simgwas/done/{reps}_reps/randomised/{row.chr}/{row.ncases_A}_{row.ncontrols_A}_{row.ncases_B}_{row.ncontrols_B}/{row.a_blocks}_{row.b_blocks}_{row.shared_blocks}/seed_{row.seed}_tags_{row.tag_A}-{row.tag_B}.done" for row in daf.itertuples()]
    else:
        raise Exception(f"Invalid filetype specified: {filetype}")

    return files
import pandas as pd

# TODO why do we store block nos in a list rather than in the dict?
# TODO should probably store no of causal variants per block, too
def get_randomised_chrom_block_tuples_for_pair(wildcards):
    random.seed(wildcards.seed)

    shared_chrom_block_nos = []
    shared_chrom_block_dict = {}

    if wildcards.shared_effect_blocks != 'null':
        for token in wildcards.shared_effect_blocks.split('-'):
            block_match = re.match('([tsmlvhi])(\d+)', token)

            if not block_match:
                raise ValueError("Invalid block format: %s" % token)

            shared_effect = effect_size_dict[block_match.group(1)]

            no_of_shared_blocks = int(block_match.group(2))

            no_of_cvs_per_block = cv_per_block_dict[shared_effect]

            no_of_shared_blocks /= no_of_cvs_per_block

            i = 0

            shared_chrom_block_dict[shared_effect] = []

            while i < max(no_of_shared_blocks, 0):
                chrom = random.choice(range(1, 23))
                block_no = random.choice(block_daf.query('chr == @chrom')[['block']].values.transpose().tolist()[0])

                if (chrom, block_no) not in shared_chrom_block_nos:
                    shared_chrom_block_nos.append((chrom, block_no))
                    shared_chrom_block_dict[shared_effect].append((chrom, block_no, no_of_cvs_per_block))
                    i += 1

    a_chrom_block_nos = []
    a_chrom_block_dict = {}

    if wildcards.effect_blocks_A != 'null':
        for token in wildcards.effect_blocks_A.split('-'):
            block_match_a = re.match('([tsmlvhi])(\d+)', token)

            if not block_match_a:
                raise ValueError("Invalid block format: %s" % token)

            effect_a = effect_size_dict[block_match_a.group(1)]

            no_of_blocks_a = int(block_match_a.group(2))

            no_of_cvs_per_block_a = cv_per_block_dict[effect_a]

            no_of_blocks_a /= no_of_cvs_per_block_a

            i = 0

            a_chrom_block_dict[effect_a] = []

            if effect_a not in shared_chrom_block_dict.keys():
                shared_chrom_block_dict[effect_a] = []

            while i < max(no_of_blocks_a-len(shared_chrom_block_dict[effect_a]), 0):
                chrom = random.choice(range(1, 23))

                block_no = random.choice(block_daf.query('chr == @chrom')[['block']].values.transpose().tolist()[0])

                if (chrom, block_no) not in shared_chrom_block_nos and (chrom, block_no) not in a_chrom_block_nos:
                    a_chrom_block_nos.append((chrom, block_no))
                    a_chrom_block_dict[effect_a].append((chrom, block_no, no_of_cvs_per_block_a))
                    i += 1

    b_chrom_block_nos = []
    b_chrom_block_dict = {}

    if wildcards.effect_blocks_B != 'null':
        for token in wildcards.effect_blocks_B.split('-'):
            block_match_b = re.match('([tismlvh])(\d+)', token)

            if not block_match_b:
                raise ValueError("Invalid block format: %s" % token)

            effect_b = effect_size_dict[block_match_b.group(1)]

            no_of_blocks_b = int(block_match_b.group(2))

            no_of_cvs_per_block_b = cv_per_block_dict[effect_b]

            no_of_blocks_b /= no_of_cvs_per_block_b

            i = 0

            b_chrom_block_dict[effect_b] = []

            if effect_b not in shared_chrom_block_dict.keys():
                shared_chrom_block_dict[effect_b] = []

            while i < max(no_of_blocks_b-len(shared_chrom_block_dict[effect_b]), 0):
                chrom = random.choice(range(1, 23))
                block_no = random.choice(block_daf.query('chr == @chrom')[['block']].values.transpose().tolist()[0])


                if (chrom, block_no) not in shared_chrom_block_nos and (chrom, block_no) not in a_chrom_block_nos and (chrom, block_no) not in b_chrom_block_nos:
                    b_chrom_block_nos.append((chrom, block_no))
                    b_chrom_block_dict[effect_b].append((chrom, block_no, no_of_cvs_per_block_b))
                    i += 1

    return (shared_chrom_block_nos, a_chrom_block_nos, b_chrom_block_nos, shared_chrom_block_dict, a_chrom_block_dict, b_chrom_block_dict)

def get_randomised_block_files_for_pair(wildcards):
    shared_chrom_block_nos, a_chrom_block_nos, b_chrom_block_nos, shared_chrom_block_dict, a_chrom_block_dict, b_chrom_block_dict = get_randomised_chrom_block_tuples_for_pair(wildcards)

    block_files = []

    a_block_files = []
    b_block_files = []

    for k in shared_chrom_block_dict.keys():
        seed_label = k + '_seed'

        for v in shared_chrom_block_dict[k]:
            seed = block_daf.query('chr == @v[0] & block == @v[1]')[seed_label].values[0]

            a_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{k}/{v[2]}_cv/{wildcards.ncases_A}_{wildcards.ncontrols_A}/chr{v[0]}/block_{v[1]}_seed_{seed}_sum_stats.tsv.gz"

            b_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{k}/{v[2]}_cv/{wildcards.ncases_B}_{wildcards.ncontrols_B}/chr{v[0]}/block_{v[1]}_seed_{seed}_sum_stats.tsv.gz"

            block_files.append(a_file)

            a_block_files.append(a_file)
            b_block_files.append(b_file)

            if wildcards.ncases_A != wildcards.ncases_B or wildcards.ncontrols_A != wildcards.ncontrols_B:
                block_files.append(b_file)

    for k in a_chrom_block_dict.keys():
        seed_label = k + '_seed'

        for v in a_chrom_block_dict[k]:
            seed = block_daf.query('chr == @v[0] & block == @v[1]')[seed_label].values[0]

            a_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{k}/{v[2]}_cv/{wildcards.ncases_A}_{wildcards.ncontrols_A}/chr{v[0]}/block_{v[1]}_seed_{seed}_sum_stats.tsv.gz"

            block_files.append(a_file)
            a_block_files.append(a_file)

    for k in b_chrom_block_dict.keys():
        seed_label = k + '_seed'

        for v in b_chrom_block_dict[k]:
            seed = block_daf.query('chr == @v[0] & block == @v[1]')[seed_label].values[0]

            b_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{k}/{v[2]}_cv/{wildcards.ncases_B}_{wildcards.ncontrols_B}/chr{v[0]}/block_{v[1]}_seed_{seed}_sum_stats.tsv.gz"

            b_block_files.append(b_file)

            if b_file not in block_files:
                block_files.append(b_file)

    for chrom in range(1,23):
        for block in block_daf.query('chr == @chrom')['block']:
            seed = block_daf.query('chr == @chrom & block == @block')['null_seed'].values[0]

            if (chrom, block) not in a_chrom_block_nos and (chrom, block) not in shared_chrom_block_nos:
                a_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/null/0_cv/{wildcards.ncases_A}_{wildcards.ncontrols_A}/chr{chrom}/block_{block}_seed_{seed}_sum_stats.tsv.gz"
                block_files.append(a_file)
                a_block_files.append(a_file)

            if (chrom, block) not in b_chrom_block_nos and (chrom, block) not in shared_chrom_block_nos:
                b_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/null/0_cv/{wildcards.ncases_B}_{wildcards.ncontrols_B}/chr{chrom}/block_{block}_seed_{seed}_sum_stats.tsv.gz"
                b_block_files.append(b_file)

                if b_file not in block_files:
                    block_files.append(b_file)

    all_files = block_files+a_block_files+b_block_files

    return all_files
import random
import re

import pandas as pd

def get_randomised_chrom_block_tuples_for_chrom_for_pair(wildcards):
    random.seed(wildcards.seed)

    chrom = int(wildcards.chr.replace('chr', ''))
    shared_chrom_block_nos = []
    shared_chrom_block_dict = {}

    if wildcards.shared_effect_blocks != 'null':
        for token in wildcards.shared_effect_blocks.split('-'):
            block_match = re.match(r'([tsmlvhi])(\d+)', token)

            if not block_match:
                raise ValueError("Invalid block format: %s" % token)

            shared_effect = effect_size_dict[block_match.group(1)]

            no_of_shared_blocks = int(block_match.group(2))

            no_of_cvs_per_block = cv_per_block_dict[shared_effect]

            no_of_shared_blocks /= no_of_cvs_per_block

            i = 0

            shared_chrom_block_dict[shared_effect] = []

            while i < max(no_of_shared_blocks, 0):
                block_no = random.choice(block_daf.query('chr == @chrom')[['block']].values.transpose().tolist()[0])

                if (chrom, block_no) not in shared_chrom_block_nos:
                    shared_chrom_block_nos.append((chrom, block_no))
                    shared_chrom_block_dict[shared_effect].append((chrom, block_no, no_of_cvs_per_block))
                    i += 1

    a_chrom_block_nos = []
    a_chrom_block_dict = {}

    if wildcards.effect_blocks_A != 'null':
        for token in wildcards.effect_blocks_A.split('-'):
            block_match_a = re.match(r'([tsmlvhi])(\d+)', token)

            if not block_match_a:
                raise ValueError("Invalid block format: %s" % token)

            effect_a = effect_size_dict[block_match_a.group(1)]

            no_of_blocks_a = int(block_match_a.group(2))

            no_of_cvs_per_block_a = cv_per_block_dict[effect_a]

            no_of_blocks_a /= no_of_cvs_per_block_a

            i = 0

            a_chrom_block_dict[effect_a] = []

            if effect_a not in shared_chrom_block_dict.keys():
                shared_chrom_block_dict[effect_a] = []

            while i < max(no_of_blocks_a-len(shared_chrom_block_dict[effect_a]), 0):
                block_no = random.choice(block_daf.query('chr == @chrom')[['block']].values.transpose().tolist()[0])

                if (chrom, block_no) not in shared_chrom_block_nos and (chrom, block_no) not in a_chrom_block_nos:
                    a_chrom_block_nos.append((chrom, block_no))
                    a_chrom_block_dict[effect_a].append((chrom, block_no, no_of_cvs_per_block_a))
                    i += 1

    b_chrom_block_nos = []
    b_chrom_block_dict = {}

    if wildcards.effect_blocks_B != 'null':
        for token in wildcards.effect_blocks_B.split('-'):
            block_match_b = re.match(r'([tismlvh])(\d+)', token)

            if not block_match_b:
                raise ValueError("Invalid block format: %s" % token)

            effect_b = effect_size_dict[block_match_b.group(1)]

            no_of_blocks_b = int(block_match_b.group(2))

            no_of_cvs_per_block_b = cv_per_block_dict[effect_b]

            no_of_blocks_b /= no_of_cvs_per_block_b

            i = 0

            b_chrom_block_dict[effect_b] = []

            if effect_b not in shared_chrom_block_dict.keys():
                shared_chrom_block_dict[effect_b] = []

            while i < max(no_of_blocks_b-len(shared_chrom_block_dict[effect_b]), 0):
                block_no = random.choice(block_daf.query('chr == @chrom')[['block']].values.transpose().tolist()[0])


                if (chrom, block_no) not in shared_chrom_block_nos and (chrom, block_no) not in a_chrom_block_nos and (chrom, block_no) not in b_chrom_block_nos:
                    b_chrom_block_nos.append((chrom, block_no))
                    b_chrom_block_dict[effect_b].append((chrom, block_no, no_of_cvs_per_block_b))
                    i += 1

    return (shared_chrom_block_nos, a_chrom_block_nos, b_chrom_block_nos, shared_chrom_block_dict, a_chrom_block_dict, b_chrom_block_dict)

def get_randomised_block_files_for_chrom_for_pair(wildcards):
    shared_chrom_block_nos, a_chrom_block_nos, b_chrom_block_nos, shared_chrom_block_dict, a_chrom_block_dict, b_chrom_block_dict = get_randomised_chrom_block_tuples_for_chrom_for_pair(wildcards)

    block_files = []

    a_block_files = []
    b_block_files = []

    chrom = int(wildcards.chr.replace('chr', ''))

    for k in shared_chrom_block_dict.keys():
        seed_label = k + '_seed'

        for v in shared_chrom_block_dict[k]:
            seed = block_daf.query('chr == @v[0] & block == @v[1]')[seed_label].values[0]

            a_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{k}/{v[2]}_cv/{wildcards.ncases_A}_{wildcards.ncontrols_A}/chr{v[0]}/block_{v[1]}_seed_{seed}_sum_stats.tsv.gz"

            b_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{k}/{v[2]}_cv/{wildcards.ncases_B}_{wildcards.ncontrols_B}/chr{v[0]}/block_{v[1]}_seed_{seed}_sum_stats.tsv.gz"

            block_files.append(a_file)

            a_block_files.append(a_file)
            b_block_files.append(b_file)

            if wildcards.ncases_A != wildcards.ncases_B or wildcards.ncontrols_A != wildcards.ncontrols_B:
                block_files.append(b_file)

    for k in a_chrom_block_dict.keys():
        seed_label = k + '_seed'

        for v in a_chrom_block_dict[k]:
            seed = block_daf.query('chr == @v[0] & block == @v[1]')[seed_label].values[0]

            a_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{k}/{v[2]}_cv/{wildcards.ncases_A}_{wildcards.ncontrols_A}/chr{v[0]}/block_{v[1]}_seed_{seed}_sum_stats.tsv.gz"

            block_files.append(a_file)
            a_block_files.append(a_file)

    for k in b_chrom_block_dict.keys():
        seed_label = k + '_seed'

        for v in b_chrom_block_dict[k]:
            seed = block_daf.query('chr == @v[0] & block == @v[1]')[seed_label].values[0]

            b_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/{k}/{v[2]}_cv/{wildcards.ncases_B}_{wildcards.ncontrols_B}/chr{v[0]}/block_{v[1]}_seed_{seed}_sum_stats.tsv.gz"

            b_block_files.append(b_file)

            if b_file not in block_files:
                block_files.append(b_file)

    # Fill in null files
    for block in block_daf.query('chr == @chrom')['block']:
        seed = block_daf.query('chr == @chrom & block == @block')['null_seed'].values[0]

        if (chrom, block) not in a_chrom_block_nos and (chrom, block) not in shared_chrom_block_nos:
            a_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/null/0_cv/{wildcards.ncases_A}_{wildcards.ncontrols_A}/chr{chrom}/block_{block}_seed_{seed}_sum_stats.tsv.gz"
            block_files.append(a_file)
            a_block_files.append(a_file)

        if (chrom, block) not in b_chrom_block_nos and (chrom, block) not in shared_chrom_block_nos:
            b_file = f"results/simgwas/simulated_sum_stats/block_sum_stats/400_reps/null/0_cv/{wildcards.ncases_B}_{wildcards.ncontrols_B}/chr{chrom}/block_{block}_seed_{seed}_sum_stats.tsv.gz"
            b_block_files.append(b_file)

            if b_file not in block_files:
                block_files.append(b_file)

    all_files = block_files+a_block_files+b_block_files

    return all_files
run:
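    # Assumption (based on get_randomised_block_files_for_chrom_for_pair above):
    # input.block_files is ordered block_files + a_block_files + b_block_files,
    # so the last no_of_blocks_in_chrom paths belong to trait B and the preceding
    # no_of_blocks_in_chrom paths to trait A, which is what the slices below rely on.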
    a_block_files = input.block_files[-(2*params.no_of_blocks_in_chrom):-params.no_of_blocks_in_chrom]
    b_block_files = input.block_files[-params.no_of_blocks_in_chrom:]

    with open(output.a_block_file, 'w') as a_out:
        a_out.writelines([f"{x}\n" for x in a_block_files])

    with open(output.b_block_file, 'w') as b_out:
        b_out.writelines([f"{x}\n" for x in b_block_files])
shell:
    """
    split --numeric-suffixes=1 -nl/{params.no_of_splits} {input.a_block_file} {params.a_file_prefix} --additional-suffix={params.suffix}
    split --numeric-suffixes=1 -nl/{params.no_of_splits} {input.b_block_file} {params.b_file_prefix} --additional-suffix={params.suffix}

    # TODO hard-coding bad
    for i in {{1..9}}; do
        mv {params.a_file_prefix}"0"$i"-of-12.txt"  {params.a_file_prefix}$i"-of-12.txt"
    done

    for i in {{1..9}}; do
        mv {params.b_file_prefix}"0"$i"-of-12.txt"  {params.b_file_prefix}$i"-of-12.txt"
    done
    """
script: "../../scripts/simgwas/combine_randomised_block_sum_stats.R"
script: "../../scripts/simgwas/gather_split_block_files.R"
script: "../../scripts/simgwas/merge_sim_sum_stats.R"
shell:
    "Rscript workflow/scripts/simgwas/prune_sim_sum_stats.R --sum_stats_file {input.sum_stats_file} --bim_file {input.bim_file} --prune_file {input.pruned_range_file} -o {output} -nt {threads}"
shell:
    "gunzip -c {input} >{output}"
shell:
    "zcat {input} | tail -n +2 | wc -l >{output}"
shell:
    "zcat {input} | tail -n +2 | wc -l >{output}"
run:
    # Probably too clever by half: all_files returned by get_randomised_block_files_for_pair
    # is ordered block_files + a_block_files + b_block_files, so the last
    # no_of_blocks_in_genome paths belong to trait B and the preceding
    # no_of_blocks_in_genome paths to trait A.
    a_block_files = input.block_files[-(2*params.no_of_blocks_in_genome):-params.no_of_blocks_in_genome]
    b_block_files = input.block_files[-params.no_of_blocks_in_genome:]

    with open(output.a_block_file, 'w') as a_out:
        a_out.writelines([f"{x}\n" for x in a_block_files])

    with open(output.b_block_file, 'w') as b_out:
        b_out.writelines([f"{x}\n" for x in b_block_files])
shell:
    """
    split --numeric-suffixes=1 -nl/{params.no_of_splits} {input.a_block_file} {params.a_file_prefix} --additional-suffix={params.suffix}
    split --numeric-suffixes=1 -nl/{params.no_of_splits} {input.b_block_file} {params.b_file_prefix} --additional-suffix={params.suffix}

    # TODO hard-coding bad
    for i in {{1..9}}; do
        mv {params.a_file_prefix}"0"$i"-of-12.txt"  {params.a_file_prefix}$i"-of-12.txt"
    done

    for i in {{1..9}}; do
        mv {params.b_file_prefix}"0"$i"-of-12.txt"  {params.b_file_prefix}$i"-of-12.txt"
    done
    """
script: "../../scripts/simgwas/combine_randomised_block_sum_stats.R"
script: "../../scripts/simgwas/gather_split_block_files.R"
script: "../../scripts/simgwas/merge_sim_sum_stats.R"
shell:
    "Rscript workflow/scripts/simgwas/prune_sim_sum_stats.R --sum_stats_file {input.sum_stats_file} --bim_file {input.bim_file} --prune_file {input.pruned_range_file} -o {output} -nt {threads}"
shell:
    "gunzip -c {input} >{output}"
script: "../scripts/write_out_summary_statistics_per_chromosome.R"
shell:
    "zcat {input} | tail -n +2 | wc -l >{output}"
shell:
    "zcat {input} | tail -n +2 | wc -l >{output}"
shell:
    """
    zcat {input} | awk -F' ' '($9 >= 0.01 && $9 <= 0.99) || NR == 1' >{params};
    gzip {params}
    """
shell:
    """
    paste -d' ' <(zcat {input.hap_file}) <(zcat {input.legend_file} | cut -d' ' -f9 | tail -n +2)  >{params.temp_hap_with_maf_file}

    awk -F' ' '$5009 >= 0.01 && $5009 <= 0.99' {params.temp_hap_with_maf_file} | cut -d' ' -f1-5008  >>{params.temp_hap_filtered_maf_file}

    for j in {{1..4}}; do
        for i in $(tail -n +2 {input.samples_file} | cut -f"$j" -d' ' | tr '\n' ' '); do echo -n "$i $i "; done >>{params.temp_hap_file}
    echo "" >>{params.temp_hap_file};
    done

    cat {params.temp_hap_filtered_maf_file} >>{params.temp_hap_file}

    head -n 3 {params.temp_hap_file} | tail -n 1 | awk -F' ' '{{for(i=1; i<=NF; i++){{ if($i == "EUR") {{printf "%s ", i}} }}}}'| sed 's/ /,/g' | sed 's/,$//g' > {params.temp_eur_cols_file}

    cut -d' ' -f$(cat {params.temp_eur_cols_file}) {params.temp_hap_file} >{params.uncomp_hap_file}; gzip {params.uncomp_hap_file};

    rm {params.temp_eur_cols_file} {params.temp_hap_file} {params.temp_hap_filtered_maf_file} {params.temp_hap_with_maf_file}
    """
shell:
    "Rscript workflow/scripts/simgwas/write_out_ld_block_files.R --hap_file {input.haplotype_file} --leg_file {input.legend_file} -b {input.block_file} --chr_no {params.chr_no} -o {params.output_root} -nt {threads}"
shell:
    "Rscript workflow/scripts/simgwas/compute_block_ld_matrix.R --hap_file {input.block_haplotype_file} --leg_file {input.block_legend_file} --output_file {output} -nt {threads}"
script:
    "../../scripts/simgwas/simulate_sum_stats_by_ld_block.R"
script:
    "../../scripts/simgwas/get_causal_variants_by_ld_block.R"
run:
    for i,x in enumerate(input):
        if i == 0:
            shell("cat %s > %s" % (x, output[0]))
        else:
            shell("cat %s | tail -n +2  >> %s" % (x, output[0]))
run:
    header_string = "position\ta0\ta1\tid\tTYPE\tEUR\tchr"

    shell(f"echo -e \"{header_string}\" > {params.uncomp_output}")

    for x in input:
        print(x)
        shell(f"zcat {x} | cut -f1-4,6-7,92 >> {params.uncomp_output}")

    # Compress once, after all inputs have been appended
    shell("gzip {params.uncomp_output}")
script:
    "../../scripts/process_combined_simgwas_sum_stats_for_sumher.R"
shell:
    """
    $ldakRoot/ldak --sum-cors {params.output_stem} --tagfile {input.tagging_file} --summary {input.sum_stats_file_A} --summary2 {input.sum_stats_file_B} --allow-ambiguous YES --check-sums NO --cutoff 0.01 > {log.log_file}
    """
shell:
    """
    $ldakRoot/ldak --thin {params.output_stem} --bfile {params.input_stem} --window-prune .98 --window-kb 100 --max-threads {threads} > {log.log_file};
    awk < {output.thin_file} '{{print $1, 1}}' > {output.weights_file}
    """
shell:
    "$ldakRoot/ldak --calc-tagging {params.output_stem} --bfile {params.input_stem} --weights {input.weights_file} --chr {wildcards.chr} --window-kb 1000 --power -.25 --max-threads {threads} > {log.log_file}"
shell:
    """
    for x in {input}; do
        echo $x >> {output.chrom_taggings_file}
    done;

    $ldakRoot/ldak --join-tagging {params.output_stem} --taglist {output.chrom_taggings_file} > {log.log_file}
    """
script:
    "../../scripts/process_combined_simgwas_sum_stats_for_sumher.R"
shell:
    """
    $ldakRoot/ldak --sum-cors {params.output_stem} --tagfile {input.wg_tagging_file} --summary {input.sum_stats_file_A} --summary2 {input.sum_stats_file_B} --allow-ambiguous YES --check-sums NO --cutoff 0.01 > {log.log_file}
    """
script:
    "../../scripts/process_ukbb_sum_stats.R"
shell:
    """
    $ldakRoot/ldak --sum-cors {params.output_stem} --tagfile {input.wg_tagging_file} --summary {input.sum_stats_file_A} --summary2 {input.sum_stats_file_B} --allow-ambiguous YES --check-sums NO --cutoff 0.01 > {log.log_file}
    """
script:
    "../../scripts/ukbb/compute_hoeffdings.R"
run:
    manifest = pd.read_csv(input.manifest, sep = '\t', header = 0)
    manifest = manifest.query('Sex == \'both_sexes\'')
    manifest = manifest.assign(field = manifest['UK Biobank Data Showcase Link'].str.split(r"field\.cgi\?id=", expand = True)[[1]])

    trait_metadata = pd.read_csv(input.trait_metadata, sep = '\t', header = 0)

    merged = manifest.merge(trait_metadata, left_on = 'Phenotype Code', right_on = 'code')

    merged.to_csv(output[0], sep = '\t', index = False)
shell: """
    cut -f6019-6023,6762-6795,10124,15054-15133 -d$'\t' {input} >{output}
"""
script: "../../scripts/recode_ukbb_fields.R"
run:
    a_cases = int(shell("tail -n +2 {input} | grep -c {params.trait_A_token}", read = True))
    b_cases = int(shell("tail -n +2 {input} | grep -c {params.trait_B_token}", read = True))
    ab_cases = int(shell("tail -n +2 {input} | grep {params.trait_A_token} | grep -c {params.trait_B_token}", read = True))
    a_controls = int(shell("tail -n +2 {input} | grep -v -c {params.trait_A_token}", read = True))
    b_controls = int(shell("tail -n +2 {input} | grep -v -c {params.trait_B_token}", read = True))
    ab_controls = int(shell("tail -n +2 {input} | grep -v {params.trait_A_token} | grep -v -c {params.trait_B_token}", read = True))

    with open(output[0], 'w') as out:
        out.write(f"{wildcards.trait_A}\t{wildcards.trait_B}\t{a_cases}\t{b_cases}\t{ab_cases}\t{a_controls}\t{b_controls}\t{ab_controls}\n")
shell:"""
    echo -e "trait_A\ttrait_B\tA_cases\tB_cases\tAB_cases\tA_controls\tB_controls\tAB_controls" >>{output}
    cat {input} >>{output}
"""
script: "../../scripts/ukbb/annotate_overlap_results.R"
import re

import pandas as pd
from numpy import nan
from scipy.stats import chi2

def compile_ukbb_gps_results_into_daf(input_files):
    d = []

    for x in input_files:
        m = re.match(r"results/gps/(?P<snp_set>\w+)/(?P<variant_set>\w+)/window_(?P<window>\d+kb)_step_(?P<step>\d+)_r2_(?P<r2>0_\d+)/(?P<trait_A>\w+)-(?P<trait_B>\w+)_(?P<draws>\d+)_permutations_gps_pvalue.tsv", x)

        try:
            with open(x, 'r') as infile:
                lines = [x.strip() for x in infile.readlines()]

            gps, n, loc, loc_sd, scale, scale_sd, shape, shape_sd, pval = lines[1].split('\t')

            d.append(
                {
                    'snp_set' : m.group('snp_set'),
                    'variant_set' : m.group('variant_set'),
                    'window' : m.group('window'),
                    'step' : m.group('step'),
                    'r2' : m.group('r2').replace('_', '.'),
                    'trait_A' : m.group('trait_A'),
                    'trait_B' : m.group('trait_B'),
                    'draws' : m.group('draws'),
                    'gps' : gps,
                    'n' : n,
                    'loc' : loc,
                    'loc.sd' : loc_sd,
                    'scale' : scale,
                    'scale.sd' : scale_sd,
                    'shape' : shape,
                    'shape.sd' : shape_sd,
                    'pval' : pval,
                }
            )
        except FileNotFoundError:
            continue

    return pd.DataFrame(d)

def compile_ukbb_li_gps_results_into_daf(input_files):
    d = []

    for x in input_files:
        m = re.match(r"results/gps/(?P<snp_set>\w+)/(?P<variant_set>\w+)/window_(?P<window>\d+kb)_step_(?P<step>\d+)_r2_(?P<r2>0_\d+)/(?P<trait_A>\w+)-(?P<trait_B>\w+)_li_gps_pvalue.tsv", x)

        try:
            with open(x, 'r') as infile:
                lines = [x.strip() for x in infile.readlines()]

            gps, pval = lines[1].split('\t')

            d.append(
                {
                    'snp_set' : m.group('snp_set'),
                    'variant_set' : m.group('variant_set'),
                    'window' : m.group('window'),
                    'step' : m.group('step'),
                    'r2' : m.group('r2').replace('_', '.'),
                    'trait_A' : m.group('trait_A'),
                    'trait_B' : m.group('trait_B'),
                    'gps' : gps,
                    'pval' : pval
                }
            )
        except FileNotFoundError:
            continue

    return pd.DataFrame(d)

def compile_ukbb_ldsc_results_into_daf(input_files):
    d = []

    h2_regex = r"Total \w+ scale h2: (.+)\s+\((.+)\)"
    int_regex = r"Intercept: (.+)\s+\((.+)\)"

    gcov_regex = r"Total \w+ scale gencov: (.+)\s+\((.+)\)"
    gcov_zprod_regex = r"Mean z1\*z2: (.+)"

    for x in input_files:

        m = re.match(r"results/ldsc/rg/ukbb/(?P<snp_set>\w+)/fixed_h2_free_rg_intercept/(?P<trait_A>\w+)-(?P<trait_B>\w+)\.log", x)

        try:
            with open(x, 'r') as infile:
                line = infile.readline()

                # TODO fix these for the null case
                while re.match(h2_regex, line) is None and re.match('ERROR', line) is None:
                    line = infile.readline()

                if re.match('ERROR', line):
                    d.append(
                        {
                            'trait_A' : m.group('trait_A'),
                            'trait_B' : m.group('trait_B'),
                            'snp_set' : m.group('snp_set'),
                            'h2.A.obs.ldsc' : nan,
                            'h2.A.obs.se.ldsc' : nan,
                            'h2.B.obs.ldsc' : nan,
                            'h2.B.obs.se.ldsc' : nan,
                            'gcov.obs.ldsc' : nan,
                            'gcov.obs.se.ldsc' : nan,
                            'rg.ldsc' : nan,
                            'rg.se.ldsc' : nan,
                            'rg.z.ldsc' : nan,
                            'rg.p.ldsc' : nan
                        }
                    )

                else:
                    h2_match_A = re.match(h2_regex, line)
                    h2_A = float(h2_match_A.group(1))
                    h2_A_se = float(h2_match_A.group(2))

                    line = infile.readline()
                    line = infile.readline()
                    line = infile.readline()

                    h2_int_A_match = re.match(int_regex, line)

                    if h2_int_A_match:
                        h2_int_A = float(h2_int_A_match.group(1))
                        h2_int_A_se = float(h2_int_A_match.group(2))
                    elif 'constrained to 1.' in line:
                        h2_int_A = 1.0
                        h2_int_A_se = nan
                    else:
                        raise Exception("No match for h2_A int_regex")

                    while re.match(h2_regex, line) is None:
                        line = infile.readline()

                    h2_match_B = re.match(h2_regex, line)
                    h2_B = float(h2_match_B.group(1))
                    h2_B_se = float(h2_match_B.group(2))

                    line = infile.readline()
                    line = infile.readline()
                    line = infile.readline()

                    h2_int_B_match = re.match(int_regex, line)

                    if h2_int_B_match:
                        h2_int_B = float(h2_int_B_match.group(1))
                        h2_int_B_se = float(h2_int_B_match.group(2))
                    elif 'constrained to 1.' in line:
                        h2_int_B = 1.0
                        h2_int_B_se = nan
                    else:
                        raise Exception("No match for h2_B int_regex")

                    while re.match(gcov_regex, line) is None:
                        line = infile.readline()

                    gcov_match = re.match(gcov_regex, line)
                    gcov = float(gcov_match.group(1))
                    gcov_se = float(gcov_match.group(2))

                    line = infile.readline()

                    gcov_zprod_match = re.match(gcov_zprod_regex, line)
                    gcov_zprod = float(gcov_zprod_match.group(1))

                    line = infile.readline()

                    gcov_int_match = re.match(int_regex, line)

                    if gcov_int_match:
                        gcov_int = float(gcov_int_match.group(1))
                        gcov_int_se = float(gcov_int_match.group(2))
                    elif 'constrained to 0.' in line:
                        gcov_int = 0.0
                        gcov_int_se = nan
                    else:
                        raise Exception("No match for gcov_int_regex")

                    line = infile.readline()

                    while re.match(r"^p1\s", line) is None:
                        line = infile.readline()

                    line = infile.readline()

                    rg, rg_se, rg_z, rg_p = [float(z) if z != 'NA' else nan for z in line.split()[2:6]]

                    d.append(
                        {
                            'trait_A' : m.group('trait_A'),
                            'trait_B' : m.group('trait_B'),
                            'snp_set' : m.group('snp_set'),
                            'h2.A.obs.ldsc' : h2_A,
                            'h2.A.obs.se.ldsc' : h2_A_se,
                            'h2.B.obs.ldsc' : h2_B,
                            'h2.B.obs.se.ldsc' : h2_B_se,
                            'gcov.obs.ldsc' : gcov,
                            'gcov.obs.se.ldsc' : gcov_se,
                            'rg.ldsc' : rg,
                            'rg.se.ldsc' : rg_se,
                            'rg.z.ldsc' : rg_z,
                            'rg.p.ldsc' : rg_p
                        }
                    )
        except FileNotFoundError:
            continue
        except AttributeError:
            print(x)

    return pd.DataFrame(d)

def compile_ukbb_sumher_results_into_daf(input_files):
    d = []

    for x in input_files:
        m = re.match(r"results/ldak/ldak-thin/(?P<snp_set>\w+)/rg/(?P<trait_A>\w+)-(?P<trait_B>\w+).cors.full", x)

        with open(x, 'r') as infile:
            line = infile.readline()
            line = infile.readline()

        # Category Trait1_Her SD Trait2_Her SD Both_Coher SD Correlation SD
        _, h2_A, h2_A_se, h2_B, h2_B_se, cov, cov_se, rg, rg_se = line.split()

        rg_z = float(rg)/float(rg_se)

        rg_p = chi2.sf(rg_z**2, df = 1, loc = 0, scale = 1)

        d.append(
            {
                'trait_A' : m.group('trait_A'),
                'trait_B' : m.group('trait_B'),
                'snp_set' : m.group('snp_set'),
                'h2.A.obs.sr' : float(h2_A),
                'h2.A.obs.se.sr' : float(h2_A_se),
                'h2.B.obs.sr' : float(h2_B),
                'h2.B.obs.se.sr' : float(h2_B_se),
                'gcov.obs.sr' : float(cov),
                'gcov.obs.se.sr' : float(cov_se),
                'rg.sr' : float(rg),
                'rg.se.sr' : float(rg_se),
                'rg.z.sr' : rg_z,
                'rg.p.sr' : rg_p
            }
        )

    return pd.DataFrame(d)

def compile_ukbb_hoeffdings_results_into_daf(input_files):
    d = []

    for x in input_files:
        m = re.match(r"results/(?P<snp_set>\w+)/(?P<variant_set>\w+)/window_(?P<window>\d+kb)_step_(?P<step>\d+)_r2_(?P<r2>0_\d+)/(?P<trait_A>\w+)-(?P<trait_B>\w+)_hoeffdings.tsv", x)

        with open(x, 'r') as infile:
            line = infile.readline()
            line = infile.readline()

        trait_A, trait_B, n, Dn, scaled, pvalue = line.split()

        d.append(
            {
                'trait_A' : m.group('trait_A'),
                'trait_B' : m.group('trait_B'),
                'snp_set' : m.group('snp_set'),
                'variant_set' : m.group('variant_set'),
                'window' : m.group('window'),
                'step' : m.group('step'),
                'r2' : m.group('r2'),
                'pval': pvalue
            }
        )

    return pd.DataFrame(d)
run:
    daf = compile_ukbb_gps_results_into_daf(input)
    daf.to_csv(output[0], sep = '\t', index = False)
run:
    daf = compile_ukbb_li_gps_results_into_daf(input)
    daf.to_csv(output[0], sep = '\t', index = False)
run:
    daf = compile_ukbb_ldsc_results_into_daf(input)
    daf.to_csv(output[0], sep = '\t', index = False)
run:
    daf = compile_ukbb_sumher_results_into_daf(input)
    daf.to_csv(output[0], sep = '\t', index = False)
run:
    daf = compile_ukbb_hoeffdings_results_into_daf(input)
    daf.to_csv(output[0], sep = '\t', index = False)
shell:
    """
    wget https://broad-ukb-sumstats-us-east-1.s3.amazonaws.com/round2/additive-tsvs/{wildcards.id}.gwas.imputed_v3.both_sexes.tsv.bgz -O resources/ukbb_sum_stats/{wildcards.id}.gwas.imputed_v3.both_sexes.tsv.bgz
    """
SnakeMake From line 7 of ukbb/ukbb.smk
shell:
    """
    gunzip -c {input} >{output.decomp_file} && touch {output.flag_file}
    """
SnakeMake From line 21 of ukbb/ukbb.smk
script: "../../scripts/ukbb/merge_ukbb_sum_stats.R"
SnakeMake From line 38 of ukbb/ukbb.smk
script: "../../scripts/ukbb/prune_merged_sum_stats.R"
SnakeMake From line 51 of ukbb/ukbb.smk
script: "../../scripts/ukbb/remove_mhc_from_pruned_merged_sum_stats.R"
SnakeMake From line 62 of ukbb/ukbb.smk
script:
 "../../scripts/ukbb/downsample_sum_stats.R"
SnakeMake From line 71 of ukbb/ukbb.smk
shell:
    "tail -n +2 {input} | wc -l >{output}"
SnakeMake From line 79 of ukbb/ukbb.smk
shell:
    "tail -n +2 {input} | wc -l >{output}"
SnakeMake From line 87 of ukbb/ukbb.smk
panel <- read.table(snakemake@input[['panel_file']], header = T)
ped <- read.table(snakemake@input[['ped_file']], header = T, sep = '\t')

merged <- merge(panel, ped[c('Individual.ID', 'Paternal.ID', 'Maternal.ID')], by.x = 'sample', by.y = 'Individual.ID', all.x = T)

# Get unrelated European samples
euro <- subset(merged, super_pop == 'EUR' & Paternal.ID == 0 & Maternal.ID == 0)

euro <- euro[c('sample', 'sample', 'Paternal.ID', 'Maternal.ID', 'gender')]

names(euro) <- c('SampleID', 'SampleID', 'FatherID', 'MotherID', 'Sex')

# plink2's --keep accepts a file with one sample ID per line, so write only the sample IDs
write.table(euro[, c('SampleID')], file = snakemake@output[[1]], sep = ' ', col.names = F, row.names = F, quote = F)
library(data.table)
setDTthreads(snakemake@threads)

bim <- fread(snakemake@input[['bim']], sep = '\t', header = F, col.names = c('chr', 'ID', 'Cm', 'bp', 'A1', 'A2'))

if(snakemake@wildcards[['snp_set']] == 'ukbb_sans_mhc') {
  bim <- bim[!(chr == 6 & bp %between% c(24e6, 45e6))]
}

if(snakemake@wildcards[['snp_set']] == 'ukbb_sans_mhc' | snakemake@wildcards[['snp_set']] == 'ukbb') {
  ukbb <- fread(snakemake@input[['ukbb']], sep = '\t', header = T, select = 'variant')

  bim[, variant_12 := paste(chr, bp, A1, A2, sep = ':')]
  bim[, variant_21 := paste(chr, bp, A2, A1, sep = ':')]

  bim <- bim[variant_12 %in% ukbb$variant | variant_21 %in% ukbb$variant]
}

fwrite(bim, sep = '\t', col.names = F, file = snakemake@output[[1]])
daf <- read.table(snakemake@input[[1]], sep = '\t', header = T)

out_daf <- data.frame(gps = daf$GPS, pval = pexp(daf$GPS^-2))

write.table(out_daf, file = snakemake@output[[1]], sep = '\t', row.names = F, quote = F)
library(data.table)
library(evd)
library(fitdistrplus)

gps_dat <- fread(snakemake@input[['gps_file']], sep = '\t', header = T)

gps <- gps_dat[Trait_A == snakemake@params[['trait_A']] & Trait_B == snakemake@params[['trait_B']], GPS]

perm_dat <- fread(snakemake@input[['perm_file']], sep = '\t', header = T)

if(perm_dat[is.infinite(GPS), .N] > 0) {
  stop(sprintf("%d infinite-valued permuted GPS realisations", perm_dat[is.infinite(GPS), .N]))
}

fgev.fit <- tryCatch(
   fgev(perm_dat$GPS),
  error = function(c) {
    msg <- conditionMessage(c)
    if(msg == "observed information matrix is singular; use std.err = FALSE"){
      fgev(perm_dat$GPS, std.err = F)
    } else {
      stop(msg)
      }
    }
)

fgev.fitdist <- fitdist(perm_dat$GPS, 'gev', start = list(loc = fgev.fit$estimate[['loc']], scale = fgev.fit$estimate[['scale']], shape = fgev.fit$estimate[['shape']]))

loc <- fgev.fitdist$estimate[['loc']]
loc.sd <- fgev.fitdist$sd[['loc']]
scale <- fgev.fitdist$estimate[['scale']]
scale.sd <- fgev.fitdist$sd[['scale']]
shape <- fgev.fitdist$estimate[['shape']]
shape.sd <- fgev.fitdist$sd[['shape']]

pvalue <- pgev(gps, loc = loc, scale = scale, shape = shape, lower.tail = F)

fwrite(data.table(gps = gps, n = nrow(perm_dat), loc = loc, loc.sd = loc.sd, scale = scale, scale.sd = scale.sd, shape = shape, shape.sd = shape.sd, pval = pvalue), sep = '\t', file = snakemake@output[[1]])
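For reference, a sketch of what the p-value line above evaluates: with loc, scale and shape written as $\mu$, $\sigma$ and $\xi$ (the values fitted by fitdist to the permuted GPS realisations), pgev with lower.tail = F gives the upper tail of the fitted generalised extreme value distribution,

$$p \;=\; 1 - G(\mathrm{GPS}), \qquad G(x) \;=\; \exp\!\left\{-\Bigl[1 + \xi\,\tfrac{x-\mu}{\sigma}\Bigr]^{-1/\xi}\right\}.$$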
library(data.table)

setDTthreads(snakemake@threads)

intermediates_dat <- fread(snakemake@input[['intermediates_file']], sep = '\t', header = T)
sum_stats_dat <- fread(snakemake@input[['sum_stats_file']], sep = '\t', header = T, select = 'variant')

sum_stats_dat[, c('CHR19', 'BP19', 'ALT', 'REF') := tstrsplit(variant, split = ':', keep = 1:4)]

if(sum_stats_dat[, .N] != intermediates_dat[, .N]) stop("No. of rows in sum stats and intermediate output files differs")

out_dat <- cbind(sum_stats_dat, intermediates_dat)

fwrite(out_dat, file = snakemake@output[[1]], sep = '\t', col.names = T)
library(data.table)
library(magrittr)
library(stringr)

setDTthreads(snakemake@threads)

for(i in seq_along(snakemake@input[['annot_files']])) {
  base <- basename(snakemake@input[['annot_files']][i])

  str_replace(base, pattern = '_intermediates_annot.tsv', replacement = '')[[1]] %>%
    str_split(., pattern = '-') %>%
    unlist -> trait_pair

  dat <- fread(snakemake@input[['annot_files']][i], sep = '\t', header = T)

  dat <- dat[order(maximand, decreasing = T)][1]

  dat[, `:=` (trait_A = trait_pair[[1]], trait_B = trait_pair[[2]])]

  pval_dat <- fread(snakemake@input[['pvalue_files']][i], sep = '\t', header = T)

  if(i == 1) {
    out_dat <- cbind(dat, pval_dat[, .(gps, pval)])
  } else {
    out_dat <- rbind(out_dat, cbind(dat, pval_dat[, .(gps, pval)]))
  }
}

fwrite(out_dat, file = snakemake@output[[1]], sep = '\t', col.names = T)
library(data.table)
library(ggplot2)
library(patchwork)
library(Hmisc)
library(scales)
library(mvtnorm)

theme_set(
  theme_bw()+
  theme(
                                        #text = element_text(family = 'LM Roman 10'),
    axis.title = element_text(size = 12),
    plot.title = element_text(hjust = 0.5, size = 18),
    strip.text = element_text(size = 10),
    axis.text.x = element_text(size = 8, angle = 90, color = "black"),
    axis.text.y = element_text(size = 8, color = "black"),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 10),
    plot.tag = element_text(size = 10),
    strip.background = element_rect(fill = 'white')
  )
)

stat_palette <- hue_pal()(5)
gps_col <- stat_palette[1]
li_gps_col <- stat_palette[5]

test_stats <- fread(snakemake@input[['test_statistics']], sep = '\t', header = T)

t1e_dat <- test_stats[, .(true = sum(pvalue <= 0.05), false = sum(pvalue > 0.05)), by = .(rho, zmean, zsd, stat)]

t1e_dat <- t1e_dat[, .(rho, zmean, zsd, stat, true, n = true+false, binconf(x = true, n = true+false, alpha = 0.05, method = 'wilson', return.df = T))]

t1e_dat[, propsig := pnorm(qnorm(5e-8/2), mean = -zmean, sd = sqrt(1+zsd^2))]
t1e_dat[, nsig := pnorm(qnorm(5e-8/2), mean = -zmean, sd = sqrt(1+zsd^2))*400]

#pval_dats <- lapply(snakemake@input[['pvalue_files']], fread, sep = '\t', header = T, col.names = c('p.1', 'p.2'))
#
#zmeans <- c(1, 2, 3, 1, 2, 3)
#zsds <- c(1, 1, 1, 2, 2, 2)
#
#for(i in 1:6) {
#  pval_dats[[i]][, `:=` (zmean = zmeans[i], zsd = zsds[i])]
#  pval_dats[[i]][1:40000, `:=` (null.1 = F, null.2 = F)]
#  pval_dats[[i]][40001:40400, `:=` (null.1 = T, null.2 = F)]
#  pval_dats[[i]][40401:40800, `:=` (null.1 = F, null.2 = T)]
#}
#
#pval_dat <- rbindlist(pval_dats)

simp=function(N00, N01, N10, rho, zmean=ZM, zsd=ZS) {
  S=matrix(c(1, rho, rho, 1), 2, 2)
  Z=rmvnorm(N00+N01+N10, sigma=S)
  if(N01>0) 
    Z[ N00+(1:N01), 2]= Z[ N00+(1:N01), 2] + rnorm(N01,zmean,sd=zsd)
  if(N10>0) 
    Z[ N00+N01+(1:N10), 1]= Z[ N00+N01+(1:N10), 1] + rnorm(N10,zmean,sd=zsd)
  P=2 * pnorm(-abs(Z))
  if(which.max(P[,1])==which.max(P[,2])) { # catch rare complication: resimulate if the top SNP coincides in both datasets
    return(simp(N00, N01, N10, rho=rho, zmean=zmean, zsd=zsd))
  }
  P 
}

p11=simp(40000, 400, 400, rho=0, zmean=1, zsd=1)[,2]

dt=data.table(p=p11, lp=-log10(p11), assoc=rep(c("null","assoc"), times=c(40400, 400)))

lp=function(zmean, zsd, w=400/41200) {
  x=seq(-10,0,by=0.1)
  y0=2*dnorm(x, mean=0, sd=1) #* (1-w)
  y1=2*dnorm(x, mean=-zmean, sd=sqrt(zsd^2 + 1)) #* w
  lp=-log10(pnorm(x)*2)
  data.table(z=x, prob0=y0, prob1=y1, prob=y0+y1, lp=lp, zmean=zmean, zsd=zsd)
}

dt <- melt(rbind(lp(1,1), lp(1,2), lp(2,1), lp(2,2), lp(3,1), lp(3,2)), measure.vars=c("prob0","prob1"), variable.name="associated")

dt[,associated:=c("null","associated")[associated]]

pl1 <- ggplot(dt[zmean > 0]) +
  geom_path(aes(x=lp,y=value,col=associated)) +
  facet_grid(zmean ~ zsd, labeller=label_both) +
  labs(x="-log10 p-value", y="Density") +
  scale_colour_manual(name = '', values=c(null="grey20",associated="slateblue")) +
  geom_vline(xintercept=-log10(5e-8), linetype="dashed", colour="black") +
  geom_text(aes(label=paste0(100*round(propsig,3),"%")),x=15,y=0.4, data = t1e_dat[zmean > 0], col="slateblue") +
  theme(legend.position="bottom")

pl2 <- ggplot(data = t1e_dat[zmean > 0], aes(x = rho, y = PointEst, ymin = Lower, ymax = Upper, col = stat, group = stat)) +
  geom_pointrange(size = 0.05)+
  geom_path()+
  geom_hline(yintercept = 0.05, linetype = 2) +
  scale_colour_discrete("Method")+
  labs(x="between dataset correlation, rho", y="estimated type 1 error rate") +
  facet_grid(zmean ~ zsd, labeller=label_both) +
  ylim(c(0, .375))+
  scale_colour_manual(name = '', values = c('GPS-Exp' = li_gps_col, 'GPS-GEV' = gps_col))+
  theme(legend.position="bottom")+
  ylab('Type 1 error rate')+
  xlab('Between-data set effect estimate correlation')

fig_s10 <- pl1 / pl2+
  plot_annotation(tag_levels = 'A')

ggsave(fig_s10, file = snakemake@output[[1]], width = 8, height = 10)
library(data.table)
library(argparse)
library(ggplot2)

parser <- ArgumentParser(description = 'Plot heatmap of GPS statistic denominator')
parser$add_argument('-i', '--input_file', type = 'character', help = 'Path to input file')
parser$add_argument('-o', '--output_file', type = 'character', help = 'Path to output file')

# Example invocation retained from interactive debugging (previously assigned to `args`
# and immediately overwritten by parse_args() below):
# --input_file results/gps/ukbb/all_pruned_snps/window_1000kb_step_50_r2_0_2/20002_1111-20002_1113_ecdf.tsv
# -o results/plots/all_pruned_snps/gps_heatmaps/20002_1111-20002_1113.png

args <- parser$parse_args()

dat <- fread(args$input_file, sep = '\t', header = T)

dat[ , denom := sqrt(F_u*F_v - (F_u^2)*(F_v^2))]
dat[, num := abs(F_uv - F_u*F_v)]
dat[, maximand := sqrt(.N/log(.N))*num/denom]
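For reference, the per-SNP quantity computed above can be written as follows, with $F_u$, $F_v$ the marginal ECDFs, $F_{uv}$ the bivariate ECDF and $n$ the number of SNPs; the GPS statistic reported elsewhere in the pipeline is, as far as this script's column names suggest, the maximum of this maximand over SNPs (the statistic itself is computed by computeGpsCLI, so this is a restatement rather than the defining implementation):

$$\mathrm{maximand}_i \;=\; \sqrt{\frac{n}{\log n}}\;\frac{\bigl|\,F_{uv}(u_i,v_i) - F_u(u_i)\,F_v(v_i)\,\bigr|}{\sqrt{F_u(u_i)\,F_v(v_i) - F_u(u_i)^2\,F_v(v_i)^2}}.$$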
library(data.table)
library(mvtnorm)

##' simulate correlated p-values
##'
##' @param N00 number of snps null for both datasets
##' @param N01 number of snps null for dataset 1, non-null for dataset 2
##' @param N10 number of snps null for dataset 2, non-null for dataset 1
##' @param rho correlation between z scores
##' @param zmean mean z score at non-null snps
##' @return (N00+N01+N10) x 2 matrix of z scores
##' @export 
##' @author Chris Wallace
simp=function(N00, N01, N10, rho, zmean=ZM, zsd=ZS) {
  S=matrix(c(1, rho, rho, 1), 2, 2)

  Z=rmvnorm(N00+N01+N10, sigma=S)

  if(N01>0) {
    Z[ N00+(1:N01), 2]= Z[ N00+(1:N01), 2] + rnorm(N01,zmean,sd=zsd)
  }

  if(N10>0) {
    Z[ N00+N01+(1:N10), 1]= Z[ N00+N01+(1:N10), 1] + rnorm(N10,zmean,sd=zsd)
  }

  P=2 * pnorm(-abs(Z))

  P
}

fwrite(data.table(simp(N00 = snakemake@params[['N00']], N01 = snakemake@params[['N01']], N10 = snakemake@params[['N10']], rho = snakemake@params[['rho']], zmean = snakemake@params[['zmean']], zsd = snakemake@params[['zsd']])), sep = '\t', file = snakemake@output[[1]])
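For illustration, a minimal interactive call of simp (the parameter values here are hypothetical, not those used by the pipeline):

# Hypothetical example: 40,000 SNPs null in both datasets, 400 non-null in each
# dataset alone, z-score correlation 0.2, non-null z-scores shifted by N(3, 1)
P <- simp(N00 = 40000, N01 = 400, N10 = 400, rho = 0.2, zmean = 3, zsd = 1)
dim(P)  # 40800 x 2 matrix of two-sided p-values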
library(data.table)
library(stringr)
library(magrittr)

obs_liab_trans <- function(h2.obs, P, K) {
  z_2 <- dnorm(qnorm(1-K))^2

  h2.obs * ((K*(1-K))^2/(P*(1-P)))/z_2
}

odds_ratios <- list('null' = 1,
                    'tiny' = 1.02,
                    'small' = 1.05,
                    'infinitesimal' = 1.1,
                    'medium' = 1.2,
                    'large' = 1.4,
                    'vlarge' = 2)

setDTthreads(snakemake@threads)
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3059431/
# According to the paper in which the ascertainment-corrected liability scale transformation is set out, the convention is that P is the sample prevalence and K the population prevalence
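# In symbols, obs_liab_trans() above rescales an observed-scale estimate to the
# liability scale with correction for case-control ascertainment:
#   h2_liab = h2_obs * [K(1 - K)]^2 / [P(1 - P) * z^2],  where z = dnorm(qnorm(1 - K))
# (P = sample prevalence, K = population prevalence)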
P_a <- snakemake@params[['sample_prevalence_A']]
P_b <- snakemake@params[['sample_prevalence_B']]
K_a <- snakemake@params[['population_prevalence_A']]
K_b <- snakemake@params[['population_prevalence_B']]

cv_dat <- fread(snakemake@input[['combined_causal_variants_file']], sep = '\t', header = T)

a_blocks_dat <- fread(snakemake@input[['a_block_file']], sep = '\t', header = T)
b_blocks_dat <- fread(snakemake@input[['b_block_file']], sep = '\t', header = T)

cv_dat[, geno_var := 2*EUR*(1-EUR)]

cv_dat[, c('in_a_blocks', 'in_b_blocks') := F]
cv_dat[, c('odds_ratio_a', 'odds_ratio_b') := 1]

for(i in 1:nrow(a_blocks_dat)) {
  if(a_blocks_dat[i, no_cvs] > 0) {
    if(cv_dat[a_blocks_dat[i, chr] == chr & a_blocks_dat[i, block] == block, .N] == 0) {
      print(sprintf("Missing chr%d:block%d in a file", a_blocks_dat[i, chr], a_blocks_dat[i, block]))
      } else {
        for(j in 1:a_blocks_dat[i, no_cvs]) {
        # Need to iterate over j here otherwise we'll get two cvs where we sometimes only want one
          cv_dat[a_blocks_dat[i, chr] == chr & a_blocks_dat[i, block] == block & a_blocks_dat[i, effect] != 'null'][j] <- cv_dat[a_blocks_dat[i, chr] == chr & a_blocks_dat[i, block] == block & a_blocks_dat[i, effect] != 'null'][j][, `:=` (odds_ratio_a = unlist(odds_ratios[[a_blocks_dat[i, effect]]]), in_a_blocks = T)]
        }
    }
  }
}

for(i in 1:nrow(b_blocks_dat)) {
  if(b_blocks_dat[i, no_cvs] > 0) {
    if(cv_dat[b_blocks_dat[i, chr] == chr & b_blocks_dat[i, block] == block, .N] == 0) {
      print(sprintf("Missing chr%d:block%d in a file", b_blocks_dat[i, chr], b_blocks_dat[i, block]))
    } else {
      for(j in 1:b_blocks_dat[i, no_cvs]) {
      # Need to iterate over j here otherwise we'll get two cvs where we sometimes only want one
        cv_dat[b_blocks_dat[i, chr] == chr & b_blocks_dat[i, block] == block & b_blocks_dat[i, effect] != 'null'][j] <- cv_dat[b_blocks_dat[i, chr] == chr & b_blocks_dat[i, block] == block & b_blocks_dat[i, effect] != 'null'][j][, `:=` (odds_ratio_b = unlist(odds_ratios[[b_blocks_dat[i, effect]]]), in_b_blocks = T)]
      }
    }
  }
}

if(cv_dat[in_a_blocks == T, .N] != a_blocks_dat[, sum(no_cvs)]) stop(sprintf('Missing %d causal variants from A set', a_blocks_dat[, sum(no_cvs)] - cv_dat[in_a_blocks == T, .N] ))

if(cv_dat[in_b_blocks == T, .N] != b_blocks_dat[, sum(no_cvs)]) stop(sprintf('Missing %d causal variants from B set', b_blocks_dat[, sum(no_cvs)] - cv_dat[in_b_blocks == T, .N]))

if(cv_dat[in_a_blocks == T & in_b_blocks == T, .N] != merge(a_blocks_dat, b_blocks_dat, by = c('chr', 'block'))[effect.x != 'null' & effect.y != 'null', sum(no_cvs.x)]) stop('Missing shared causal variants')

cv_dat[in_a_blocks == T, beta.A := log(odds_ratio_a)]
cv_dat[in_b_blocks == T, beta.B := log(odds_ratio_b)]

cv_dat[in_a_blocks == T, beta_2.A := beta.A^2]
cv_dat[in_b_blocks == T, beta_2.B := beta.B^2]

V_A.A <- with(cv_dat[in_a_blocks == T], sum(beta_2.A*geno_var))
V_A.B <- with(cv_dat[in_b_blocks == T], sum(beta_2.B*geno_var))

h2.theo.obs.A <- V_A.A/(P_a*(1-P_a))
h2.theo.obs.B <- V_A.B/(P_b*(1-P_b))

h2.theo.liab.A <- obs_liab_trans(h2.theo.obs.A, P = P_a, K = K_a)
h2.theo.liab.B <- obs_liab_trans(h2.theo.obs.B, P = P_b, K = K_b)

C_A.AB <- with(cv_dat[in_a_blocks == T & in_b_blocks == T], sum(beta.A * beta.B * geno_var))

r_A.AB <- C_A.AB/(sqrt(V_A.A)*sqrt(V_A.B))

res_dat <- data.table(odds_ratio.A = paste(unique(cv_dat[in_a_blocks == T, odds_ratio_a]), collapse = ','),
                      odds_ratio.B = paste(unique(cv_dat[in_b_blocks == T, odds_ratio_b]), collapse = ','),
                      no_blocks.A = cv_dat[in_a_blocks == T, .N],
                      no_blocks.B = cv_dat[in_b_blocks == T, .N],
                      no_shared_blocks = cv_dat[in_a_blocks == T & in_b_blocks == T, .N],
                      h2.theo.obs.A,
                      h2.theo.obs.B,
                      h2.theo.liab.A,
                      h2.theo.liab.B,
                      V_A.A,
                      V_A.B,
                      C_A.AB,
                      r_A.AB)

fwrite(res_dat, file = snakemake@output[['theo_rg_file']], sep = '\t', na = 'NA')
library(data.table)

setDTthreads(snakemake@resources[['threads']])

sum_stats_dat <- fread(snakemake@input[['sum_stats_file']], sep = '\t', header = T, tmpdir = snakemake@resources[['tmpdir']], select = c('variant', snakemake@params[['pval_col']], snakemake@params[['tstat_col']], snakemake@params[['n_col']]))

sum_stats_dat[, c('chr', 'bp', 'ref', 'alt') := tstrsplit(variant, split = ':')]

sum_stats_dat[, chr := as.character(chr)]
sum_stats_dat[, bp := as.character(bp)]

snp_dat <- fread(snakemake@input[['snplist_file']], sep = '\t', header = T)

snp_dat[, CHR := as.character(CHR)]
snp_dat[, BP := as.character(BP)]

merged_dat <- merge(sum_stats_dat, snp_dat, by.x = c('chr', 'bp'), by.y = c('CHR', 'BP'))

merged_dat <- merged_dat[(ref == A1  & alt == A2) | (ref == A2 & alt == A1)]

merged_dat[, c('ref', 'alt') := NULL]

setnames(merged_dat, c('chr', 'bp'), c('CHR', 'BP'))

fwrite(merged_dat, file = snakemake@output[[1]], sep = '\t')
library(magrittr)
library(data.table)

setDTthreads(snakemake@threads)

lapply(snakemake@input[['ldsc_files']], fread, sep = '\t', select = c('CHR', 'SNP', 'BP')) %>% rbindlist -> merged_dat

allele_dat <- fread(snakemake@input[['snplist_file']], sep = '\t')

merged_dat <- merge(merged_dat, allele_dat, by = 'SNP')

fwrite(merged_dat, file = snakemake@output[[1]], sep = '\t')
library(data.table)

setDTthreads(snakemake@threads)

chr_colname <- snakemake@params[['chr_colname']]
bp_colname <- snakemake@params[['bp_colname']]
a1_colname <- snakemake@params[['a1_colname']]
a2_colname <- snakemake@params[['a2_colname']]
z_colname <- snakemake@params[['z_colname']]
n <- snakemake@params[['sample_size']]

dat <- fread(snakemake@input[[1]], sep = '\t', header = T, select = c(chr_colname, bp_colname, a1_colname, a2_colname, z_colname))

setnames(dat, c(a1_colname, a2_colname, z_colname), c('A1', 'A2', 'Z'))

dat[, Predictor := paste(get(chr_colname), get(bp_colname), A2, A1, sep = ':')]

dat[, n := n]

dat[, `:=` (len.A1 = nchar(A1), len.A2 = nchar(A2))]

dat <- dat[len.A1 == 1 & len.A2 == 1]

dat <- dat[!duplicated(dat, by = 'Predictor')]

dat <- na.omit(dat)

dat <- dat[, .(Predictor, A1, A2, n, Z)]

fwrite(dat, file = snakemake@output[[1]], sep = '\t', na = 'NA', quote = F)
library(data.table)

setDTthreads(snakemake@threads)

z_colname <- snakemake@params[['z_colname']]
sample_size_colname <- snakemake@params[['sample_size_colname']]

dat <- fread(snakemake@input[[1]], sep = '\t', header = T, select = c('variant', z_colname, sample_size_colname))

# The SumHer documentation states that A1 is the 'test allele' and A2 is the 'other allele', so I take this to mean A1 is the minor allele and A2 the major allele

# chr:bp:major:minor in 'variant' column of UKBB sum stats files
dat[, c('chr', 'bp', 'A2', 'A1') := tstrsplit(variant, split = ':', keep = 1:4)]

dat <- dat[!(chr %in% c('X', 'Y', 'MT'))]

dat[, `:=` (len.A1 = nchar(A1), len.A2 = nchar(A2))]

dat <- dat[len.A1 == 1 & len.A2 == 1]

# 'Predictor' is written as chr:bp:A2:A1 to match the format used in my simgwas files,
# though it is not certain this is the convention SumHer expects
dat[, Predictor := paste(chr, bp, A2, A1, sep = ':')]
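# Hypothetical example: variant "1:12345:G:A" (chr:bp:major:minor) gives chr = "1",
# bp = "12345", A2 = "G", A1 = "A", so Predictor = "1:12345:G:A" reproduces the
# original UKBB variant string under this convention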

setnames(dat, c(z_colname, sample_size_colname), c('Z', 'n'))

dat <- dat[!duplicated(dat, by = 'Predictor')]

dat <- na.omit(dat)

dat <- dat[, .(Predictor, A1, A2, n, Z)]

fwrite(dat, file = snakemake@output[[1]], sep = '\t', na = 'NA', quote = F)
library(data.table)
setDTthreads(snakemake@threads)

dat <- fread(snakemake@input[[1]], sep = '\t', header = T)

dat[f.6148.0.0 == 2 | f.6148.0.1 == 2 | f.6148.0.2 == 2 | f.6148.0.3 == 2 | f.6148.0.4 == 2, glaucoma := "6148_2"]
dat[f.6148.0.0 == 5 | f.6148.0.1 == 5 | f.6148.0.2 == 5 | f.6148.0.3 == 5 | f.6148.0.4 == 5, md := "6148_5"]

dat[f.22126.0.0 == 0, f.22126.0.0 := NA]
dat[f.22126.0.0 == 1, f.22126.0.0 := 22126]

fwrite(dat, file = snakemake@output[[1]], sep = '\t')
library(data.table)
setDTthreads(snakemake@threads)
library(magrittr)

a_block_file <- snakemake@input[['a_block_file']]
b_block_file <- snakemake@input[['b_block_file']]

a_block_files <- scan(a_block_file, what = character())
b_block_files <- scan(b_block_file, what = character())

ncases_column_index <- 8 + (as.integer(snakemake@wildcards[['no_reps']]) * 4) + 2
ncontrols_column_index <- 8 + (as.integer(snakemake@wildcards[['no_reps']]) * 4) + 3
chr_column_index <- 8 + (as.integer(snakemake@wildcards[['no_reps']]) * 4) + 4
block_effect_column_index <- 8 + (as.integer(snakemake@wildcards[['no_reps']]) * 4) + 5

z_column_index_A <- 8 + as.integer(snakemake@wildcards[['tag_A']])
beta_column_index_A <- 8 + (as.integer(snakemake@wildcards[['no_reps']]) * 2) + as.integer(snakemake@wildcards[['tag_A']])
p_column_index_A <- 8 + (as.integer(snakemake@wildcards[['no_reps']]) * 3) + as.integer(snakemake@wildcards[['tag_A']])

a_cols <- c(1:7, z_column_index_A, beta_column_index_A, p_column_index_A, ncases_column_index, ncontrols_column_index, chr_column_index, block_effect_column_index)
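# Worked example with hypothetical wildcard values no_reps = 400 and tag_A = 1:
# z column = 9, beta column = 809, p column = 1209, ncases = 1610, ncontrols = 1611,
# chr = 1612 and block_effect = 1613, selected alongside annotation columns 1-7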

a_block_files %>%
  lapply(., fread, sep = '\t', header = F, select = a_cols) %>%
  rbindlist -> a_block_dat

fwrite(a_block_dat, file = snakemake@output[['combined_sum_stats_A']], sep = '\t', col.names = F)

rm(a_block_dat)

z_column_index_B <- 8 + as.integer(snakemake@wildcards[['tag_B']])
beta_column_index_B <- 8 + (as.integer(snakemake@wildcards[['no_reps']]) * 2) + as.integer(snakemake@wildcards[['tag_B']])
p_column_index_B <- 8 + (as.integer(snakemake@wildcards[['no_reps']]) * 3) + as.integer(snakemake@wildcards[['tag_B']])

b_cols <- c(1:7, z_column_index_B, beta_column_index_B, p_column_index_B, ncases_column_index, ncontrols_column_index, chr_column_index, block_effect_column_index)

b_block_files %>%
  lapply(., fread, sep = '\t', header = F, select = b_cols) %>%
  rbindlist -> b_block_dat

fwrite(b_block_dat, file = snakemake@output[['combined_sum_stats_B']], sep = '\t', col.names = F)
library(data.table)
library(simGWAS)
library(argparse)
library(parallel)

parser <- ArgumentParser(description = 'Computes blockwise LD matrix')
parser$add_argument('--hap_file', type = 'character', help = 'Path to haplotype file')
parser$add_argument('--leg_file', type = 'character', help = 'Path to legend file')
parser$add_argument('--output_file', type = 'character', help = 'Path to output file', required = T)
parser$add_argument('-nt', '--no_of_threads', type = 'integer', help = 'Number of threads to use', default = 1)

args <- parser$parse_args()

setDTthreads(args$no_of_threads)

leg_dat <- fread(file = args$leg_file, sep = ' ', header = T)
hap_dat <- fread(file = args$hap_file, sep = ' ', header = F)

for(j in 1:(ncol(hap_dat)-2)) {
  set(hap_dat, j = j, value = as.numeric(hap_dat[[j]]))
}

hap_mat <- as.matrix(hap_dat[,1:(ncol(hap_dat)-2)])

freq_dat <- data.table(t(hap_mat)+1)

colnames(freq_dat) <- hap_dat[[(ncol(hap_dat)-1)]]

freq_dat[, Probability := 1/.N]
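# Assumption (mirroring the haplotype format used in the simGWAS vignette): the +1
# above recodes the 0/1 haplotype alleles to the 1/2 coding simGWAS works with, and
# Probability gives each reference haplotype an equal weight of 1/.N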

rm(hap_dat, hap_mat)

ld_mat <- corpcor::make.positive.definite(simGWAS:::wcor2(as.matrix(freq_dat[,setdiff(colnames(freq_dat),"Probability"), with = F][, leg_dat$rs, with = F]), freq_dat$Probability))

save(ld_mat, file = args$output_file)
library(data.table)
library(Rcpp)
library(independence)

setDTthreads(snakemake@threads)

sourceCpp(code = '
#include <Rcpp.h>
#include <map>

using namespace Rcpp;

// [[Rcpp::export]]
NumericVector perturbDuplicates(NumericVector values) {
  std::map<double, int> freqMap;

  for(size_t i = 0; i < values.size(); ++i){
    freqMap[values[i]]++;

    if(freqMap[values[i]] > 1) {
      for(int j = 1; j < freqMap[values[i]]; ++j) {
        values[i] = values[i] + (freqMap[values[i]] * std::numeric_limits<double>::epsilon());
      }
    }
  }

  return values;
}
')

run_hoeffding <- function(dat, trait_A_code, trait_B_code) {
  # Parenthesise the LHS so := updates the column named by the variable rather than
  # creating new columns literally called "trait_A_code" and "trait_B_code"
  dat[, (trait_A_code) := perturbDuplicates(get(trait_A_code))]
  dat[, (trait_B_code) := perturbDuplicates(get(trait_B_code))]

  dat <- unique(unique(dat, by = trait_A_code), by = trait_B_code)

  hoeffding.D.test(xs = dat[[trait_A_code]], ys = dat[[trait_B_code]])
}

sum_stats_dat <- fread(snakemake@input[['sum_stats_file']], sep = '\t', header = T, select = c(snakemake@params[['a_colname']], snakemake@params[['b_colname']]))

hoeffding_res <- run_hoeffding(sum_stats_dat, snakemake@params[['a_colname']], snakemake@params[['b_colname']])

res_dat <- data.table(t(unlist(hoeffding_res[c('n', 'Dn', 'scaled', 'p.value')])))

res_dat[, `:=` (trait_A = snakemake@wildcards[['effect_blocks_A']], trait_B = snakemake@wildcards[['effect_blocks_B']])]

res_dat <- res_dat[, c('trait_A', 'trait_B', 'n', 'Dn', 'scaled', 'p.value')]

if(is.na(res_dat$p.value)) {
  stop("Failed to compute Hoeffding's test p-value")
}

fwrite(res_dat, file = snakemake@output[[1]], sep = '\t', col.names = T, row.names = F)
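A brief usage note on perturbDuplicates, assuming the sourceCpp() call above has been run: tied values are nudged apart by multiples of machine epsilon, apparently so that hoeffding.D.test is not fed exact duplicates (the unique() calls above guard against any that remain). A toy illustration only:

v <- c(0.5, 0.5, 0.5, 0.1)
perturbDuplicates(v)
anyDuplicated(perturbDuplicates(v)) == 0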
library(data.table)
setDTthreads(snakemake@threads)
library(magrittr)

z_column_name_A <- sprintf("zsim.%s", snakemake@wildcards[['tag_A']])
beta_column_name_A <- sprintf("betasim.%s", snakemake@wildcards[['tag_A']])
p_column_name_A <- sprintf("p.%s", snakemake@wildcards[['tag_A']])

header_A <- c("position", "a0", "a1", "id", "block", "TYPE", "EUR", z_column_name_A, beta_column_name_A, p_column_name_A, "ncases", "ncontrols", "chr", "block_effect_size")

snakemake@input[['a_files']] %>%
  lapply(., fread, sep = '\t', header = F) %>%
  rbindlist -> a_dat

names(a_dat) <- header_A

fwrite(a_dat, sep = '\t', file = snakemake@output[['combined_sum_stats_A']], col.names = T)

rm(a_dat)

z_column_name_B <- sprintf("zsim.%s", snakemake@wildcards[['tag_B']])
beta_column_name_B <- sprintf("betasim.%s", snakemake@wildcards[['tag_B']])
p_column_name_B <- sprintf("p.%s", snakemake@wildcards[['tag_B']])

header_B <- c("position", "a0", "a1", "id", "block", "TYPE", "EUR", z_column_name_B, beta_column_name_B, p_column_name_B, "ncases", "ncontrols", "chr", "block_effect_size")

snakemake@input[['b_files']] %>%
  lapply(., fread, sep = '\t', header = F) %>%
  rbindlist -> b_dat

names(b_dat) <- header_B

fwrite(b_dat, sep = '\t', file = snakemake@output[['combined_sum_stats_B']], col.names = T)
library(data.table)

setDTthreads(snakemake@threads)

leg_dat <- fread(file = snakemake@input[['block_legend_file']], sep = ' ', header = T)
hap_dat <- fread(file = snakemake@input[['block_haplotype_file']], sep = ' ', header = F)
bim_dat <- fread(file = snakemake@input[['bim_file']], sep = '\t', header = F, col.names = c('chr', 'bim.id', 'Cm', 'bp', 'A1', 'A2'))
load(file = snakemake@input[['ld_mat_file']])

for(j in 1:(ncol(hap_dat)-2)) {
  set(hap_dat, j = j, value = as.numeric(hap_dat[[j]]))
}

hap_mat <- as.matrix(hap_dat[,1:(ncol(hap_dat)-2)])

hap_meta_dat <- hap_dat[, (ncol(hap_dat)-1):ncol(hap_dat)]

names(hap_meta_dat) <- c('rs', 'block')

rm(hap_dat)

freq_dat <- data.table(t(hap_mat)+1)

rm(hap_mat)

colnames(freq_dat) <- hap_meta_dat$rs

# Required by the make_GenoProbList function below (at least)
freq_dat[, Probability := 1/.N]

cv_ind <- snakemake@params[['causal_variant_indices']]

result_dat <- data.table(leg_dat[cv_ind, .(id, position, block, a0, a1, TYPE, EUR)])

result_dat <- merge(result_dat, bim_dat[, .(bim.id, bp, A1, A2)], by.x = 'position', by.y = 'bp', all.x = T)

result_dat[, c("A1", "A2", "bim.id") := NULL]

result_dat[, chr := snakemake@params[['chr_no']]]

result_dat[, rsID := tstrsplit(id, split = ':', keep = 1)]

fwrite(result_dat, file = snakemake@output[[1]], sep = '\t')
library(data.table)

setDTthreads(snakemake@threads)

sum_stats_A_dat <- fread(snakemake@input[['sum_stats_file_A']], sep = '\t', header = T, select = c('id', 'chr', 'position', 'a0', 'a1', 'block', snakemake@params[['file_A_stat_cols']], 'block_effect_size'))
sum_stats_B_dat <- fread(snakemake@input[['sum_stats_file_B']], sep = '\t', header = T, select = c('id', 'chr', 'position', 'a0', 'a1', 'block', snakemake@params[['file_B_stat_cols']], 'block_effect_size'))

merged_dat <- merge(sum_stats_A_dat, sum_stats_B_dat, by = c('chr', 'position', 'a0', 'a1', 'block'), suffixes = c(".A", ".B"))

merged_dat[, id.B := NULL]

setnames(merged_dat, 'id.A', 'id')

if(is.null(snakemake@wildcards[['chr']]) & length(unique(merged_dat$chr)) != 22) {
  stop(sprintf("Incorrect number of chromosomes present in merged summary statistics: %s", length(unique(merged_dat$chr))))
} else {
  merged_dat <- unique(merged_dat, by = c('chr', 'position'))
  fwrite(merged_dat, file = snakemake@output[[1]], sep = '\t')
}
library(data.table)
library(argparse)

parser <- ArgumentParser(description = 'Prune simulated summary statistics file')
parser$add_argument('--sum_stats_file', type = 'character', help = 'Path to summary statistics file')
parser$add_argument('--bim_file', type = 'character', help = 'Path to bim file')
parser$add_argument('--prune_file', type = 'character', help = 'Path to file containing pruned IDs')
parser$add_argument('-o', '--output_path', type = 'character', help = 'Path to pruned summary statistics file', required = T)
parser$add_argument('-nt', '--no_of_threads', type = 'integer', help = 'Number of threads to use', default = 1)

args <- parser$parse_args()

setDTthreads(args$no_of_threads)

sum_stats_dat <- fread(args$sum_stats_file, sep = '\t', header = T)

pruned_rsid_dat <- fread(args$prune_file, sep = ' ', header = F, col.names = 'ID')

bim_dat <- fread(args$bim_file, sep = '\t', header = F, col.names = c('chr', 'ID', 'Cm', 'bp', 'A1', 'A2'))

pruned_rsid_dat <- merge(bim_dat, pruned_rsid_dat, by = 'ID')

rm(bim_dat)

merged_dat <- merge(sum_stats_dat, pruned_rsid_dat[ , .(chr, bp, A1, A2)], by.x = c('chr', 'position'), by.y = c('chr', 'bp'))

merged_dat <- merged_dat[(a0 == A2 & a1 == A1) | (a0 == A1 & a1 == A2)]

# Some indels are present as duplicates with flipped A1/A2 entries
merged_dat <- merged_dat[!duplicated(merged_dat, by = c('chr', 'position'))]

merged_dat[, c("A1", "A2") := NULL]

fwrite(merged_dat, file = args$output_path, sep = '\t')
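The allele filter above keeps only variants whose simulated alleles match the .bim alleles in one orientation or the other. A toy illustration (illustrative rows, not pipeline data) of which rows survive:

library(data.table)

toy <- data.table(a0 = c('A', 'A', 'A'),
                  a1 = c('G', 'G', 'G'),
                  A1 = c('G', 'A', 'T'),
                  A2 = c('A', 'G', 'C'))

# Rows 1 and 2 match in one orientation or the other; row 3 is dropped
toy[(a0 == A2 & a1 == A1) | (a0 == A1 & a1 == A2)]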
library(data.table)
library(simGWAS)
library(magrittr)

# Draws nrep simulated Z-score vectors around the expected Z-scores, using the
# block LD matrix as the covariance of the multivariate normal
simulated_z_score_par <- function(exp_z_score, ld_mat, nrep = 1, ncores = 1){
    sim_z_score <- mvnfast::rmvn(n = nrep, mu = exp_z_score, sigma = ld_mat, ncores = ncores)

    if(nrep == 1)
        return(c(sim_z_score))

    sim_z_score
}

no_reps <- snakemake@params[['no_reps']]

setDTthreads(snakemake@threads)

# Wildcards are passed as strings, so coerce the seed explicitly
set.seed(as.integer(snakemake@wildcards[['seed']]))

leg_dat <- fread(file = snakemake@input[['block_legend_file']], sep = ' ', header = T)
hap_dat <- fread(file = snakemake@input[['block_haplotype_file']], sep = ' ', header = F)
bim_dat <- fread(file = snakemake@input[['bim_file']], sep = '\t', header = F, col.names = c('chr', 'bim.id', 'Cm', 'bp', 'A1', 'A2'))
load(file = snakemake@input[['ld_mat_file']])

log_odds_ratio <- log(snakemake@params[['odds_ratio']])

for(j in 1:(ncol(hap_dat)-2)) {
  set(hap_dat, j = j, value = as.numeric(hap_dat[[j]]))
}

hap_mat <- as.matrix(hap_dat[,1:(ncol(hap_dat)-2)])

hap_meta_dat <- hap_dat[, (ncol(hap_dat)-1):ncol(hap_dat)]

names(hap_meta_dat) <- c('rs', 'block')

rm(hap_dat)

freq_dat <- data.table(t(hap_mat)+1)

rm(hap_mat)

colnames(freq_dat) <- hap_meta_dat$rs

# Required by the make_GenoProbList function below (at least)
freq_dat[, Probability := 1/.N]

chosen_snps <- colnames(ld_mat)

cv_ind <- sample(1:length(chosen_snps), size = as.integer(snakemake@wildcards[['no_cvariants']]))

cv_snps <- chosen_snps[cv_ind]

sub_freq_dat <- freq_dat[, c(colnames(ld_mat), 'Probability'), with = F]

geno_probs <- make_GenoProbList(snps = colnames(ld_mat),
                                W = colnames(ld_mat)[cv_ind],
                                freq = sub_freq_dat)

zexp <- expected_z_score(N0 = snakemake@params[['no_controls']],
                          N1 = snakemake@params[['no_cases']],
                          snps = chosen_snps,
                          W = cv_snps,
                          gamma.W = rep(log_odds_ratio, length(cv_snps)),
                          freq = sub_freq_dat,
                          GenoProbList = geno_probs)

zsim <- simulated_z_score_par(exp_z_score = zexp, ld_mat = ld_mat, nrep = no_reps, ncores = snakemake@threads)

# Pass N0 and N1 as numerics: the vbetasim and betasim computations can overflow if they are supplied as integers
vbetasim <- simulated_vbeta(N0 = as.numeric(snakemake@params[['no_controls']]),
                            N1 = as.numeric(snakemake@params[['no_cases']]),
                            snps = chosen_snps,
                            W = cv_snps,
                            gamma.W = rep(log_odds_ratio, length(cv_snps)),
                            freq = sub_freq_dat,
                            nrep = no_reps)

betasim <- zsim * sqrt(vbetasim)

if(no_reps == 1 ) {
  result_dat <- data.table(leg_dat[rs %in% chosen_snps, .(id, position, block, a0, a1, TYPE, EUR)], zexp, zsim, vbetasim = t(vbetasim), betasim = t(betasim))
} else {
  # With multiple reps, t() turns each simulated quantity into one column per replicate
  result_dat <- data.table(leg_dat[rs %in% chosen_snps, .(id, position, block, a0, a1, TYPE, EUR)], zexp, t(zsim), t(vbetasim), t(betasim))
}

names(result_dat) <- c('id', 'position', 'block', 'a0', 'a1', 'TYPE', 'EUR', 'zexp',
                       paste0('zsim.', 1:no_reps),
                       paste0('vbetasim.', 1:no_reps),
                       paste0('betasim.', 1:no_reps))

for(j in 1:no_reps) {
  result_dat[, c(paste0('p.', j)) := 2*pnorm(abs(get(paste0('zsim.', j))), lower.tail = F)]
}

result_dat[, or := 1]
result_dat[cv_ind, or := exp(log_odds_ratio)]
setnames(result_dat, 'or', 'chosen_or')

result_dat[, `:=` (ncases = snakemake@params[['no_cases']], ncontrols = snakemake@params[['no_controls']])]

# This merging step confines our SNPs to those present in the bim file; we keep the same order of a0 and a1 alleles, even if this differs in the bim file
result_dat <- merge(result_dat, bim_dat[, .(bim.id, bp, A1, A2)], by.x = 'position', by.y = 'bp', all.x = T)

result_dat <- result_dat[(a0 == A2 & a1 == A1) | (a0 == A1 & a1 == A2)]

result_dat[, c("A1", "A2", "bim.id") := NULL]

result_dat[, chr := snakemake@params[['chr_no']]]

result_dat[, id := tstrsplit(id, split = ':')[[1]]]

# NB: We add 'block_effect_size' to the end of this
cols <- c("position", "a0", "a1", "id", "block", "TYPE", "EUR", "zexp", paste0("zsim.", 1:no_reps), paste0("vbetasim.", 1:no_reps), paste0("betasim.", 1:no_reps), paste0("p.", 1:no_reps), "chosen_or", "ncases", "ncontrols", "chr")

result_dat <- result_dat[, ..cols]

result_dat[, block_effect_size := snakemake@wildcards[['effect_size']]]

fwrite(result_dat, file = snakemake@output[[1]], sep = '\t', col.names = F)
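For orientation, a minimal sketch (toy numbers, not pipeline output) of the transforms applied above: simulated Z-scores are drawn around the expected Z-scores with the block LD matrix as covariance, effect sizes are recovered as betasim = zsim * sqrt(vbetasim), and two-sided p-values come from the normal tail.

z <- 2.5
vbeta <- 0.01
beta <- z * sqrt(vbeta)                     # 0.25
p <- 2 * pnorm(abs(z), lower.tail = FALSE)  # ~0.012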
library(data.table)
library(argparse)

parser <- ArgumentParser(description = 'Chops legend and haplotype files into LD block-specific files')
parser$add_argument('--hap_file', type = 'character', help = 'Path to haplotype file')
parser$add_argument('--leg_file', type = 'character', help = 'Path to legend file')
parser$add_argument('-b', '--block_file', type = 'character', help = 'Path to block file')
parser$add_argument('--chr_no', type = 'integer', help = 'Number of chromosome')
parser$add_argument('-o', '--output_root', type = 'character', help = 'Path to output directory', required = T)
parser$add_argument('-nt', '--no_of_threads', type = 'integer', help = 'Number of threads to use', default = 1)

args <- parser$parse_args()

setDTthreads(args$no_of_threads)

leg_dat <- fread(file = args$leg_file, sep = ' ', header = T)
# Skip leading metadata rows
hap_dat <- fread(file = args$hap_file, sep = ' ', header = F, skip = 4)
block_dat <- fread(file = args$block_file, sep = ' ', header = F, col.names = c('block', 'chr', 'start', 'stop'))

block_dat <- block_dat[chr == args$chr_no]

leg_dat[, rs := make.names(id)]

hap_dat[, rs := leg_dat$rs]

for(i in 1:nrow(block_dat)) {
   leg_dat[position %between% c(block_dat[i, start], block_dat[i, stop]), block := (i-1)]
}

hap_dat <- hap_dat[rs %in% leg_dat$rs]

hap_dat[, block := leg_dat$block]

hap_dat <- hap_dat[, c(seq(1, ncol(hap_dat)-3, by = 2), seq(2, ncol(hap_dat)-2, by = 2), ncol(hap_dat)-1, ncol(hap_dat)), with = F]

for(i in sort(unique(hap_dat$block))) {
  fwrite(leg_dat[block == i], file = file.path(args$output_root, sprintf('block_%s.legend.gz', i)), sep = ' ')
  fwrite(hap_dat[block == i], file = file.path(args$output_root, sprintf('block_%s.hap.gz', i)), sep = ' ', col.names = F)
}
library(data.table)
setDTthreads(snakemake@threads)

overlap <- fread(snakemake@input[['overlap']], sep = '\t', header = T)
metadata <- fread(snakemake@input[['metadata']], sep = '\t', header = T)

overlap <- overlap[trait_A %in% snakemake@params[['traits_to_keep']] & trait_B %in% snakemake@params[['traits_to_keep']]]

m_dat <- merge(overlap, metadata[, .(code, desc_A = long_abbrv, ncases_A = n_cases, ncontrols_A = n_controls)], by.x = 'trait_A', by.y = 'code')
m_dat <- merge(m_dat, metadata[, .(code, desc_B = long_abbrv, ncases_B = n_cases, ncontrols_B = n_controls)], by.x = 'trait_B', by.y = 'code')

cols_to_convert <- c("A_controls", "B_controls", "A_cases", "B_cases", "AB_controls", "AB_cases")
m_dat[, c(cols_to_convert) := lapply(.SD, as.numeric), .SDcols = cols_to_convert]

m_dat[, f.temp := sqrt(A_cases * B_cases / A_controls / B_controls)]
m_dat[, rho := ( AB_controls * f.temp + AB_cases / f.temp ) / sqrt( (A_controls + A_cases) * (B_controls + B_cases) )]
m_dat[, f.temp := NULL]

m_dat[, 'P(AB|A)' := AB_cases/A_cases]
m_dat[, 'P(AB|B)' := AB_cases/B_cases]

m_dat <- m_dat[, .(trait_A = desc_A, trait_B = desc_B, trait_A_code = trait_A, trait_B_code = trait_B, A_cases, B_cases, AB_cases, `P(AB|A)`, `P(AB|B)`, A_controls, B_controls, AB_controls, A_cases.gwas = ncases_A, B_cases.gwas = ncases_B, A_controls.gwas = ncontrols_A, B_controls.gwas = ncontrols_B, rho)]

fwrite(m_dat, file = snakemake@output[[1]], sep = '\t')
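A toy restatement (illustrative counts only) of the sample-overlap correlation computed above: rho grows with the number of shared controls and shared cases, scaled by the case/control balance of the two studies.

A_cases <- 5000; A_controls <- 10000
B_cases <- 4000; B_controls <- 12000
AB_cases <- 1000; AB_controls <- 8000

f <- sqrt(A_cases * B_cases / A_controls / B_controls)
rho <- (AB_controls * f + AB_cases / f) /
  sqrt((A_controls + A_cases) * (B_controls + B_cases))
rho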
library(data.table)
library(Rcpp)
library(independence)

setDTthreads(snakemake@threads)

sourceCpp(code = '
#include <Rcpp.h>
#include <map>

using namespace Rcpp;

// [[Rcpp::export]]
NumericVector perturbDuplicates(NumericVector values) {
  std::map<double, int> freqMap;

  for(size_t i = 0; i < values.size(); ++i){
    freqMap[values[i]]++;

    if(freqMap[values[i]] > 1) {
      for(int j = 1; j < freqMap[values[i]]; ++j) {
        values[i] = values[i] + (freqMap[values[i]] * std::numeric_limits<double>::epsilon());
      }
    }
  }

  return values;
}
')

run_hoeffding <- function(dat, trait_A_code, trait_B_code) {
  # Wrap the LHS in parentheses so that the columns *named by* trait_A_code and
  # trait_B_code are updated, rather than new columns literally called 'trait_A_code'
  dat[, (trait_A_code) := perturbDuplicates(get(trait_A_code))]
  dat[, (trait_B_code) := perturbDuplicates(get(trait_B_code))]

  dat <- unique(unique(dat, by = trait_A_code), by = trait_B_code)

  hoeffding.D.test(xs = dat[[trait_A_code]], ys = dat[[trait_B_code]])
}

sum_stats_dat <- fread(snakemake@input[['sum_stats_file']], sep = '\t', header = T, select = c(snakemake@wildcards[['trait_A']], snakemake@wildcards[['trait_B']]))

hoeffding_res <- run_hoeffding(sum_stats_dat, snakemake@wildcards[['trait_A']], snakemake@wildcards[['trait_B']])

res_dat <- data.table(t(unlist(hoeffding_res[c('n', 'Dn', 'scaled', 'p.value')])))

res_dat[, `:=` (trait_A = snakemake@wildcards[['trait_A']], trait_B = snakemake@wildcards[['trait_B']])]

res_dat <- res_dat[, c('trait_A', 'trait_B', 'n', 'Dn', 'scaled', 'p.value')]

if(is.na(res_dat$p.value)) {
  stop("Failed to compute Hoeffding's test p-value")
}

fwrite(res_dat, file = snakemake@output[[1]], sep = '\t', col.names = T, row.names = F)
library(data.table)

setDTthreads(snakemake@threads)

sum_stats_dat <- fread(snakemake@input[[1]], sep = '\t', header = T)

no_snps <- min(nrow(sum_stats_dat), as.integer(snakemake@wildcards[['no_snps']]))

# Draw a random subset of no_snps rows (without replacement) from the full table
sum_stats_dat <- sum_stats_dat[sample(nrow(sum_stats_dat), no_snps)]

fwrite(sum_stats_dat, file = snakemake@output[[1]], sep = '\t')
library(argparse)
library(data.table)
library(evd)
library(fitdistrplus)

parser <- ArgumentParser(description = 'Fits GEV to permuted data using increasingly large subset.')

parser$add_argument('-a', '--trait_A', type = 'character', help = 'Trait A label', required = T)
parser$add_argument('-b', '--trait_B', type = 'character', help = 'Trait B label', required = T)
parser$add_argument('-p', '--perm_file', type = 'character', help = 'Path to file containing GPS value generated from permuted data', required = T)
parser$add_argument('-n', '--n_values', type = 'integer', nargs = '+', help = 'Sample sizes to which to fit GEV', required = T)
parser$add_argument('-o', '--output_file', type = 'character', help = 'Path to output file', required = T)

args <- parser$parse_args()

perm_dat <- fread(args$perm_file, sep = '\t', header = T)

estimate_dat <- data.table(trait_A = character(), trait_B = character(), n = integer(), loc = numeric(), loc.sd = numeric(), scale = numeric(), scale.sd = numeric(), shape = numeric(), shape.sd = numeric())

for(x in args$n_values) {
  fgev.fit <- fgev(perm_dat$GPS[1:x])

  fgev.fitdist <- fitdist(perm_dat$GPS[1:x], 'gev', start = list(loc = fgev.fit$estimate[['loc']], scale = fgev.fit$estimate[['scale']], shape = fgev.fit$estimate[['shape']]))

  loc <- fgev.fitdist$estimate[['loc']]
  loc.sd <- fgev.fitdist$sd[['loc']]
  scale <- fgev.fitdist$estimate[['scale']]
  scale.sd <- fgev.fitdist$sd[['scale']]
  shape <- fgev.fitdist$estimate[['shape']]
  shape.sd <- fgev.fitdist$sd[['shape']]

  estimate_dat <- rbind(estimate_dat, data.table(trait_A = args$trait_A, trait_B = args$trait_B, n = x, loc = loc, loc.sd = loc.sd, scale = scale, scale.sd = scale.sd, shape = shape, shape.sd = shape.sd))

}

fwrite(estimate_dat, sep = '\t', file = args$output_file)
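Sketch of how the fitted parameters are consumed downstream (compare the null-distribution plotting script later in this listing): an observed GPS statistic is converted to a p-value from the upper tail of the fitted GEV. The parameter values here are toy numbers, not fitted estimates.

library(evd)

loc <- 0.5; scale <- 0.1; shape <- 0.05
gps_observed <- 0.9
p_gev <- pgev(gps_observed, loc = loc, scale = scale, shape = shape, lower.tail = FALSE)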
library(data.table)
setDTthreads(snakemake@threads)

ukbb_trait_codes <- snakemake@params[['ukbb_trait_codes']]
sum_stats_files <- snakemake@input[['ukbb_files']]

left_dat <- fread(sum_stats_files[1], sep = '\t', header = T, select = c('variant', 'pval', 'tstat', 'n_complete_samples'))

setnames(left_dat, c('variant', 'pval', 'tstat', 'n_complete_samples'), c('variant', paste0('pval.', ukbb_trait_codes[1]), paste0('tstat.', ukbb_trait_codes[1]), paste0('n_complete_samples.', ukbb_trait_codes[1])))

for(i in 2:length(sum_stats_files)) {
  right_dat <- fread(sum_stats_files[i], sep = '\t', header = T, select = c('variant', 'pval', 'tstat', 'n_complete_samples'))

  setnames(right_dat, c('variant', 'pval', 'tstat', 'n_complete_samples'), c('variant', paste0('pval.', ukbb_trait_codes[i]), paste0('tstat.', ukbb_trait_codes[i]), paste0('n_complete_samples.', ukbb_trait_codes[i])))

  left_dat <- merge(left_dat, right_dat, by = 'variant')
}

if(snakemake@params[['sans_mhc']]) {
  left_dat[, c('chr', 'bp') := tstrsplit(variant, split = ':', keep = 1:2)]

  # tstrsplit returns character columns, so coerce bp before the numeric range test
  left_dat[, bp := as.numeric(bp)]

  # Drop the MHC (chr6:24-45Mb)
  left_dat <- left_dat[!(chr == 6 & bp %between% c(24e6, 45e6))]

  left_dat[, c('chr', 'bp') := NULL]
}

fwrite(left_dat, file = snakemake@output[[1]], sep = '\t')
library(ggplot2)
library(ggpubr)
library(argparse)
require(scales)

theme_set(theme_bw()+
          theme(
            axis.title = element_text(size=12),
            plot.title=element_text(hjust=0.5, size=12),
            strip.text=element_text(size=10),
            axis.text.x=element_text(size=10, angle=30, color="black"),
            axis.text.y=element_text(size=10, color="black"),
            legend.title=element_text(size=10),
            legend.text=element_text(size=10)
          )
          )

parser <- ArgumentParser(description = 'Plot GEV parameter estimates as a function of no. of SNPs')
parser$add_argument('-i', '--fit_files', type = 'character', nargs = '+', help = 'Path to fitted parameters files')
parser$add_argument('-n', '--no_snps', type = 'integer', nargs = '+', help = "No. of SNPs used to fit each file's estimates")
parser$add_argument('-o', '--output_file', type = 'character', help = 'Path to output file')

args <- parser$parse_args()

if(length(args$fit_files) != length(args$no_snps)) {
  stop("Lengths of fit_files and no_snps do not match")
}

daf <- data.frame(trait_A = character(), trait_B = character(), no_snps = integer(), loc = numeric(), loc.sd = numeric(), scale = numeric(), scale.sd = numeric(), shape = numeric(), shape.sd = numeric())

for(i in seq_along(args$fit_files)) {
  fit_daf <- read.table(args$fit_files[i], sep = '\t', header = T)

  daf <- rbind(daf, data.frame(trait_A = fit_daf$trait_A,
                               trait_B = fit_daf$trait_B,
                               no_snps = args$no_snps[i],
                               loc = fit_daf$loc,
                               loc.sd = fit_daf$loc.sd,
                               scale = fit_daf$scale,
                               scale.sd = fit_daf$scale.sd,
                               shape = fit_daf$shape,
                               shape.sd = fit_daf$shape.sd))
}

pl_loc_estimate <- ggplot(daf[c('no_snps', 'loc', 'loc.sd')], aes(x=no_snps, y=loc))+
  geom_line()+
  geom_errorbar(aes(ymin=loc-1.96*loc.sd, ymax=loc+1.96*loc.sd))+
  ggtitle('Location parameter')+
  ylab('Estimate')+
  theme(legend.position = 'none')+
  scale_x_continuous(labels = scales::comma)

pl_scale_estimate <- ggplot(daf[c('no_snps', 'scale', 'scale.sd')], aes(x=no_snps, y=scale))+
  geom_line()+
  geom_errorbar(aes(ymin=scale-1.96*scale.sd, ymax=scale+1.96*scale.sd))+
  ggtitle('Scale parameter')+
  ylab('Estimate')+
  theme(legend.position = 'none')+
  scale_x_continuous(labels = scales::comma)

pl_shape_estimate <- ggplot(daf[c('no_snps', 'shape', 'shape.sd')], aes(x=no_snps, y=shape))+
  geom_line()+
  geom_errorbar(aes(ymin=shape-1.96*shape.sd, ymax=shape+1.96*shape.sd))+
  ggtitle('Shape parameter')+
  ylab('Estimate')+
  theme(legend.position = 'none')+
  scale_x_continuous(labels = scales::comma)

ggsave(plot = ggarrange(plotlist = list(pl_loc_estimate, pl_scale_estimate, pl_shape_estimate), nrow = 1, ncol = 3, common.legend = T), file = args$output_file, width = 8.1, height = 3)
library(ggplot2)
library(ggpubr)
library(argparse)

theme_set(theme_bw()+
          theme(
            axis.title = element_text(size=12),
            plot.title=element_text(hjust=0.5, size=12),
            strip.text=element_text(size=10),
            axis.text.x=element_text(size=10, angle=30, color="black"),
            axis.text.y=element_text(size=10, color="black"),
            legend.title=element_text(size=10),
            legend.text=element_text(size=10)
          )
          )

parser <- ArgumentParser(description = 'Plot GEV parameter estimates as a function of sample size')
parser$add_argument('-f', '--fit_file', type = 'character', help = 'Path to fitted parameters file')
parser$add_argument('-o', '--output_file', type = 'character', help = 'Path to output file')

args <- parser$parse_args()

daf <- read.table(args$fit_file, sep = '\t', header = T)

pl_loc_estimate <- ggplot(daf[c('n', 'loc', 'loc.sd')], aes(x=n, y=loc))+
  geom_line()+
  geom_errorbar(aes(ymin=loc-1.96*loc.sd, ymax=loc+1.96*loc.sd))+
  ggtitle('Location parameter')+
  ylab('Estimate')+
  theme(legend.position = 'none')

pl_scale_estimate <- ggplot(daf[c('n', 'scale', 'scale.sd')], aes(x=n, y=scale))+
  geom_line()+
  geom_errorbar(aes(ymin=scale-1.96*scale.sd, ymax=scale+1.96*scale.sd))+
  ggtitle('Scale parameter')+
  ylab('Estimate')+
  theme(legend.position = 'none')

pl_shape_estimate <- ggplot(daf[c('n', 'shape', 'shape.sd')], aes(x=n, y=shape))+
  geom_line()+
  geom_errorbar(aes(ymin=shape-1.96*shape.sd, ymax=shape+1.96*shape.sd))+
  ggtitle('Shape parameter')+
  ylab('Estimate')+
  theme(legend.position = 'none')

ggsave(plot = ggarrange(plotlist = list(pl_loc_estimate, pl_scale_estimate, pl_shape_estimate), nrow = 1, ncol = 3, common.legend = T), file = args$output_file, width = 8.1, height = 3)
library(argparse)
library(data.table)
library(fitdistrplus)
library(evd)

parser <- ArgumentParser(description = 'Plot goodness-of-fit plots for GEV fit to permuted data')
parser$add_argument('-f', '--fitdist_file', type = 'character', help = 'Path to fitted parameters file')
parser$add_argument('-p', '--perm_file', type = 'character', help = 'Path to file contained permuted null GPS statistics')
parser$add_argument('-l', '--trait_pair_label', type = 'character', help = 'Trait pair label')
parser$add_argument('-o', '--output_path', type = 'character', help = 'Path to output plot file', required = T)

args <- parser$parse_args()

perm_sample <- scan(args$perm_file, skip = 1)

fit_dat <- fread(args$fitdist_file, sep = '\t', header = T)

fgev.fitdist <- fitdist(perm_sample, 'gev', start = list(loc = fit_dat$loc, scale = fit_dat$scale, shape = fit_dat$shape))

fgev.gof <- gofstat(fgev.fitdist)

png(args$output_path)
plot(fgev.fitdist)
title(main = sprintf("%s GOF plots", args$trait_pair_label), outer = T, line = -1)
dev.off()
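plot() on a fitdist object draws the standard fitdistrplus diagnostic panels (empirical vs fitted density, CDF, Q-Q and P-P). A self-contained sketch of the same diagnostics on simulated GEV data, if the permutation files are not to hand (toy parameters only):

library(fitdistrplus)
library(evd)

set.seed(1)
x <- rgev(500, loc = 0.5, scale = 0.1, shape = 0.05)
fit <- fitdist(x, 'gev', start = list(loc = 0.5, scale = 0.1, shape = 0.05))
plot(fit)
gofstat(fit)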
library(argparse)
library(data.table)
library(ggplot2)

theme_set(theme_bw()+
          theme(
            axis.title = element_text(size=12),
            plot.title=element_text(hjust=0.5, size=12),
            strip.text=element_text(size=10),
            axis.text.x=element_text(size=10, angle=30, color="black"),
            axis.text.y=element_text(size=10, color="black"),
            legend.title=element_text(size=10),
            legend.text=element_text(size=10)
          )
          )

library(ggpubr)

parser <- ArgumentParser(description = 'Plot GPS null distribution using specified null permutations file')
parser$add_argument('-f', '--fitdist_file', type = 'character', help = 'Path to fitted parameters file')
parser$add_argument('-p', '--perm_file', type = 'character', help = 'Path to file contained permuted null GPS statistics')
parser$add_argument('-a', '--exp1_null', type = 'character', help = 'Path to exp1 output plot file', required = T)
parser$add_argument('-b', '--gev_null', type = 'character', help = 'Path to gev output plot file', required = T)
parser$add_argument('-c', '--exp1_gev_combined', type = 'character', help = 'Path to combined output plot file', required = T)
parser$add_argument('-nt', '--no_of_threads', type = 'integer', help = 'Number of threads to use', default = 1)

args <- parser$parse_args()

perm_sample <- scan(args$perm_file, skip = 1)

exp1_pvals <- pexp(perm_sample^-2)

fit_dat <- fread(args$fitdist_file, sep = '\t', header = T)

gev_pvals <- evd::pgev(perm_sample, loc = fit_dat$loc, scale = fit_dat$scale, shape = fit_dat$shape, lower.tail = F)

pl_exp1_pvals_hist <- ggplot(data = data.frame(p = exp1_pvals))+
  geom_histogram(aes(x = p, y = ..count../sum(..count..)), colour = 'black', fill = 'gray', breaks = seq(0, 1, length.out = 21))+
  geom_hline(yintercept = 0.05, linetype = "dashed", col = "blue")+
  xlab('GPS p-value')+
  ylab('Relative frequency')+
  ggtitle('Histogram of GPS p-values\nfrom Exp(1) under null')+
  scale_x_continuous(limits = c(0,1))+
  ylim(0,0.1)

ggsave(plot = pl_exp1_pvals_hist, file = args$exp1_null, units = "in", width = 2.7, height = 3)

pl_gev_pvals_hist <- ggplot(data = data.frame(p = gev_pvals))+
  geom_histogram(aes(x = p, y = ..count../sum(..count..)), colour = 'black', fill = 'gray', breaks = seq(0, 1, length.out = 21))+
  geom_hline(yintercept = 0.05, linetype = "dashed", col = "blue")+
  xlab('GPS p-value')+
  ylab('Relative frequency')+
  ggtitle('Histogram of GPS p-values\nfrom GEV under null')+
  scale_x_continuous(limits = c(0,1))+
  ylim(0,0.1)

ggsave(plot = pl_gev_pvals_hist, file = args$gev_null, units = "in", width = 2.7, height = 3)

ggsave(plot = ggarrange(plotlist = list(pl_exp1_pvals_hist, pl_gev_pvals_hist), ncol = 2, nrow = 1), file = args$exp1_gev_combined, units = "in", width = 5.4, height = 3)
library(data.table)

setDTthreads(snakemake@threads)

sum_stats_dat <- fread(snakemake@input[['sum_stats_file']], sep = '\t', header = T)

pruned_rsid_dat <- fread(snakemake@input[['pruned_range_file']], sep = ' ', header = F)

names(pruned_rsid_dat) <- 'ID'

bim_dat <- fread(snakemake@input[['bim_file']], sep = '\t', header = F, col.names = c('chr', 'ID', 'Cm', 'bp', 'A1', 'A2'))

# Prune the rsIDs
bim_dat <- bim_dat[ID %in% pruned_rsid_dat$ID]

bim_dat[, variant_12 := paste(chr, bp, A1, A2, sep = ':')]
bim_dat[, variant_21 := paste(chr, bp, A2, A1, sep = ':')]

# Prune the summary statistics; there are no rsIDs in this file so we need to construct the IDs from coordinates and alleles contained in the concatenated bim file
sum_stats_dat <- sum_stats_dat[variant %in% bim_dat$variant_12 | variant %in% bim_dat$variant_21]

fwrite(sum_stats_dat, file = snakemake@output[[1]], sep = '\t')

system(sprintf("sed -i 's/pval\\.//g' %s", snakemake@output[[1]]))
library(data.table)
setDTthreads(snakemake@threads)

dat <- fread(snakemake@input[[1]], sep = '\t', header = T)

dat[, c('chr', 'bp') := tstrsplit(variant, split = ':', keep = 1:2)]

# tstrsplit returns character columns, so coerce bp before the numeric range test
dat[, bp := as.numeric(bp)]

# Drop the MHC (chr6:24-45Mb)
dat <- dat[!(chr == 6 & bp %between% c(24e6, 45e6))]

dat[, c('chr', 'bp') := NULL]

fwrite(dat, file = snakemake@output[[1]], sep = '\t')
URL: https://github.com/twillis209/gps_paper_pipeline
Name: gps_paper_pipeline