This is a repository holding code to obtain preliminary results for the VR Starting Grant 2023.

public public 1yr ago 0 bookmarks

This Snakemake workflow contains all the code necessary to reproduce the preliminary results from my application to VR Starting Grant 2023.

Authors

Code Snippets

14
15
shell:
	"bcftools query -S {input[1]} -r 9:136131322,9:136139265 -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' {input[0]} > {output[0]}"
24
25
26
27
28
run:
        cols= ['chr','pos','ref','eff'] + [line.strip() for line in open(input[0], 'r')]
        d= pd.read_csv(input[1], header= None, names= cols, sep= '\t')
        d.drop_duplicates(['chr', 'pos'], keep=False, inplace= True)
        d.to_csv(output[0], sep= '\t', header= True, index= False)
SnakeMake From line 24 of rules/ABO.smk
38
39
script:
	"../scripts/ABO.R"
SnakeMake From line 38 of rules/ABO.smk
47
48
shell:
        'touch {output[0]}'
SnakeMake From line 47 of rules/ABO.smk
62
63
64
65
66
67
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        covar= pd.read_csv(input[1], sep= '\t', header= 0)
        d= pd.merge(d, covar, on= 'IID')
        ids= pd.read_csv(input[2], sep= '\t', header= 0)
        d= pd.merge(d, ids, left_on= 'IID', right_on= 'Child')
SnakeMake From line 62 of rules/ABO.smk
91
92
93
94
95
96
97
98
99
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        remove= selectUnrelated(input[1], d, d.Child)
        d= d.loc[~d.Child.isin(remove), :]
        remove= selectUnrelated(input[1], d, d.Mother)
        d= d.loc[~d.Mother.isin(remove), :]
        remove= selectUnrelated(input[1], d, d.Father)
        d= d.loc[~d.Father.isin(remove), :]
        d.to_csv(output[0], sep= '\t', header= True, index= False)
SnakeMake From line 91 of rules/ABO.smk
 9
10
11
12
13
14
15
run:
	d= pd.read_csv(input[0], sep='\t', header=0, usecols= ['CHR', 'POS', 'ID'])
	x= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['CHR', 'POS'])
	x.columns= ['CHR', 'pos']
	d['CHR']= d.CHR.apply(str)
	d['CHR']= np.where(d.CHR== '23', 'X', d.CHR)
	x['CHR']= x.CHR.apply(str)
32
33
34
35
36
37
run:
	d= pd.read_csv(input[0], sep= '\t', header= None)
	d.columns= ['FID', 'IID']
	remove= selectUnrelated(input[1], d, d.IID)
	d= d.loc[~d.IID.isin(remove), :]
	d.to_csv(output[0], sep= '\t', header= True, index= False)
SnakeMake From line 32 of rules/COJO.smk
53
54
55
56
57
58
59
60
61
62
63
run:
	d= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'POS', 'POS2', 'ID'])
	d['CHR']= d['CHR'].apply(str)
	wildCHROM= np.where(str(wildcards.CHR)== 'X', '23', wildcards.CHR)
	if str(wildCHROM) not in set(d.CHR.values):
		print('CHR: ' + wildcards.CHR + ' not in ' + wildcards.sample + ' GWAS.')
		shell('touch {output[0]}')
		shell('touch {output[1]}')
		shell('touch {output[2]}')
	else:
		shell('~/soft/plink2 --bfile {params[0]} --keep {input[0]} --extract bed1 {input[1]} --memory 10000 --threads {threads} --make-bed --out {params[1]}')
77
78
79
80
run:
	if os.stat(input[0]).st_size == 0:
		shell('mv {input[0]} {output[0]}')
		shell('mv {input[1]} {output[1]}')
SnakeMake From line 77 of rules/COJO.smk
105
106
107
108
109
run:
	bed_list= [infile.replace('.bim', '') for infile in input]
	with open(output[0], 'w') as f:
		for item in bed_list:
			f.write("%s\n" % item)
SnakeMake From line 105 of rules/COJO.smk
119
120
121
122
123
124
125
126
127
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, compression= 'gzip', usecols= ['CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'SE', 'LOG10P', 'ID'])[['CHR', 'POS', 'ID', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'SE', 'LOG10P']]
	d.columns= ['CHR', 'POS', 'SNP', 'A1', 'A2', 'N', 'freq', 'b', 'se', 'p']
	d['p']= 10**-d['p']
	d['CHR']= d.CHR.apply(str)
	d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
	d['SNP']= d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.A2 + ':' + d.A1
	d.drop_duplicates('SNP', keep= 'first', inplace= True)
	d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['SNP', 'A1', 'A2', 'freq', 'b', 'se', 'p', 'N'])
SnakeMake From line 119 of rules/COJO.smk
143
144
145
146
147
148
149
150
151
run:
	cma_list= list()
	jma_list= list()
	cma= ['Chr', 'SNP', 'bp', 'refA', 'freq', 'b', 'se', 'p', 'n', 'freq_geno', 'bC', 'bC_se', 'pC', 'locus']
	jma= ['Chr', 'SNP', 'bp', 'refA', 'freq', 'b', 'se', 'p', 'n', 'freq_geno', 'bJ', 'bJ_se', 'pJ', 'LD_r', 'locus']
	d= pd.read_csv(input[1], sep= '\t', header= 0)
	d['CHR']= d['CHR'].apply(str)
	d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
	wildCHROM= np.where(str(wildcards.CHR)== 'X', '23', wildcards.CHR)
SnakeMake From line 143 of rules/COJO.smk
185
186
187
188
189
	shell:
		'''
		head -1 {input[0]} > {output[0]}
                tail -n +2 -q {input} >> {output[0]}
		'''
SnakeMake From line 185 of rules/COJO.smk
197
198
shell:
	'touch {output[0]}'
SnakeMake From line 197 of rules/COJO.smk
13
14
script:
        '../scripts/coloc.R'
6
7
shell:
	'echo rs6755571 > {output[0]}'
17
18
shell:
	'/home/pol.sole.navais/soft/qctool_v2.2.0/qctool -g {input[0]} -incl-rsids {input[1]} -og {output[0]}'
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
	shell:
                '''
                /home/pol.sole.navais/soft/regenie_v3.2.1.gz_x86_64_Linux \
                --step 1 \
                --threads {threads} \
                --gz \
                --bed {params[0]} \
                --covarFile {input[2]} \
                --phenoFile {input[1]} \
                --keep {input[3]} \
                --extract {input[4]} \
                --bsize 1000 \
                --bt --lowmem \
                --lowmem-prefix {params[2]} \
                --catCovarList cohort \
		--condition-list {input[6]} \
		--condition-file bgen,{input[5]} \
		--condition-file-sample {input[7]} \
                --out {params[1]}
                '''
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
        shell:
                """
		/home/pol.sole.navais/soft/regenie_v3.2.1.gz_x86_64_Linux \
                --step 2 \
                --bgen {input[0]} \
                --covarFile {input[2]} \
                --keep {input[3]} \
                --phenoFile {input[1]} \
                --bsize 400 \
                --bt \
                --firth --approx \
                --minINFO 0.6 \
                --threads {threads} \
                --sample {input[5]} \
                --pred {input[4]} \
                --out {params[0]} \
                --af-cc \
                --catCovarList cohort \
		--condition-list {input[8]} \
                --condition-file bgen,{input[7]} \
		--condition-file-sample {input[9]} \
                --verbose
                """
109
110
111
112
113
shell:
        '''
        head -1 {input[0]} > {output[0]}
        tail -n +2 -q {input} >> {output[0]}
        '''
121
122
shell:
        'gzip -c {input[0]} > {output[0]}'
130
131
shell:
        'touch {output[0]}'
 9
10
shell:
	'bcftools query -l {input[0]} > {output[0]}'
18
19
20
21
22
23
24
run:
	df_list= list()
	for infile in input:
		d= pd.read_csv(infile, header= None, names= ['IID'])
		df_list.append(d)
	d= reduce(lambda x, y: pd.merge(x, y, on = 'IID', how = 'inner'), df_list)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
40
41
42
43
44
45
46
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        flag= pd.read_csv(input[1], sep= '\t', header= 0)
        flag= flag.loc[flag.genotypesOK== True, :]
        flag= flag.loc[flag.phenoOK== True, :]
        pcs= [line.strip() for line in open(input[2], 'r')]
        x= pd.read_csv(input[3], sep= '\t', header= 0)
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
run:
	df_list= list()
	for infile in input:
		d= pd.read_csv(infile, sep = ' ', header= 0)
		d= d.loc[d.LOG10P> 3, :]
		d= d.loc[(d.A1FREQ> 0.005 ) & (d.A1FREQ<0.995), :]
		d.sort_values('LOG10P', inplace= True, ascending= False, ignore_index=True)
		d['GENPOS']= d.GENPOS.apply(int).apply(str)
		d['pos2']= d.GENPOS
		d['CHROM']= d.CHROM.apply(str)
		d['CHROM']= np.where(d.CHROM== '23', 'X', d.CHROM)
		df_list.append(d)
	d= pd.concat(df_list)
	d.sort_values(['CHROM', 'GENPOS'], inplace= True, ascending= True)
	d.to_csv(output[0], header= False, index= False, sep= '\t', columns= ['CHROM', 'GENPOS', 'pos2'])
93
94
run:
        shell("bcftools query -S {input[1]} -R {input[0]} -f '%CHROM\t%POS\t%REF\t%ALT[\t%DS]\n' {input[2]} -o {output[0]}")
103
104
105
106
107
run:
        cols= ['chr','pos','ref','eff'] + [line.strip() for line in open(input[0], 'r')]
        d= pd.read_csv(input[1], header= None, names= cols, sep= '\t')
        d.drop_duplicates(['chr', 'pos'], keep=False, inplace= True)
        d.to_csv(output[0], sep= '\t', header= True, index= False)
118
119
script:
	'../scripts/effect_origin_conditional.R'
127
128
129
130
131
132
	shell:
                '''
                set +o pipefail;
                head -1 {input[0]} > {output[0]}
                cat {input} | grep -v 'term' >> {output[0]}
                '''
 9
10
11
12
13
14
15
16
17
18
19
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d= d.loc[(d.quant_method== 'ge') | (d.quant_method== 'microarray'), :]
	d['study']= d.study + '_' + d.quant_method + '_' + d.qtl_group
	d= d.loc[d.study == wildcards.eqtl_catalogue, :]
	url= 'http://' + d.ftp_path.values[0].replace('ftp://', '')
	time.sleep(2)
	print('Downloading from the following url: ' + url)
	shell("tabix {url} 2: > {output[0]}")
	tbi= url.split('/')[-1] + '.tbi'
	shell("rm {tbi}")
28
29
30
run:
        cols= ['molecular_trait_id', 'chromosome', 'position', 'ref', 'alt', 'variant', 'ma_samples', 'maf', 'pvalue', 'beta', 'se', 'type', 'ac', 'an', 'r2', 'molecular_trait_object_id', 'gene_id', 'median_tpm', 'rsid']
        d= pd.read_csv(input[0], sep= '\t', header= None, names= cols)
51
52
53
54
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['rsid', 'CHR', 'POS', 'EAF', 'BETA', 'SE', 'N', 'REF', 'EFF', 'ID'])
	d= d.loc[((d.CHR== 2)), :]
	d.drop_duplicates('ID', keep= 'first', inplace= True)
72
73
shell:
	'/home/pol.sole.navais/soft/liftOver {input[0]} {input[1]} {output[0]} {output[1]}'
82
83
84
85
86
87
run:
        x= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'start', 'POS', 'ID'])
        d= pd.read_csv(input[0], header= 0, sep= '\t', usecols = ['rsid', 'MAF', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'ID'])
        d= pd.merge(d, x[['CHR', 'POS', 'ID']], on= 'ID')
        d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
        d.to_csv(output[0], sep= '\t', header= True, index= False)
100
101
script:
        '../scripts/coloc_eqtl_catalogue.R'
109
110
111
112
113
shell:
	'''
	head -1 {input[0]} > {output[0]}
	tail -n +2 -q {input} >> {output[0]}
	'''
121
122
123
124
125
shell:
        '''
        head -1 {input[0]} > {output[0]}
        tail -n +2 -q {input} >> {output[0]}
        '''
 8
 9
10
11
12
13
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d['IID']= d.Child
	d.to_csv(output[0], sep= '\t', header= False, index= False, columns = ['Child', 'IID'])
	x= pd.DataFrame({'CHR': [2], 'start': [234450000], 'end': [234750000], 'ID': ['id1']})
	x.to_csv(output[1], sep= '\t', header= False, index= False)
26
27
shell:
	"/home/pol.sole.navais/soft/plink --bfile {params[0]} --keep {input[0]} --extract range {input[1]} --r2 --ld-window-r2 0 --ld-window 7000 --out {params[1]}"
11
12
script:
	'../scripts/figures/manhattan-mother-child.R'
25
26
script:
	'../scripts/figures/fetal-UGT1A4-missense.R'
38
39
script:
	'../scripts/figures/ABO-maternal.R'
51
52
script:
	'../scripts/figures/chrX_locus_haplotype.R'
64
65
script:
	'../scripts/figures/UGT-interactions.R'
76
77
script:
	'../scripts/figures/parental-UGT-variants.R'
91
92
script:
	'../scripts/figures/PGS.R'
102
103
	script:
                '../scripts/figures/QQ-plot.R'
111
112
shell:
	'touch {output[0]}'
122
123
script:
	'../scripts/figures/manhattan.R'
131
132
shell:
        'touch {output[0]}'
145
146
script:
	'../scripts/figures/contrast_polygenicity.R'
157
158
script:
	'../scripts/figures/circular_eQTL_coloc.R'
166
167
shell:
	'touch {output[0]}'
182
183
script:
	'../scripts/figures/UGT-locuszoom.R'
197
198
script:
	'../scripts/figures/UGT1-jaundice-eQTL-correlation.R'
210
211
script:
	'../scripts/figures/replication-UGT-locuszoom.R'
13
14
15
16
17
18
19
20
21
shell:
        '''
        /home/pol.sole.navais/soft/plink2 \
        --bfile {params[0]} \
        --mac 100 \
        --write-snplist \
        --keep {input[1]} \
        --out {params[1]}
        '''
29
30
shell:
        '/home/pol.sole.navais/soft/qctool_v2.2.0/qctool -g {input[0]} -os {output[0]}'
SnakeMake From line 29 of rules/GWAS.smk
40
41
shell:
	"/home/pol.sole.navais/soft/plink2 --vcf {input[0]} dosage='DS' --double-id --make-pgen psam-cols=+fid --out {params[0]}"
60
61
run:
	if wildcards.pheno == 'miscarriage_bin':
SnakeMake From line 60 of rules/GWAS.smk
114
115
116
117
run:
	if wildcards.CHR != 'X':
		if wildcards.pheno == 'miscarriage_bin':
			shell('''
SnakeMake From line 114 of rules/GWAS.smk
201
202
203
204
205
shell:
	'''
	head -1 {input[0]} > {output[0]}
	tail -n +2 -q {input} >> {output[0]}
	'''
SnakeMake From line 201 of rules/GWAS.smk
213
214
shell:
        'gzip -c {input[0]} > {output[0]}'
SnakeMake From line 213 of rules/GWAS.smk
222
223
shell:
	'touch {output[0]}'
SnakeMake From line 222 of rules/GWAS.smk
244
245
run:
	shell("""
SnakeMake From line 244 of rules/GWAS.smk
281
282
run:
	shell('''
SnakeMake From line 281 of rules/GWAS.smk
307
308
309
310
311
shell:
        '''
        head -1 {input[0]} > {output[0]}
        tail -n +2 -q {input} >> {output[0]}
        '''
SnakeMake From line 307 of rules/GWAS.smk
319
320
shell:
        'gzip -c {input[0]} > {output[0]}'
SnakeMake From line 319 of rules/GWAS.smk
328
329
shell:
        'touch {output[0]}'
SnakeMake From line 328 of rules/GWAS.smk
345
346
347
run:
	if (wildcards.pheno != 'micsarriage_bin'):
		shell('''
SnakeMake From line 345 of rules/GWAS.smk
392
393
394
395
396
shell:
        '''
        head -1 {input[0]} > {output[0]}
        tail -n +2 -q {input} >> {output[0]}
        '''
SnakeMake From line 392 of rules/GWAS.smk
404
405
shell:
        'gzip -c {input[0]} > {output[0]}'
SnakeMake From line 404 of rules/GWAS.smk
413
414
shell:
        'touch {output[0]}'
SnakeMake From line 413 of rules/GWAS.smk
 8
 9
10
11
12
13
14
15
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'rsid', 'REF', 'EFF', 'N', 'BETA', 'SE'])[['CHR', 'POS', 'rsid', 'REF', 'EFF', 'N', 'BETA', 'SE']]
	d= d.loc[d.rsid != '.', :]
	d= d.loc[d.rsid != '', :]
	d.drop_duplicates(['CHR', 'POS'], inplace= True)
	d['Z']= d.BETA / d.SE
	d.columns= ['CHR', 'BP', 'SNP', 'A2', 'A1', 'N', 'BETA', 'SE', 'Z']
	d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['SNP', 'CHR', 'BP', 'A1', 'A2', 'Z', 'N'])
32
33
34
35
36
37
38
39
40
shell:
	'''
	python2 ~/soft/hess-0.5.3-beta/hess.py \
	--local-hsqg {input[0]} \
	--chrom {wildcards.autoCHR} \
	--bfile {params[0]} \
	--partition {input[1]} \
	--out {params[1]}
	'''
SnakeMake From line 32 of rules/HESS.smk
57
58
59
60
shell:
	'''
	python2 ~/soft/hess-0.5.3-beta/hess.py --prefix {params[0]} --out {params[1]}
	'''
SnakeMake From line 57 of rules/HESS.smk
68
69
shell:
	'touch {output[0]}'
SnakeMake From line 68 of rules/HESS.smk
77
78
script:
	'../scripts/contrast_polygenicity.py'
SnakeMake From line 77 of rules/HESS.smk
86
87
script:
	'../scripts/contrast_polygenicity.py'
SnakeMake From line 86 of rules/HESS.smk
95
96
script:
        '../scripts/contrast_polygenicity.py'
SnakeMake From line 95 of rules/HESS.smk
 9
10
script:
	'../scripts/HWE.R'
SnakeMake From line 9 of rules/HWE.smk
18
19
20
21
22
23
shell:
        '''
        set +o pipefail;
        head -1 {input[0]} > {output[0]}
        cat {input} | grep -v 'SNP' >> {output[0]}
        '''
SnakeMake From line 18 of rules/HWE.smk
 9
10
11
12
13
14
15
16
17
18
19
run:
	d= pd.read_csv(input[0], sep= ' ', header= 0)
	d= d.loc[(d.A1FREQ< 0.995) & (d.A1FREQ> 0.005), :]
	d['Predictor']= d.CHROM.astype(str) + ':' + d.GENPOS.astype(str)
	d['Z']= d.BETA / d.SE
	if wildcards.sample== 'fets':
		d.N= d.N / 4
	d= d.loc[:, ['Predictor', 'ALLELE1', 'ALLELE0', 'N', 'Z']]
	d.drop_duplicates('Predictor', inplace= True)
	d.columns= ['Predictor', 'A1', 'A2', 'n', 'Z']
	d.to_csv(output[0], sep= '\t', header= True, index= False)
30
31
shell:
	'/home/pol.sole.navais/soft/ldak/ldak5.2.linux --tagfile {input[1]} --summary {input[0]} --check-sums NO --sum-hers {params[0]}'
SnakeMake From line 30 of rules/LDAK.smk
39
40
shell:
        'touch {output[0]}'
SnakeMake From line 39 of rules/LDAK.smk
60
61
62
63
64
65
66
67
run:
	snps= pd.read_csv(input[0], sep= '\t', header= 0)
	abo= pd.read_csv(input[1], sep= '\t', header= 0)
	abo= abo[['ABO_incompatibility', 'PREG_ID_1724']]
	mfr= pd.read_csv(input[2], sep= ';', header= 0)
	mfr.drop('KJONN', axis= 1, inplace= True)
	q4= pd.read_csv(input[3], sep= ';', header= 0, usecols= ['PREG_ID_1724', 'DD16', 'DD17'])
	df_list= list()
 93
 94
 95
 96
 97
 98
 99
100
run:
        abo= pd.read_csv(input[0], sep= '\t', header= 0)
        abo= abo[['ABO_incompatibility', 'PREG_ID_1724']]
        mfr= pd.read_csv(input[1], sep= ';', header= 0)
        mfr.drop('KJONN', axis= 1, inplace= True)
        ds= pd.read_csv(input[2], header= 0, sep= '\t')
        pgs= pd.read_csv(input[3], header= 0, sep= '\t')
        pgsno2= pd.read_csv(input[4], header= 0, sep= '\t')
10
11
12
13
14
15
16
17
18
19
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, comment= '#')
	d['ID']= np.where(d.effect_allele < d.other_allele, d.chr_name.apply(str) + ':' + d.chr_position.apply(str) + ':' + d.effect_allele + ':' + d.other_allele, d.chr_name.apply(str) + ':' + d.chr_position.apply(str) + ':' + d.other_allele + ':' + d.effect_allele)
	x= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['POS', 'ID'])
	x.drop_duplicates('ID', inplace= True)
	d= d.loc[d.ID.isin(x.ID.values), :]
	d= d[['chr_name', 'chr_position', 'other_allele', 'effect_allele', 'effect_weight']]
	d.columns= ['chr', 'pos', 'REF', 'EFF', 'beta']
	d.to_csv(output[0], sep= '\t', header= True, index= False)
	d.to_csv(output[1], sep= '\t', columns= ['chr', 'pos', 'pos'], header= False, index= False)
SnakeMake From line 10 of rules/PGS.smk
29
30
shell:
	"bcftools query -S {input[1]} -R {input[0]} -f '%CHROM\t%POS\t%REF\t%ALT[\t%DS]\n' {input[2]} -o {output[0]}"
40
41
script:
	'../scripts/calculate_PGS.py'
SnakeMake From line 40 of rules/PGS.smk
49
50
51
52
53
shell:
        '''
        head -1 {input[0]} > {output[0]}
        tail -n +2 -q {input} >> {output[0]}
        '''
SnakeMake From line 49 of rules/PGS.smk
61
62
run:
        df= pd.read_csv(input[0], sep= '\t', header= 0)
SnakeMake From line 61 of rules/PGS.smk
76
77
78
79
80
81
82
83
84
85
86
87
run:
	df_list= list()
	beta_files= [i for i in input if 'betas' in i]
	for infile in beta_files:
		d= pd.read_csv(infile, sep= '\t', header= 0)
		df_list.append(d)
	d= reduce(lambda df1, df2: pd.merge(df1, df2, on= ['chr', 'pos', 'REF', 'EFF', 'beta']), df_list)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
	df_list= list()
	variant_ids= [i for i in input if i not in beta_files]
	for infile in variant_ids:
		d= pd.read_csv(infile, sep= '\t', header= None, names= ['chr', 'pos1', 'pos2'])
SnakeMake From line 76 of rules/PGS.smk
101
102
script:
        '../scripts/calculate_PGS.py'
SnakeMake From line 101 of rules/PGS.smk
110
111
112
113
114
shell:
        '''
        head -1 {input[0]} > {output[0]}
        tail -n +2 -q {input} >> {output[0]}
        '''
SnakeMake From line 110 of rules/PGS.smk
122
123
run:
        df= pd.read_csv(input[0], sep= '\t', header= 0)
SnakeMake From line 122 of rules/PGS.smk
137
138
139
140
141
shell:
	"""
	tabix -h -R {input[0]} {input[2]} > {output[0]}
	bcftools query -S {input[1]} -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' {output[0]} -o {output[1]}
	"""
150
151
run:
        cols= ['chr','pos','ref','eff'] + [line.strip() for line in open(input[0], 'r')]
SnakeMake From line 150 of rules/PGS.smk
171
172
script:
        '../scripts/allele_transmission.py'
SnakeMake From line 171 of rules/PGS.smk
181
182
script:
        '../scripts/calculate_PGS.py'
SnakeMake From line 181 of rules/PGS.smk
190
191
192
193
194
shell:
        '''
        head -1 {input[0]} > {output[0]}
        tail -n +2 -q {input} >> {output[0]}
        '''
SnakeMake From line 190 of rules/PGS.smk
202
203
204
205
run:
        df= pd.read_csv(input[0], sep= '\t', header= 0)
        df= df.groupby('PREG_ID').sum().reset_index()
        df.to_csv(output[0], sep= '\t', header= True, index= False)
SnakeMake From line 202 of rules/PGS.smk
214
215
shell:
	'touch {output[0]}'
SnakeMake From line 214 of rules/PGS.smk
229
230
script:
	'../scripts/PGS_haplotype.R'
SnakeMake From line 229 of rules/PGS.smk
27
28
script:
	'../scripts/pheno_file.R'
40
41
42
43
44
45
46
47
48
49
50
51
run:
        d= pd.read_csv(input[1], header= 0, sep= '\t')
        pca= pd.read_csv(input[0], header= 0, sep= '\t')
        d= pd.merge(d, pca, how= 'inner', on= 'IID')
        d.fillna('NA', inplace= True)
        for elem in set(d['cohort']):
                d[str(elem)]= d['cohort'] == elem
        d[['HARVEST', 'NORMENT', 'ROTTERDAM1', 'ROTTERDAM2']] *= 1
        d['FID']= d.IID
        d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['FID', 'IID', wildcards.pheno])
        d.to_csv(output[1], sep= '\t', header= True, index= False, columns= ['FID', 'IID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'cohort', 'MOR_FAAR'])
        d.to_csv(output[2], sep= '\t', header= False, index= False, columns= ['FID', 'IID'])
66
67
script:
        '../scripts/pheno_file.R'
79
80
81
82
83
	run:
                d= pd.read_csv(input[1], header= 0, sep= '\t')
                pca= pd.read_csv(input[0], header= 0, sep= '\t')
                d= pd.merge(d, pca, how= 'inner', on= 'IID')
                remove= selectUnrelated(input[2], d, d.IID)
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
run:
	df_list= list()
	for d in pd.read_csv(input[0], sep= ' ', header= 0, chunksize= 200000):
		d= d[['CHROM', 'GENPOS', 'ALLELE0', 'ALLELE1', 'A1FREQ', 'INFO', 'N' ,'BETA', 'SE', 'LOG10P']]
		d= d.loc[d.INFO> 0.7, :]
		d= d.loc[(d.A1FREQ> 0.001) & (d.A1FREQ< 0.991), :]
		d.columns= ['CHR', 'POS', 'REF', 'EFF', 'EAF', 'INFO', 'N', 'BETA', 'SE', 'LOG10P']
		d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
		d['BETA']= np.where(d.REF > d.EFF, -1 * d.BETA, d.BETA)
		d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
		d['REF'], d['EFF']= np.where(d['REF'] > d['EFF'], (d['EFF'], d['REF']), (d['REF'], d['EFF']))
		d['ID']= d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF + ':' + d.EFF
		df_list.append(d)
	d= pd.concat(df_list)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
	d['start']= d.POS - 1
	d.to_csv(output[1], sep= '\t', header= False, index= False, columns= ['CHR', 'start', 'POS', 'ID'])
33
34
shell:
	'gzip -cd {input[0]} | cut -f1-5 > {output[0]}'
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
run:
	with open(input[0], 'rt') as f:
		dialect = csv.Sniffer().sniff(f.readline(), delimiters= ' \t')
		f.seek(0)
		input_file= csv.DictReader(f, delimiter= dialect.delimiter)
		df_list= list()
		with open(output[0], 'w') as csvfile:
			writer = csv.writer(csvfile, delimiter= '\t')
			writer.writerow([g for g in ['ID', 'rsid']])
		for row in input_file:
			if row['ID']== '.':
				continue
			if row['REF'] > row['ALT']:
				ID= row['#CHROM'] + ':' + row['POS'] + ':' + row['ALT'] + ':' + row['REF']
			else:
				ID= row['#CHROM'] + ':' + row['POS'] + ':' + row['REF'] + ':' + row['ALT']
			df_list.append([ID, row['ID']])
			if len(df_list)== 10**6:
				with open(output[0], 'a', newline= '') as file_handler:
					writer1= csv.writer(file_handler, delimiter= '\t')
					for item in df_list:
						writer1.writerow(item)
				df_list= list()
	with open(output[0], 'a', newline= '') as file_handler:
		writer1= csv.writer(file_handler, delimiter= '\t')
		for item in df_list:
			writer1.writerow(item)
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d= d.loc[d.cdsStart!= d.cdsEnd, :]
	d['chrom']= d.chrom.str.replace(' ', '')
	d['chrom']= d.chrom.str.replace('chr', '')
	d['chrom']= d.chrom.str.replace('X', '23')
	d['chrom']= pd.to_numeric(d.chrom, errors= 'coerce')
	d.dropna(subset= ['chrom'], inplace= True)
	d.loc[d.txStart > d.txEnd, ['txStart', 'txEnd']]= d.loc[d.txStart > d.txEnd, ['txEnd', 'txStart']].values
	x= d.groupby(['chrom', 'geneSymbol', 'ENSEMBLE_ID'])['txStart'].min().reset_index()
	x1= d.groupby(['chrom', 'geneSymbol', 'ENSEMBLE_ID'])['txEnd'].max().reset_index()
	df= pd.merge(x, x1, on= ['chrom', 'geneSymbol', 'ENSEMBLE_ID'])
	df.columns= ['CHR', 'geneSymbol', 'ENSEMBLE_ID', 'start', 'end']
	df= df.loc[df.start != df.end, :]
	df['start']= df.start - 1
	df.sort_values(by= ['CHR', 'start'], inplace= True)
	df= df[['CHR', 'start', 'end', 'geneSymbol', 'ENSEMBLE_ID']]
	df.to_csv(output[0], sep= '\t', header= True, index= False)
	df[['CHR', 'start', 'end']]= df[['CHR', 'start', 'end']].apply(np.int64)
	df.to_csv(output[1], sep= '\t', header= False, index= False)
105
106
shell:
        'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'
116
117
118
119
120
121
122
run:
	rsid= pd.read_csv(input[1], sep= '\t', header=0)
	nearest_gene= pd.read_csv(input[2], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene', 'Ensembl_gene'], usecols= ['ID', 'nearestGene'])
	for d in pd.read_csv(input[0], sep= '\t', header= 0, chunksize= 200000):
		d= pd.merge(d, rsid, on= 'ID', how= 'left')
		d= pd.merge(d, nearest_gene, on= 'ID', how= 'left')
		d.to_csv(output[0], sep= '\t', header= not os.path.isfile(output[0]), index= False, mode= 'a')
131
132
shell:
	'gzip -c {input[0]} > {output[0]}'
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
run:
        d= pd.read_csv(input[0], sep= '\t', compression= 'gzip')
        df= d.loc[d.LOG10P> -np.log10(5*10**-8), :]
        df.sort_values(by= 'LOG10P', ascending= False, inplace= True)
        df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
        df_list= list()
        for chrom in set(df.CHR):
                d_temp= df.loc[df.CHR== chrom, :]
                positions= d_temp.POS.values
                for pos in positions:
                        if pos in d_temp.POS.values:
                                df_list.append(d_temp.loc[d_temp.POS== pos, :])
                                d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
                        else:
                                continue
        x= pd.concat(df_list)
        x['CHR']= x.CHR.astype(str)
        x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
        x.to_csv(output[0], sep='\t', header= True, index= False)
167
168
shell:
	'touch {output[0]}'
11
12
script:
	'../scripts/fetal_miscarriage.Rmd'
 8
 9
10
11
12
13
14
15
16
17
18
19
run:
	array_res= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['#CHROM', 'POS', 'MarkerName', 'ALT', 'REF', 'Effect', 'StdErr', 'P-value'])
	imp_res= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['#CHROM', 'POS', 'MarkerName', 'ALT', 'REF', 'Effect', 'StdErr', 'P-value', 'MACH_R2', 'MAF'])
	imp_res= imp_res.loc[imp_res.MAF>= 0.01, :]
	imp_res= imp_res.loc[imp_res.MACH_R2>= 0.7, :]
	d= pd.concat([array_res, imp_res])
	print(d.columns)
	d['ALT']= d.ALT.str.upper()
	d['REF']= d.REF.str.upper()
	d= d[['#CHROM', 'POS', 'MarkerName', 'ALT', 'REF', 'Effect', 'StdErr', 'P-value']]
	d.columns= ['CHROM', 'POS', 'RSID', 'EFF', 'REF', 'BETA', 'SE', 'pvalue']
	d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip')
29
30
run:
        df= pd.read_csv(input[0], sep= '\t')
67
68
run:
        shell("bcftools query -S {input[1]} -R {input[0]} -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' {input[2]} -o {output[0]}")
77
78
79
80
81
run:
        cols= ['chr','pos','ref','eff'] + [line.strip() for line in open(input[0], 'r')]
        d= pd.read_csv(input[1], header= None, names= cols, sep= '\t')
        d.drop_duplicates(['chr', 'pos'], keep=False, inplace= True)
        d.to_csv(output[0], sep= '\t', header= True, index= False)
89
90
91
92
93
94
shell:
        '''
        set +o pipefail;
        head -1 {input[0]} > {output[0]}
        cat {input} | grep -v 'chr' >> {output[0]}
        '''
108
109
script:
        '../scripts/phase_by_transmission.py'
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        covar= pd.read_csv(input[1], sep= '\t', header= 0)
        d= pd.merge(d, covar, on= 'IID')
        ids= pd.read_csv(input[2], sep= '\t', header= 0)
        d= pd.merge(d, ids, left_on= 'IID', right_on= 'Child')
        df_list= list()
        for i in range(3, len(input)):
                x= pd.read_csv(input[i], sep= '\t', header= 0)
                varnames= ('chr' + x.chr.apply(str) + '_' + x.pos.apply(str) + '_' + x.ref + '_' + x.eff).values.tolist()
                x= pd.DataFrame(x.iloc[:, 4:].T)
                haplo= input[i].split('-')[1].replace('_PREG_ID', '')
                x.columns= [i + '_' + haplo for i in varnames]
                x['PREG_ID']= x.index
                df_list.append(x)
        x= reduce(lambda x, y: pd.merge(x, y, on = 'PREG_ID', how = 'inner'), df_list)
        print(x)
        print('Now d')
        print(d)
        x['PREG_ID']= x.PREG_ID.apply(str)
        d['PREG_ID']= d.PREG_ID.apply(str)
        x= pd.merge(x, d, on= 'PREG_ID')
        print(x.columns)
        x.to_csv(output[0], sep= '\t', header= True, index= False)
155
156
157
158
159
160
161
162
163
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        remove= selectUnrelated(input[1], d, d.Child)
        d= d.loc[~d.Child.isin(remove), :]
        remove= selectUnrelated(input[1], d, d.Mother)
        d= d.loc[~d.Mother.isin(remove), :]
        remove= selectUnrelated(input[1], d, d.Father)
        d= d.loc[~d.Father.isin(remove), :]
        d.to_csv(output[0], sep= '\t', header= True, index= False)
177
178
script:
	'../scripts/linear_hypotheses.R'
 9
10
run:
	shell("bcftools query -S {input[0]} -r 2:234627536 -f '%CHROM\t%POS\t%REF\t%ALT[\t%DS]\n' {input[1]} -o {output[0]}")
19
20
run:
	cols= ['chr','pos','ref','eff'] + [line.strip() for line in open(input[0], 'r')]
37
38
39
40
41
run:
	alld= pd.read_csv(input[0], sep= '\t', header= 0)
	trios= pd.read_csv(input[1], sep= '\t', header= 0)
	moms= pd.read_csv(input[2], sep= '\t', header= 0)
	moms= moms.iloc[:, 4:].T.reset_index()
71
72
run:
	shell("bcftools query -S {input[0]} -r 2:234627536 -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' {input[1]} -o {output[0]}")
81
82
83
84
85
run:
        cols= ['chr','pos','ref','eff'] + [line.strip() for line in open(input[0], 'r')]
        d= pd.read_csv(input[1], header= None, names= cols, sep= '\t')
        d.drop_duplicates(['chr', 'pos'], keep=False, inplace= True)
        d.to_csv(output[0], sep= '\t', header= True, index= False)
 99
100
script:
        '../scripts/phase_by_transmission.py'
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        df_list= list()
        for i in range(1, len(input)):
                x= pd.read_csv(input[i], sep= '\t', header= 0)
                varnames= ('chr' + x.chr.apply(str) + '_' + x.pos.apply(str) + '_' + x.ref + '_' + x.eff).values.tolist()
                x= pd.DataFrame(x.iloc[:, 4:].T)
                haplo= input[i].split('/')[-1].replace('_PREG_ID', '')
                x.columns= [i + '_' + haplo for i in varnames]
                x['PREG_ID_1724']= x.index
                df_list.append(x)
        x= reduce(lambda x, y: pd.merge(x, y, on = 'PREG_ID_1724', how = 'inner'), df_list)
        x['PREG_ID_1724']= x.PREG_ID_1724.apply(str)
        d['PREG_ID_1724']= d.PREG_ID_1724.apply(str)
        x= pd.merge(x, d, on= 'PREG_ID_1724')
        x.to_csv(output[0], sep= '\t', header= True, index= False)
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
library(data.table)
library(dplyr)
library(tidyr)

format_haps= function(hap){
variants= paste(hap$chr, hap$pos, hap$ref, hap$eff, sep =':')
ids= names(hap)[5:ncol(hap)]
hap= as.data.frame(t(hap[, 5:ncol(hap)]))
names(hap)= variants
hap$PREG_ID= ids
return(hap)
}



d = fread(snakemake@input[[1]])

d= format_haps(d)

variants= names(d[, 1:2])

names(d)= c('V1', 'V2', 'IID')

d$V1= gsub('\\/', '\\|', d$V1)
d$V2= gsub('\\/', '\\|', d$V2)

d$V1= with(d, ifelse(V1== '0|0', 0, ifelse(V1== '1|0' | V1== '0|1', 1, ifelse(V1== '1|1', 2, NA))))
d$V2= with(d, ifelse(V2== '0|0', 0, ifelse(V2== '1|0' | V2== '0|1', 1, ifelse(V2== '1|1', 2, NA))))


d= mutate(d, ABO = ifelse(V2== 0, "O", NA)) %>% mutate(ABO= ifelse((V1== 2 & V2 == 2) | (V1== 1 & V2== 1), 'B', ABO)) %>% mutate(ABO= ifelse(V1== 0 & V2> 0, 'A', ABO)) %>% mutate(ABO= ifelse((V1== 1 & V2== 2), 'AB', ABO))

d= select(d, IID, ABO)

fwrite(d, file = snakemake@output[[1]], sep="\t")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas as pd
import numpy as np
import csv

PREG_ID= 'PREG_ID'
Sentrix= 'IID'

def format_df(df):
	df[['chr', 'pos', 'ref', 'eff']]= df['index'].str.split(':', expand= True)
	cols = list(df.columns.values)
	cols= cols[-4:] + cols[:-4]
	df= df[cols]
	df.drop(['index'], axis= 1, inplace= True)
	return df


d= pd.read_csv(snakemake.input[2], sep= '\t', header= 0)
d.dropna(axis= 0, inplace= True)

with open(snakemake.input[0], 'r') as infile:
    reader= csv.DictReader(infile, delimiter= '\t')
    fets_cols= reader.fieldnames

with open(snakemake.input[1], 'r') as infile:
    reader= csv.DictReader(infile, delimiter= '\t')
    moms_cols= reader.fieldnames

with open(snakemake.input[3], 'r') as infile:
    reader= csv.DictReader(infile, delimiter= '\t')
    dads_cols= reader.fieldnames

d= d.loc[d.Child.isin(fets_cols), :]
d= d.loc[d.Mother.isin(moms_cols), :]
d= d.loc[d.Father.isin(dads_cols), :]

fets_snp_list= list()
h1_df_list= list()
h3_df_list= list()

for fets in pd.read_csv(snakemake.input[0], sep='\t', header= 0, chunksize= 100):
	fets_snp_list.append(fets.chr.apply(str) + ':' + fets.pos.apply(str) + ':' + fets.ref + ':' + fets.eff)
	fets= fets[d.Child]
	fets= fets.astype(str)
	fets= np.where(fets== '0', '0|0', np.where(fets== '1', '1|1', fets))
	h3= np.where((fets== '0|0') | (fets== '0|1'), 0, np.where((fets== '1|0') | (fets== '1|1'), 1,np.nan))
	h1= np.where((fets== '0|0') | (fets== '1|0'), 0, np.where((fets== '0|1') | (fets== '1|1'), 1, np.nan))
	h1_df_list.append(h1)
	h3_df_list.append(h3)

varnames= pd.concat(fets_snp_list).values.tolist()

h1= np.concatenate(h1_df_list)
h3= np.concatenate(h3_df_list)


moms_df_list= list()

for moms in pd.read_csv(snakemake.input[1], sep= '\t', header= 0, chunksize= 100):
	moms= moms[d.Mother]
	moms= np.where(moms== '0|0', 0, np.where((moms== '0|1') | (moms== '1|0'), 1, np.where(moms== '1|1', 2, np.nan)))
	moms_df_list.append(moms)

moms= np.concatenate(moms_df_list)

h2= moms - h1

dads_df_list= list()

for dads in pd.read_csv(snakemake.input[3], sep='\t', header= 0, chunksize= 100):
	dads= dads[d.Father]
	dads= dads.astype(str)
	dads= np.where(dads== '0', '0|0', np.where(dads== '1', '1|1', dads))
	dads= np.where(dads== '0|0', 0, np.where((dads== '0|1') | (dads== '1|0'), 1, np.where(dads== '1|1', 2, np.nan)))
	dads_df_list.append(dads)

dads= np.concatenate(dads_df_list)

h4= dads - h3

h1= pd.DataFrame(data= h1, columns= d[PREG_ID], index= varnames).reset_index()
h2= pd.DataFrame(data= h2, columns= d[PREG_ID], index= varnames).reset_index()
h3= pd.DataFrame(data= h3, columns= d[PREG_ID], index= varnames).reset_index()
h4= pd.DataFrame(data= h4, columns= d[PREG_ID], index= varnames).reset_index()

h1= format_df(h1)
h2= format_df(h2)
h3= format_df(h3)
h4= format_df(h4)

h1.to_csv(snakemake.output[0], header= True, sep= '\t', index= False)
h2.to_csv(snakemake.output[1], header= True, sep= '\t', index= False)
h3.to_csv(snakemake.output[2], header= True, sep= '\t', index= False)
h4.to_csv(snakemake.output[3], header= True, sep= '\t', index= False)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import pandas as pd
import numpy as np

def remove_palindromic(d, REF, EFF):
        return (d.loc[~(((d[REF]== 'T') & (d[EFF]== 'A')) | ((d[REF]== 'A') & (d[EFF]== 'T')) | ((d[REF]== 'C') & (d[EFF]== 'G')) | ((d[REF]== 'G') & (d[EFF]== 'C'))), :])

def flip_alleles(x):
        x= x.str.upper()
        x= x.str.replace('C', 'g')
        x= x.str.replace('G', 'c')
        x= x.str.replace('T', 'a')
        x= x.str.replace('A', 't')
        return x.str.upper()

def harmonize_alleles(d, REF_x, EFF_x, REF_y, EFF_y):
#        d= remove_palindromic(d, REF_x, EFF_x)
#        d= remove_palindromic(d, REF_y, EFF_y)
        d['beta']= np.where((d[EFF_y]== d[EFF_x]) & (d[REF_y] == d[REF_x]), d.beta, np.where((d[EFF_y]== d[REF_x]) & (d[REF_y]== d[EFF_x]), -1 * d.beta, np.where((flip_alleles(d[EFF_y])== d[EFF_x]) & (flip_alleles(d[REF_y])== d[REF_x]), d.beta, np.where((flip_alleles(d[EFF_y])== d[REF_x]) & (flip_alleles(d[REF_y]) == d[EFF_x]), -1 * d.beta, np.nan))))
        d= d.loc[~(d.beta.isnull()), :]
        d[EFF_y]= np.where((d[EFF_y]== d[EFF_x]) & (d[REF_y]== d[REF_x]), d[EFF_x], np.where((d[EFF_y]== d[REF_x]) & (d[REF_y] == d[EFF_x]), d[REF_x], np.where((flip_alleles(d[EFF_y])== d[EFF_x]) & (flip_alleles(d[REF_y])== d[REF_x]), d[EFF_x], np.where((flip_alleles(d[EFF_y])== d[REF_x]) & (flip_alleles(d[REF_y]) == d[EFF_x]), d[REF_x], np.nan))))
        d[REF_y]= np.where((d[EFF_y]== d[EFF_x]) & (d[REF_y]== d[REF_x]), d[REF_x], np.where((d[EFF_y]== d[REF_x]) & (d[REF_y]== d[EFF_x]), d[EFF_x], np.where((flip_alleles(d[EFF_y])== d[EFF_x]) & (flip_alleles(d[REF_y])== d[REF_x]), d[REF_x], np.where((flip_alleles(d[EFF_y])== d[REF_x]) & (flip_alleles(d[REF_y]) == d[EFF_x]), d[EFF_x], np.nan))))
        return d

def calculate_PGS(d, ID):
	d.drop_duplicates(['chr', 'pos'], keep=False, inplace= True)
	d['chr']= d.chr.apply(str)
	d= pd.merge(betas, d, on= ['chr', 'pos']) 
	d= harmonize_alleles(d, 'ref', 'eff', 'REF', 'EFF')
	d.drop_duplicates(['chr', 'pos'], keep=False, inplace= True)
	betas_v= np.array(d.beta)
	d.drop(['ref', 'eff', 'pos', 'chr', 'EFF', 'REF', 'beta'], axis= 1, inplace= True)
	ids= d.columns
	d= pd.DataFrame(np.array(d) * betas_v.reshape(-1, 1), columns= ids)
	d= pd.DataFrame(d.sum(axis= 0))
	d[ID]= d.index
	return d

# Read data
if 'haplotype' not in snakemake.input[0]:
	cols= ['chr','pos','ref','eff'] + [line.strip() for line in open(snakemake.input[0], 'r')]
	betas= pd.read_csv(snakemake.input[2], sep= '\t', header= 0)
	betas['chr']= betas['chr'].apply(str)
	if 'nochr2' in snakemake.output[0]:
		betas= betas.loc[~((betas.chr== '2') & (betas.pos> (234627536 - 1e6)) & (betas.pos < (234627536 + 1e6))), : ]
	df_list= list()
	for d in pd.read_csv(snakemake.input[1], header= None, names= cols, sep= '\t', chunksize= 600):
		d= calculate_PGS(d, 'IID')
		df_list.append(d)
	d= pd.concat(df_list)
else:
	betas= pd.read_csv(snakemake.input[1], sep= '\t', header= 0)
	betas['chr']= betas['chr'].apply(str)
	if 'nochr2' in snakemake.output[0]:
		betas= betas.loc[~((betas.chr== '2') & (betas.pos> (234627536 - 1e6)) & (betas.pos < (234627536 + 1e6))), : ]
	df_list= list()
	for d in pd.read_csv(snakemake.input[0], header= 0, sep= '\t', chunksize= 600):
		d= calculate_PGS(d, 'PREG_ID')
		df_list.append(d)
	d= pd.concat(df_list)

if 'haplotype' in snakemake.input[0]:
	d= d.groupby('PREG_ID').sum().reset_index()
	d.columns= ['PREG_ID', snakemake.wildcards.haplo + '_' + snakemake.wildcards.pheno]
else:
	d= d.groupby('IID').sum().reset_index()
	d.columns.values[0]= snakemake.wildcards.sample + '_' + snakemake.wildcards.pheno


d.to_csv(snakemake.output[0], sep ='\t', header= True, index= False)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

pph_outfile= snakemake@output[[1]]
results_outfile= snakemake@output[[2]]

cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tgene\teqtl_data\n', file = snakemake@output[[1]])

cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tgene\teqtl_data\n', file= snakemake@output[[2]])

prior1= 1 * 10**-4
prior2= 1 * 10**-4
prior12= 5 * 10**-6

s_cc= ifelse(grepl('fets', snakemake@input[[1]]), 0.069, ifelse(grepl('moms', snakemake@input[[1]]),  0.085, 0.086))

eqtl_coloc= function(temp_df, gene, eqtl_data){
if (nrow(temp_df)== 0) {

        PPH= data.frame(nsnps= 0, PP.H0.abf= 0,PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, geneid= gene, eqtl_data_n= eqtl_data)
        fwrite(PPH, pph_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
	res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0,  geneid= gene, eqtl_data_n= eqtl_data)
        fwrite(res, results_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)    
        print('No data available.')

        } else {

        data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N= temp_df$TOTALSAMPLESIZE, type= 'cc', snp= temp_df$POS, s= s_cc, MAF= temp_df$MAF)

        data2= list(beta= temp_df$beta, varbeta= temp_df$se**2, N= temp_df$N, type= 'quant', snp= temp_df$POS, MAF= temp_df$maf)

        myres= tryCatch({suppressWarnings(coloc.abf(data1, data2, p1= prior1, p2= prior2, p12= prior12))}, error= function(e) { return(0)}
)       
        if (length(myres)==1 ) {  
        PPH= data.frame(nsnps= 0, PP.H0.abf= 0, PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, geneid= gene, eqtl_data_n= eqtl_data)

fwrite(PPH, pph_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)

res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, geneid= gene, eqtl_data_n= eqtl_data)

	fwrite(res, results_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)

        print(paste0('Error in coloc.abf. Nrow temp_df:', nrow(temp_df)))

        } else {

        PPH= data.frame(t(myres[[1]]))
        PPH$geneid= gene
	PPH$eqtl_data_n= eqtl_data
        if ((PPH$PP.H3.abf + PPH$PP.H4.abf) >= 0.01) {
        fwrite(PPH, pph_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        res= myres[[2]]
        res$geneid= gene
	res$eqtl_data_n= eqtl_data
        fwrite(res, results_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        } else {
        print('Not enough power')
        }

}
}
}

format_eqtl= function(temp_df){
	gene= unique(temp_df$gene_id)
	eqtl_data= unique(temp_df$eqtl_data)
	temp_df = filter(temp_df, SE>0, se> 0)
	print(nrow(temp_df))
	eqtl_coloc(temp_df, gene, eqtl_data)

}


d= fread(snakemake@input[[1]])
d= filter(d, !duplicated(POS))

df= fread(snakemake@input[[2]], h= T)
df= group_by(df, gene_id) %>% filter(!duplicated(position)) %>% ungroup()

if (nrow(df)== 0) {

print('No data for the genes selected.')
} else {

eqtl_data_name= snakemake@wildcards[[1]]

df$eqtl_data= eqtl_data_name

d= inner_join(d, df, by= c('POS'= 'position'))

d= filter(d, EFF == alt | EFF == ref)

(print(paste('# rows:', nrow(d))))

z= mclapply(split(d, d$gene_id), format_eqtl, mc.cores= 3)

} 
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

prior1= 1 * 10**-4
prior2= 1 * 10**-4
prior12= 5 * 10**-6

cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tlocus\n', file = snakemake@output[[1]])

cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tlocus\n', file= snakemake@output[[2]])


d= fread(snakemake@input[[1]], select= c('ID', 'CHR', 'POS', 'N', 'BETA', 'SE', 'LOG10P', 'EAF'))

d= filter(d, !duplicated(ID))

d$MAF= ifelse(d$EAF>0.5, 1 - d$EAF, d$EAF)

x= fread(snakemake@input[[2]])

x$BETA= ifelse(x$REF > x$EFF, -1 * x$BETA, x$BETA)

x= select(x, CHROM, POS, EFF, REF, BETA, SE, pvalue)

x$TOTALSAMPLESIZE= 363228

x$EFF= toupper(x$EFF)
x$REF= toupper(x$REF)

x$BETA= ifelse(x$REF> x$EFF, -1 * x$BETA, x$BETA)
x$ID= with(x, ifelse(REF> EFF, paste(CHROM, POS, EFF, REF, sep= ':'), paste(CHROM, POS, REF, EFF, sep= ':')))

x= select(x, ID, TOTALSAMPLESIZE, BETA, SE, pvalue)
names(x)= c('ID', 'TOTALSAMPLESIZE', 'beta', 'se', 'p')

d= inner_join(d, x, by= 'ID')

ld_indep= fread(snakemake@input[[3]])

ld_indep$chr= gsub('chr', '', ld_indep$chr)
ld_indep$chr= as.numeric(ld_indep$chr)

ld_indep$locus= paste(ld_indep$chr, ld_indep$start, ld_indep$stop, sep= '-')


funk= function(i) {
        row= ld_indep[i, ]
        locus1= row[, 'locus']
        temp_df= filter(d, CHR== as.integer(row[, 'chr']), POS >= as.integer(row[, 'start']), POS<= as.integer(row[, 'stop']))
	print(nrow(temp_df))

        if (nrow(temp_df)== 0) {
        PPH= data.frame(nsnps= 0, PP.H0.abf= 0,PP.H1.abf= 0,  PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, locus= locus1)
        res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, locus= locus1)
        fwrite(PPH, snakemake@output[[1]], sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, locus= locus1)
        fwrite(res, snakemake@output[[2]], sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        print('next')

        } else {
        temp_df= filter(temp_df, SE>0, se>0)

        data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N=temp_df$N, type= 'cc', snp= temp_df$ID, MAF= temp_df$MAF, s= 0.069)

        data2= list(beta= temp_df$beta, varbeta= temp_df$se**2, N=temp_df$TOTALSAMPLESIZE, type= 'quant', snp= temp_df$ID, MAF= temp_df$MAF)
        myres= tryCatch({suppressWarnings(coloc.abf(data1, data2, p1= prior1, p2= prior2, p12= prior12))}, error= function(e) { return(0)}
)
        if (length(myres)==1 ) {
        PPH= data.frame(nsnps= 0, PP.H0.abf= 0, PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, locus= locus1)
        res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, locus= locus1)
        fwrite(res, snakemake@output[[2]], sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        print('next')
        next
        } else {
	print(str(myres))
        PPH= as.data.frame(t(myres[[1]]))
        PPH$locus= locus1
        fwrite(PPH, snakemake@output[[1]], sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        res= myres[[2]]
        res$locus= rep(locus1, nrow(res))
        fwrite(res, snakemake@output[[2]], sep= '\t', row.names=F, col.names= F, quote=F, append= T)

}
}
}




lapply(1:nrow(ld_indep), funk)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import pandas as pd
import numpy as np

def contrast(input):
    all_local_hsq = pd.read_table(input, sep='\t')
    all_local_hsq.loc[all_local_hsq['local_h2g']<0, 'local_h2g'] = 0.0
    # sort local SNP-heritability by descending order and get total hsq and
    # total number of snps
    # sort local SNP-heritability by descending order
    idx = np.argsort(-all_local_hsq['local_h2g'])
    all_local_hsq_sorted= (all_local_hsq['local_h2g'][idx].reset_index(drop=True))
    all_nsnp_sorted= (all_local_hsq['num_snp'][idx].reset_index(drop=True))
    # get total SNP-heritability and total number of SNPs
    all_total_hsq= (np.sum(all_local_hsq['local_h2g']))
    all_total_nsnp= (np.float(np.sum(all_local_hsq['num_snp'])))
    # get the proportions
    all_xval = np.array([])
    all_yval = np.array([])
    all_yerr = np.array([])
    nloci = all_local_hsq_sorted.shape[0]
    # iterate through the loci
    for j in range(nloci):
        hsq_sum = np.sum(all_local_hsq_sorted[0:j+1])
        hsq_frac = hsq_sum / all_total_hsq
        snp_frac = np.sum(all_nsnp_sorted[0:j+1]) / all_total_nsnp
        all_xval = np.append(all_xval, snp_frac)
        all_yval = np.append(all_yval, hsq_frac)
        # get jack knife standard error
        jk_est = []
        for k in range(nloci):
            total_hsq_jk = all_total_hsq-all_local_hsq_sorted[k]
            hsq_sum_jk = hsq_sum
            if k <= j:
                hsq_sum_jk -= all_local_hsq_sorted[k]
            jk_est.append(hsq_sum_jk/total_hsq_jk)
        jk_est = np.array(jk_est)
        se = np.sqrt((nloci-1)*np.mean(np.square(jk_est-hsq_frac)))
        all_yerr = np.append(all_yerr, se)
    d= pd.DataFrame({'local_h2': all_yval, 'nsnp_sorted': all_xval, 'se': all_yerr})
    return(d)


d= contrast(snakemake.input[0])

d.to_csv(snakemake.output[0], header= True, index= False, sep= '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
library(data.table)
library(dplyr)
library(tidyr)
library(broom)
library(MASS)

format_haps= function(hap){
variants= paste('X', hap$chr, hap$pos, hap$ref, hap$eff, sep ='_')
ids= names(hap)[5:ncol(hap)]
hap= as.data.frame(t(hap[, 5:ncol(hap)]))
names(hap)=  variants
hap$IID= ids

return(hap)
}

fets= fread(snakemake@input[[1]])

fets= format_haps(fets)

pheno= fread(snakemake@input[[2]])
covar= fread(snakemake@input[[3]])

pheno= inner_join(pheno, covar, by= 'IID') 

fets= inner_join(fets, pheno, by= 'IID')

print(nrow(fets))

results_list= lapply(names(fets)[grepl('X_', names(fets))], function(snp) {

print(snp)
fets_temp= data.frame(fets)[, c(snp, 'miscarriage', 'MOR_FAAR', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'cohort')]

names(fets_temp)[1]= 'fet'

m1= glm(miscarriage~ fet + cohort + PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data= fets_temp, family= poisson)

temp_df= tidy(m1) %>% filter(row_number() == 2)
temp_df$term= snp
temp_df$model= 'poisson'
temp_df$loglik= logLik(m1)[1]

m2= glm.nb(miscarriage~ fet + cohort + PC1 + PC2 + PC3 + PC4 + PC5 + PC6, data= fets_temp)

temp_df2= tidy(m2) %>% filter(row_number() == 2)
temp_df2$term= snp
temp_df2$model= 'negative-binomial'
temp_df2$loglik= logLik(m2)[1]

temp_df= bind_rows(temp_df, temp_df2)

return(temp_df)

}

)

print('Analyses performed, saving data.')

x= do.call('rbind', results_list)

fwrite(x, snakemake@output[[1]], sep= '\t')
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
knitr::opts_chunk$set(echo = FALSE)
knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

library(dplyr)
library(ggplot2)
library(data.table)
library(cowplot)
library(knitr)
library(kableExtra)
library(tidyr)
library(broom)
library(fitdistrplus)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

knitr::opts_chunk$set(fig.width=7, fig.height=4)
35
pheno= fread(snakemake@input[[1]])
41
42
43
44
45
46
ggplot(pheno, aes(miscarriage)) +
  geom_histogram() +
  theme_cowplot()


table(pheno$miscarriage)
57
58
59
60
61
fit.nb= fitdist(pheno$miscarriage, distr= 'nbinom')
fit.norm= fitdist(pheno$miscarriage, distr= 'norm')
fit.pois= fitdist(pheno$miscarriage, distr= 'pois')

plot(fit.norm)
68
plot(fit.pois)
75
76
print(paste('Mean:', round(mean(pheno$miscarriage), 3)))
print(paste('Variance:', round(var(pheno$miscarriage), 3)))
83
plot(fit.nb)
92
93
94
gwas= fread(snakemake@input[[2]], select= c('CHROM', 'GENPOS', 'ALLELE1', 'ALLELE0', 'BETA', 'SE', 'LOG10P', 'A1FREQ', 'N', 'INFO'))

gwas= filter(gwas, A1FREQ>= 0.01, A1FREQ< 0.99)
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
desat <- function(cols, sat=0.5) {
  X <- diag(c(1, sat, 1)) %*% rgb2hsv(col2rgb(cols))
  hsv(X[1,], X[2,], X[3,])
}

desat_colorBlindBlack8= desat(colorBlindBlack8, 0.5)


don= gwas %>%
  group_by(CHROM)      %>%
  summarise(chr_len= max(GENPOS)) %>%
  mutate(tot= cumsum(as.numeric(chr_len)) - chr_len) %>% # Calculate cumulative position of each chromosome
  #select(-chr_len) %>%
  left_join(gwas, ., by= 'CHROM') %>%
  arrange(CHROM, GENPOS) %>% # Add a cumulative position of each SNP
  mutate(BPcum=GENPOS+tot) %>%
  ungroup()

axisdf = don %>% group_by(CHROM) %>% summarize(center=( max(BPcum) + min(BPcum)) / 2 )
names(axisdf)= c('CHROM', 'center')

HC= -log10(5*10**-8)

don$CHR2= with(don, ifelse(CHROM %% 2 == T, 'odd', 'even'))

ggplot(data= don, aes(x= BPcum, y= LOG10P, colour= CHR2)) +
  geom_hline(yintercept= 0, linewidth= 0.25, colour= 'black') +
  geom_point(size= 0.07) + # Show all points
  theme_cowplot(font_size= 9) +
  scale_colour_manual(values= c(desat_colorBlindBlack8[6], colorBlindBlack8[6]), guide= F) +
  scale_x_continuous(label = c(1:19, '', 21,'', 'X'), breaks= axisdf$center, expand= expansion(0)) +
  scale_y_continuous(breaks= seq(0, round(max(don$LOG10P)) + 1, 5), labels= seq(0, round(max(don$LOG10P)) + 1, 5),
                     expand= expansion(add= c(0, 1))) +
  ylab(expression(-log[10]~pvalue)) +
  xlab('Chromosome') +
  geom_hline(yintercept= HC, linewidth= 0.2, linetype= 2, colour= '#878787') +
  coord_cartesian(clip = "off") 
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
gwas= arrange(gwas, desc(LOG10P))

df= mutate(gwas, exp1= -log10(1:length(LOG10P)/length(LOG10P)))

chisq= qchisq(1-10**-df$LOG10P, 1)

lambda_gc= median(chisq)/qchisq(0.5, 1)

ggplot(filter(df, LOG10P>3), aes(exp1, LOG10P)) +
  geom_point(size= 0.4, color= colorBlindBlack8[2]) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
labs(colour="") +
theme_cowplot(font_size= 10) +
xlab(expression(Expected~-log[10]~pvalue)) +
ylab(expression(Observed~-log[10]~pvalue)) +
geom_text(aes(6, 0), label= paste("lambda", "==", round(lambda_gc, 2)), size= 10/.pt, parse= T)
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#Read SNP data for top variants

x= fread(snakemake@input[[3]])

x$SNP= gsub('_X_', '_23_', x$SNP)
  x$SNP= gsub('X_', '', x$SNP)

  x= separate(x, SNP, into= c('CHROM', 'GENPOS', 'ALLELE0', 'ALLELE1'), sep= '_')
  x$CHROM= as.numeric(x$CHROM)
  x$GENPOS= as.numeric(x$GENPOS)

  x[x$ALLELE1< x$ALLELE0, c('ALLELE0', 'ALLELE1')]= x[(x$ALLELE1< x$ALLELE0), c('ALLELE1', 'ALLELE0')]

  # Flip beta, eaf and alleles to alphabetically higher
  gwas= filter(gwas, GENPOS %in% x$GENPOS)
  gwas$BETA= ifelse(gwas$ALLELE1< gwas$ALLELE0, gwas$BETA * -1, gwas$BETA)
  gwas$A1FREQ= ifelse(gwas$ALLELE1< gwas$ALLELE0, 1 - gwas$A1FREQ, gwas$A1FREQ)

  gwas[gwas$ALLELE1< gwas$ALLELE0, c('ALLELE0', 'ALLELE1')]= gwas[(gwas$ALLELE1< gwas$ALLELE0), c('ALLELE1', 'ALLELE0')]
  gwas= inner_join(gwas, x, by= c('CHROM', 'GENPOS', 'ALLELE0', 'ALLELE1'))

  ggplot(gwas, aes(LOG10P, -log10(hwe_pvalue))) +
      geom_point() + 
  geom_abline(intercept= 0, slope= 1, colour= 'black') + 
  geom_hline(yintercept= 0) + 
  geom_vline(xintercept= 0) +
  xlab('Linear model, -log10(p)') +
  ylab('HWE, -log10(p)') +
  theme_bw()

  ggplot(gwas, aes(BETA, -log10(hwe_pvalue))) +
      geom_point() + 
  geom_abline(intercept= 0, slope= 1, colour= 'black') + 
  geom_hline(yintercept= 0) + 
  geom_vline(xintercept= 0) +
  xlab('BETA, # miscarriages') +
  ylab('HWE, -log10(p)') +
  theme_bw()
212
gwas= filter(gwas, LOG10P>5)
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#Let's plot the known SZ variants on frequency - effect size coordinates
#And draw some power curves there at genome-wide significance threshold
gwas= filter(gwas, A1FREQ>=0.01, A1FREQ< 0.99)
maf = ifelse(gwas$A1FREQ> 0.5, 1-gwas$A1FREQ, gwas$A1FREQ)
beta = abs(gwas$BETA) #effect size on log-odds-ratio scale with positive sign
pw.thresh = 0.8
p.threshold = 5e-8

q = qchisq(p.threshold, df = 1, lower = F) #chi-square value corresp. significance threshold
#matrix of numbers of cases (col1) and controls (col2):
Ns = matrix( c(nrow(pheno), 65000, 300000), ncol = 1 , byrow = T) 
cols=c("green", "cyan", "blue")

f = seq(0.01, 0.5, length = 1000)
b = seq(0, 0.3, length = 1000)


df_list= lapply(1:nrow(Ns), function(set){
  pw = rep(NA, length(b)) #power at each candidate b
  b.for.f = rep(NA,length(f)) #for each f gives the b value that leads to target power
  for(i in 1:length(f)){ 
    pw = pchisq(q, df = 1, ncp = 2*f[i]*(1-f[i])*Ns[set,]*b^2/var(pheno$miscarriage), lower = F)
    b.for.f[i] = b[ min( which(pw > pw.thresh) ) ]
  }
  return(data.frame(maf= f, beta = b.for.f, n= rep(Ns[set,], length(f))))
  legends = c(legends, paste(Ns[set,],collapse = "/") ) #make a "#cases/#controls" tag for legend
}
)

df= do.call('rbind', df_list)
ggplot() +
  geom_point(aes(maf, beta)) +
  geom_line(data= df, aes(maf, beta, colour= factor(n))) +
  theme_bw()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
library("dplyr")
library("cowplot")
library("data.table")
library('showtext')
library(ggplot2)
library(broom)
options(warn=-1)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d= fread(snakemake@input[[1]], header = T)

m1= glm(jaundice~ chr9_136137065_A_G_h1 + chr9_136137065_A_G_h2 + chr9_136137065_A_G_h3 + chr9_136137065_A_G_h4 + cohort + KJONN, 
        d, family= 'binomial')

ci= data.frame(confint(m1))
ci$term= row.names(ci)
names(ci)= c('lo95', 'up95', 'term')

m1= tidy(m1) %>% filter(grepl('chr9', term)) %>% inner_join(., ci, by= 'term')

m2= glm(jaundice~ chr9_136137065_A_G_h1 + chr9_136137065_A_G_h2 + chr9_136137065_A_G_h3 + chr9_136137065_A_G_h4 + cohort + KJONN +
          ABO_incompatibility, d, family= 'binomial')

ci= data.frame(confint(m2))
ci$term= row.names(ci)
names(ci)= c('lo95', 'up95', 'term')

m2= tidy(m2) %>% filter(grepl('chr9', term)) %>% inner_join(., ci, by= 'term')

m2$cond= 'Adjusted'
m1$cond= 'Not adjusted'

m1= rbind(m1, m2)

m1$term= factor(m1$term, levels= rev(c("chr9_136137065_A_G_h1", "chr9_136137065_A_G_h2", "chr9_136137065_A_G_h3",  
                                       "chr9_136137065_A_G_h4")), labels= rev(c('Maternal\ntransmitted', 'Maternal\nnon-transmitted',
                                                                              'Paternal\ntransmitted', 'Paternal\nnon-transmitted')))
m1$cond= factor(m1$cond, levels= c('Not adjusted', 'Adjusted'))

p1= ggplot(m1, aes(x = term, y = estimate, colour= cond)) + 
  geom_hline(aes(yintercept = 0), size = .2, linetype = "dashed") +
  geom_hline(yintercept = log(setdiff(seq(0.6, 1.6, 0.2), 1)), size = .1, linetype = "dashed", colour= 'grey') + 
  geom_errorbar(aes(ymin = lo95, ymax = up95), size = .5, width= 0, position = position_dodge(width=0.3)) +
  theme_cowplot(font_size= 10) + 
  scale_color_manual(values= colorBlindBlack8[c(2, 6)], name= "Maternal-fetal\nABO incompatibility") +
  geom_point(size = 1, position = position_dodge(width= 0.3)) +
  coord_trans(y = scales:::exp_trans()) +
  scale_y_continuous(breaks = log(seq(0.0000001, 1.6000002, 0.2)), labels = seq(0.0, 1.6, 0.2), limits = log(c(0.60, 1.6000002)),
                     expand= expansion(add=0)) +
  ylab('Odds Ratio') +
  theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2),
	legend.position= 'bottom',
	legend.title=element_text(size=8),
	legend.text= element_text(size = 8))


save_plot(snakemake@output[[1]], plot= p1, base_height= 60, base_width= 90, units= 'mm', dpi= 300)

p2= ggplot(m1, aes(x = term, y = estimate, colour= cond)) + 
  geom_hline(aes(yintercept = 0), size = .2, linetype = "dashed") +
  geom_hline(yintercept = log(setdiff(seq(0.6, 1.6, 0.2), 1)), size = .1, linetype = "dashed", colour= 'grey') + 
  geom_errorbar(aes(ymin = lo95, ymax = up95), size = .5, width= 0, position = position_dodge(width=0.3)) +
  theme_cowplot(font_size= 10) + 
  scale_color_manual(values= c(colorBlindBlack8[2], 'white'), guide= 'none') +
  geom_point(size = 1, position = position_dodge(width= 0.3)) +
  coord_trans(y = scales:::exp_trans()) +
  scale_y_continuous(breaks = log(seq(0.0000001, 1.6000002, 0.2)), labels = seq(0.0, 1.6, 0.2), limits = log(c(0.60, 1.6000002)),
                     expand= expansion(add=0)) +
  ylab('Odds Ratio') +
  theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2))

save_plot(snakemake@output[[2]], plot= p2, base_height= 60, base_width= 90, units= 'mm', dpi= 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
library("dplyr")
library("cowplot")
library("data.table")
library('showtext')
library(ggplot2)
library(broom)
options(warn=-1)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

#font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

#font_add_google("Roboto", "Roboto")

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d= fread(snakemake@input[[1]], header = T)

m1= glm(jaundice~ chrX_109792100_C_T_h1 + chrX_109792100_C_T_h2 + chrX_109792100_C_T_h3 + cohort + PC1 + PC2 + PC3 + PC4 + PC5 + PC6  + PC7 + PC8 + PC9 + PC10, filter(d, KJONN== 0), family= 'binomial')

ci= data.frame(confint(m1))
ci$term= row.names(ci)
names(ci)= c('lo95', 'up95', 'term')

m1= tidy(m1) %>% filter(grepl('chrX', term)) %>% inner_join(., ci, by= 'term')

m1$sex= 'Girls'

m2= glm(jaundice~ chrX_109792100_C_T_h1 + chrX_109792100_C_T_h2 + chrX_109792100_C_T_h4 + cohort + PC1 + PC2 + PC3 + PC4 + PC5 + PC6  + PC7 + PC8 + PC9 + PC10, filter(d, KJONN== 1), family= 'binomial')

ci= data.frame(confint(m2))
ci$term= row.names(ci)
names(ci)= c('lo95', 'up95', 'term')

m2= tidy(m2) %>% filter(grepl('chrX', term)) %>% inner_join(., ci, by= 'term')

m2$sex= 'Boys'

m1= rbind(m1, m2)

m1$term= factor(m1$term, levels= rev(c("chrX_109792100_C_T_h1", "chrX_109792100_C_T_h2", "chrX_109792100_C_T_h3",
                                   "chrX_109792100_C_T_h4")), labels= rev(c('Maternal\ntransmitted', 'Maternal\nnon-transmitted',
                                                                        'Paternal\ntransmitted', 'Paternal\nnon-transmitted')))

p1= ggplot(m1, aes(x = term, y = estimate, colour= sex)) +
  geom_hline(aes(yintercept = 0), size = .2, linetype = "dashed") +
  geom_hline(yintercept = log(setdiff(seq(0.4, 1.3, 0.1), 1)), size = .1, linetype = "dashed", colour= 'grey') +
  geom_errorbar(aes(ymin = lo95, ymax = up95), size = .5, width = 0, position = position_dodge(width=0.3)) +
  theme_cowplot(font_size= 10) +
  geom_point(size = 1, position = position_dodge(width=0.3)) +
scale_color_manual(values= colorBlindBlack8[c(2, 6)], name= "Sex") +
  coord_trans(y = scales:::exp_trans()) +
  scale_y_continuous(breaks = log(seq(0.4, 1.3000002, 0.2)), labels = seq(0.4, 1.3, 0.2), limits = log(c(0.4, 1.3000002)),
                     expand= expansion(add=0)) +
  ylab('Odds Ratio') +
  theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2),
        legend.position = 'bottom',
        legend.box = "horizontal")

save_plot(snakemake@output[[1]], plot= p1, base_height= 60, base_width= 90, units= 'mm', dpi= 300)

x= fread(snakemake@input[[2]])

x= inner_join(x, d, by= 'PREG_ID_1724')

names(x)= gsub(':', '_', names(x))

x$dads_X_109792100_C_T= x$dads_X_109792100_C_T * 2
x$fets_X_109792100_C_T= ifelse(x$KJONN== 1, x$fets_X_109792100_C_T * 2, x$fets_X_109792100_C_T)

m1= glm(jaundice~  fets_X_109792100_C_T + moms_X_109792100_C_T + dads_X_109792100_C_T + cohort + KJONN +
          PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
        x, family= 'binomial')

ci= data.frame(confint(m1))
ci$term= row.names(ci)
names(ci)= c('lo95', 'up95', 'term')

m1= tidy(m1) %>% filter(grepl('_X', term)) %>% inner_join(., ci, by= 'term')

m1$term= factor(m1$term, levels= (c("fets_X_109792100_C_T", "moms_X_109792100_C_T", "dads_X_109792100_C_T")), 
                labels= (c('Neonate', 'Maternal', 'Paternal')))

p1= ggplot(m1, aes(x = term, y = estimate)) +
  geom_hline(aes(yintercept = 0), size = .2, linetype = "dashed") +
  geom_hline(yintercept = log(setdiff(seq(0.6, 1.2, 0.1), 1)), size = .1, linetype = "dashed", colour= 'grey') +
  geom_errorbar(aes(ymin = lo95, ymax = up95), size = .5, width = 0, color = colorBlindBlack8[1]) +
  theme_cowplot(font_size= 10) +
  geom_point(size = 1, color = colorBlindBlack8[1]) +
  coord_trans(y = scales:::exp_trans()) +
  scale_y_continuous(breaks = log(seq(0.6, 1.2000002, 0.2)), labels = seq(0.6, 1.2, 0.2), limits = log(c(0.6, 1.2000002)),
                     expand= expansion(add=0)) +
  ylab('Odds Ratio') +
  theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2))

save_plot(snakemake@output[[2]], plot= p1, base_height= 60, base_width= 90, units= 'mm', dpi= 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
library(data.table)
library(dplyr)
library(broom)
library(ggplot2)
library(showtext)
library(cowplot)
library(ggrepel)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

d= fread(snakemake@input[[1]])

gene_names= data.frame(gene_id= c('ENSG00000167165','ENSG00000241119','ENSG00000241635',
                                  'ENSG00000242366','ENSG00000242515','ENSG00000244122','ENSG00000244474'), gene_symbol= c('UGT1A6', 'UGT1A9', 'UGT1A1', 'UGT1A8', 'UGT1A10', 'UGT1A7', 'UGT1A4'),

color_blind= colorBlindBlack8[2:8])

gene_names= filter(gene_names, gene_symbol == snakemake@wildcards[['UGT_genes']])

col_blind= unique(gene_names$color_blind)

d= inner_join(d, gene_names, by= c('gene'= 'gene_id'))

ugt1a1= d %>% arrange(PP.H4.abf)
ugt1a1= filter(ugt1a1, row_number() >= n() - 10)

ugt1a1$eqtl_data= gsub('.*ge_', '', ugt1a1$eqtl_data)
ugt1a1$eqtl_data= gsub('.*microarray_', '', ugt1a1$eqtl_data)
ugt1a1$eqtl_data= gsub('_', '\n', ugt1a1$eqtl_data)

ugt1a1$eqtl_data= factor(ugt1a1$eqtl_data, levels= unique(ugt1a1$eqtl_data))

p1= ggplot(ugt1a1) +
  # Make custom panel grid
  geom_hline(
    aes(yintercept = y), 
    data.frame(y = c(0:4) * 0.25),
    color = "lightgrey", size= 0.5
  ) + 
  # Add bars to represent the cumulative track lengths
  # str_wrap(region, 5) wraps the text so each line has at most 5 characters
  # (but it doesn't break long words!)
  geom_col(
    aes(
      x = eqtl_data,
      y = PP.H4.abf,
      fill = PP.H4.abf
    ),
    position = "dodge2",
    show.legend = F,
    alpha = .9
  ) +
  geom_segment(
    aes(
      x = eqtl_data,
      y = 0,
      xend = eqtl_data,
      yend = 1
    ),
    linetype = "dashed",
    color = "gray12"
  ) +
   # Make it circular!
  coord_polar(clip = 'off') +
    theme(
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    axis.text.y = element_blank(),
    # Use gray text for the region names
    axis.text.x = element_text(color = "gray12", size = 7),
    # Move the legend to the bottom
    legend.position = "bottom",
    panel.background = element_rect(fill = "white", color = "white"),
    panel.grid = element_blank(),
    panel.grid.major.x = element_blank()) +
  scale_y_continuous(expand= c(0,0), limits = c(-1, 1), breaks = 0:4 * 0.25) +
  annotate(
    x = 0.75, 
    y = c(0:4 * 0.25), 
    label = c(0:4 * 0.25), 
    geom = "text", 
    color = "gray12",
    size= 2) +
  scale_fill_gradient2(low= colorBlindBlack8[1], high= col_blind) +
  annotate(
    x = 0, 
    y = -1, 
    label = snakemake@wildcards[['UGT_genes']], 
    geom = "text", 
    color = "gray12",
    fontface= 'italic', 
    size= 8/.pt)

save_plot(snakemake@output[[1]], p1, base_height= 60, base_width= 60, units= 'mm', dpi= 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
library(data.table)
library(dplyr)
library(broom)
library(ggplot2)
library(showtext)
library(cowplot)
library(ggrepel)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d= fread(snakemake@input[[1]], h= T)
d$pheno= 'Miscarriage'

x= fread(snakemake@input[[2]], h= T)

x$pheno= 'Height'

z= fread(snakemake@input[[3]], h=T)

z$pheno= 'Bilirubin'

#d= fread('/mnt/work/pol/neo-jaundice/results/HESS/contrast-polygenicity/jaundice-fets.txt', h= T)

d= rbind(d, x)
d= rbind(d, z)
d$local_h2= ifelse(d$local_h2<0, 0, d$local_h2)
d$pheno= factor(d$pheno, levels= c('Miscarriage', 'Height', 'Bilirubin'))

pheno_colours= colorBlindBlack8[c(2, 4,8)]

p1= ggplot(d, aes(nsnp_sorted * 100, local_h2 * 100)) +
geom_ribbon(aes(ymin= (local_h2 - se) * 100, ymax= (local_h2 + se) * 100, colour= pheno, fill= pheno), alpha= 0.2, size= 0.1) +
  geom_line(aes(colour= pheno), size= 0.4) +
  scale_colour_manual(values= pheno_colours, guide= 'none') +
  scale_fill_manual(values= pheno_colours, guide= 'none') +
theme_cowplot(font_size= 10) + 
  xlab('Proportion of genome, %') + 
  ylab('Proportion of SNP-heritability, %') +
  scale_y_continuous(limits= c(0, 100.01), expand= expansion(add= c(0, 0))) +
  scale_x_continuous(limits= c(0, 100.01), expand= expansion(add= c(0, 0))) +
  theme(axis.text.x= element_text(size= 8),
        axis.ticks.x= element_line(color = "black", size = 0.2),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2),
        plot.margin = margin(t=5,r= 5, b=5, l=5, "mm")) +
  geom_abline(slope= 1, intercept= 0, size= 0.2)

save_plot(snakemake@output[[1]], plot= p1, base_height= 65, base_width= 65, units= 'mm', dpi= 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
library("dplyr")
library("cowplot")
library("data.table")
library('showtext')
library(ggplot2)
library(broom)
options(warn=-1)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

#font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

#font_add_google("Roboto", "Roboto")

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d= fread(snakemake@input[[1]], h= T)

x= group_by(d, round(fets_UGT_P24T)) %>% summarize(p= mean(jaundice, na.rm= T), lo95= binom.test(sum(jaundice), n())$conf.int[1], 
                                                   up95= binom.test(sum(jaundice), n())$conf.int[2])

names(x)[1]= 'UGT_missense'

x$UGT_missense= factor(x$UGT_missense, levels= c('0', '1', '2'), labels= c('CC\n(n = 20587)', 'CA\n(n = 2779)', 'AA\n(n = 95)'))

p1= ggplot(data= x, aes(UGT_missense, p*100, alpha= UGT_missense)) +
  geom_col(fill = colorBlindBlack8[2], colour= colorBlindBlack8[1]) +
  geom_errorbar( aes(x= UGT_missense, ymin= lo95*100, ymax= up95*100), colour= colorBlindBlack8[7], alpha=1, width= 0.08, 
                 linewidth= 0.7) +
  theme_cowplot(font_size= 10) + 
  scale_alpha_manual(values= c(0.5, 0.5, 1) , guide= 'none') +
    xlab('rs6755571 genotype') + 
  ylab('Jaundice risk, %') +
  scale_y_continuous(limits= c(0, 10), expand= expansion(add= c(0, 0.0002))) +
  theme(axis.text.x= element_text(size= 8, margin(t= 0, r= 0, b= 2, l= 0, unit= 'mm')),
        axis.title.x= element_text(margin(t= 2, r= 0, b= 0, l= 0, unit= 'mm')),
        axis.ticks.x= element_blank(),
        axis.line = element_line(color = "black", linewidth = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2),
        plot.background = element_rect(colour = colorBlindBlack8[1], fill= NA, linewidth= 0.2))



save_plot(snakemake@output[[1]], plot= p1, base_height= 60, base_width= 70, units= 'mm', dpi= 300)

eaf=  group_by(d, jaundice) %>% summarize(EAF= sum(fets_UGT_P24T) / (n() * 2), 
                                     lo95= binom.test(as.integer(sum(fets_UGT_P24T)), (n() * 2))$conf.int[1],
                                     up95= binom.test(as.integer(sum(fets_UGT_P24T)), (n() * 2))$conf.int[2])

eaf$jaundice= factor(eaf$jaundice, levels= c('0', '1'), labels = c('Controls\n(n = 21828)', 'Cases\n(n = 1633)'))

p2= ggplot(data= eaf, aes(factor(jaundice), EAF * 100)) +
  geom_pointrange(aes(ymin= lo95*100, ymax= up95*100), colour = colorBlindBlack8[2], size= 0.7, fatten= 0.9) +
    theme_cowplot(font_size= 10) + 
    xlab('Jaundice') + 
  ylab('Effect allele frequency') +
  scale_y_continuous(limits= c(0, 8), expand= expansion(add= c(0, 0.0002)), position = "right") +
  theme(axis.text.x= element_text(size= 8, margin(t= 0, r= 0, b= 2, l= 0, unit= 'mm')),
        axis.title.x= element_text(margin(t= 2, r= 0, b= 0, l= 0, unit= 'mm')),
        axis.ticks.x= element_blank(),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2))


save_plot(snakemake@output[[2]], plot= p2, base_height= 60, base_width= 50, units= 'mm', dpi= 300)


d= fread(snakemake@input[[2]], header = T)

m1= glm(jaundice~ chr2_234627536_C_A_MT + chr2_234627536_C_A_MnT + chr2_234627536_C_A_PT + chr2_234627536_C_A_PnT + cohort + KJONN, 
        d, family= 'binomial')

ci= data.frame(confint(m1))
ci$term= row.names(ci)
names(ci)= c('lo95', 'up95', 'term')

m1= tidy(m1) %>% filter(grepl('chr2', term)) %>% inner_join(., ci, by= 'term')

m1$term= factor(m1$term, levels= rev(c("chr2_234627536_C_A_MT", "chr2_234627536_C_A_MnT", "chr2_234627536_C_A_PT",  
                                   "chr2_234627536_C_A_PnT")), labels= rev(c('Maternal\ntransmitted', 'Maternal\nnon-transmitted', 
                                                                        'Paternal\ntransmitted', 'Paternal\nnon-transmitted')))

p3= ggplot(m1, aes(x = term, y = estimate)) + 
  geom_hline(aes(yintercept = 0), size = .2, linetype = "dashed") +
  geom_hline(yintercept = log(setdiff(seq(0.0, 1.3, 0.2), 1)), size = .1, linetype = "dashed", colour= 'grey') + 
  geom_errorbar(aes(ymin = lo95, ymax = up95), size = .5, width = 0, color = colorBlindBlack8[1]) +
  theme_cowplot(font_size= 10) + 
  geom_point(size = 1, color = colorBlindBlack8[1]) +
  coord_trans(y = scales:::exp_trans()) +
  scale_y_continuous(breaks = log(seq(0.0000001, 1.4000002, 0.2)), labels = seq(0.0, 1.4, 0.2), limits = log(c(0.070, 1.4000002)),
                     expand= expansion(add=0)) +
  ylab('Odds Ratio') +
  theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2))

save_plot(snakemake@output[[3]], plot= p3, base_height= 60, base_width= 90, units= 'mm', dpi= 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
library("dplyr")
library("tidyr")
library("cowplot")
library("ggrepel")
library("data.table")
library('showtext')
options(warn=-1)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

desat <- function(cols, sat=0.5) {
  X <- diag(c(1, sat, 1)) %*% rgb2hsv(col2rgb(cols))
  hsv(X[1,], X[2,], X[3,])
}

desat_colorBlindBlack8= desat(colorBlindBlack8, 0.5)

#d= fread('/mnt/work/pol/neo-jaundice/results/GWAS/delivery/MoBa-GWAS-jaundice-fets.txt.gz', h= T, select= c('ID', 'CHR', 'POS', 'LOG10P', 'nearestGene', 'rsid'))
d= fread(snakemake@input[[1]], h= T, select= c('ID', 'CHR', 'POS', 'LOG10P', 'nearestGene', 'rsid'))
d$pheno= 'Neonate'

d$GENE= ifelse(d$rsid== 'rs17868338', 'UGT1A*', ifelse(d$ID== '23:109792100:C:T', 'CHRDL1', ''))
d= filter(d, !duplicated(ID))

#x=  fread('/mnt/work/pol/neo-jaundice/results/GWAS/delivery/MoBa-GWAS-jaundice-moms.txt.gz', h= T, select= c('ID', 'CHR', 'POS', 'LOG10P', 'nearestGene', 'rsid'))
x= fread(snakemake@input[[2]], h= T, select= c('ID', 'CHR', 'POS', 'LOG10P', 'nearestGene', 'rsid'))
x$pheno= 'Mother'

x$GENE= ifelse(x$rsid == 'rs687621', 'ABO', ifelse(x$rsid== 'rs17868336', 'UGT1A*', ifelse(x$ID == '23:109840240:C:T', 'CHRDL1', '')))
x= filter(x, !duplicated(ID))

d= arrange(d, POS)

x= arrange(x, POS)


don= d %>%
    group_by(CHR)      %>%
    summarise(chr_len= max(POS)) %>%
    mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(d, ., by= 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate(BPcum=POS+tot) %>%
         ungroup()

 don1= x %>%
    group_by(CHR)      %>%
    summarise(chr_len= max(POS)) %>%
    mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(x, ., by= 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate(BPcum= POS+tot) %>%
         ungroup()

don= rbind(don, don1)

axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum)) / 2 )
  names(axisdf)= c('CHR', 'center')

HC= -log10(5*10**-8)

#font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

#font_add_google("Roboto", "Roboto")

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

don$logpval= with(don, ifelse(pheno== 'Mother', -LOG10P, LOG10P))

don$CHR2= with(don, ifelse(CHR %% 2, 'odd', 'even'))
don$CHR2= paste0(don$CHR2, don$pheno, sep='_')

p1= ggplot(data= don, aes(x= BPcum, y= logpval, colour= CHR2)) +
   geom_hline(yintercept= 0, size= 0.25, colour= 'black') +
  geom_point(size= 0.07) + # Show all points
  theme_cowplot(font_size= 10) +
  scale_colour_manual(values= c(desat_colorBlindBlack8[6], desat_colorBlindBlack8[2], colorBlindBlack8[6], 
                                colorBlindBlack8[2]), guide= F) +
    scale_x_continuous(label = c(1:19, '', 21,'', 'X'), breaks= axisdf$center, expand= expansion(0)) +
  scale_y_continuous(breaks= seq(-10, 60, 10), labels= c(10, seq(0, 60, 10))) +
  ylab(expression(-log[10]~pvalue)) +
  xlab('Chromosome') +
  geom_hline(yintercept= c(HC, -HC), size= 0.2, linetype= 2, colour= '#878787') +
  coord_cartesian(clip = "off") +
  geom_text_repel(data= filter(don, GENE!= ''), aes(x= BPcum, y= logpval, label= GENE),
                  colour= c(colorBlindBlack8[2], colorBlindBlack8[2], colorBlindBlack8[6], colorBlindBlack8[6], colorBlindBlack8[6]),
                  alpha= 1,
                  size= 9/ .pt,
                  force_pull= 0, # do not pull toward data points
                  force= 0.1,
                  nudge_y      =  ifelse(filter(don, GENE!= '') %>% pull(logpval)>0, 1, -1),
                  direction    = "both",
                  hjust        = 1,
                  vjust=  0.5,
	            	  box.padding= 0,
  		            angle= 0,
                  segment.size = 0.1,
                  segment.square= TRUE,
                  segment.inflect= FALSE,
                  segment.colour= colorBlindBlack8[8],
                  segment.linetype = 4,
                  ylim = c(-Inf, 60),
                  xlim = c(-Inf, Inf),
	            	  fontface = "italic") +
  theme(legend.position= 'none',
	plot.margin = unit(c(t= 0, r=0, b= 0, l=0), 'cm'),
        text= element_text(family= "Roboto", size= 10),
	axis.line= element_line(size= 0.1),
	axis.ticks= element_line(size= 0.1))

save_plot(snakemake@output[[1]], plot= p1, base_height= 90, base_width= 180, units= 'mm', dpi= 300, type= 'cairo')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
library("dplyr")
library("tidyr")
library("cowplot")
library("ggrepel")
library("data.table")
library('showtext')
options(warn=-1)
library(MASS)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

desat <- function(cols, sat=0.5) {
  X <- diag(c(1, sat, 1)) %*% rgb2hsv(col2rgb(cols))
  hsv(X[1,], X[2,], X[3,])
}

desat_colorBlindBlack8= desat(colorBlindBlack8, 0.5)

d= fread(snakemake@input[[1]], h= T, select= c('ID', 'CHR', 'POS', 'LOG10P', 'nearestGene', 'rsid'))

if (snakemake@wildcards[['sample']] == 'fets') {

d$GENE= ifelse(d$rsid== 'rs17868338', 'UGT1A*', ifelse(d$ID== '23:109792100:C:T', 'CHRDL1', ''))

} else if (snakemake@wildcards[['sample']] == 'moms') { 
d$GENE= ifelse(d$rsid == 'rs687621', 'ABO', ifelse(d$rsid== 'rs17868336', 'UGT1A*', ifelse(d$ID == '23:109840240:C:T', 'CHRDL1', '')))

} else{

d$GENE= ifelse(d$rsid== 'rs149247216', 'UGT1A*', '')

}

d= filter(d, !duplicated(ID))




don= d %>%
    group_by(CHR)      %>%
    summarise(chr_len= max(POS)) %>%
    mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(d, ., by= 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate(BPcum=POS+tot) %>%
         ungroup()

axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum)) / 2 )
  names(axisdf)= c('CHR', 'center')

HC= -log10(5*10**-8)


showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


don$CHR2= with(don, ifelse(CHR %% 2 == T, 'odd', 'even'))

p1= ggplot(data= don, aes(x= BPcum, y= LOG10P, colour= CHR2)) +
   geom_hline(yintercept= 0, size= 0.25, colour= 'black') +
  geom_point(size= 0.07) + # Show all points
  theme_cowplot(font_size= 9) +
  scale_colour_manual(values= c(desat_colorBlindBlack8[6], colorBlindBlack8[6]), guide= F) +
  scale_x_continuous(label = c(1:19, '', 21,'', 'X'), breaks= axisdf$center, expand= expansion(0)) +
  scale_y_continuous(breaks= seq(0, round(max(don$LOG10P)) + 1, 5), labels= seq(0, round(max(don$LOG10P)) + 1, 5), expand= expansion(add= c(0, 1))) +
  ylab(expression(-log[10]~pvalue)) +
  xlab('Chromosome') +
  geom_hline(yintercept= HC, size= 0.2, linetype= 2, colour= '#878787') +
  coord_cartesian(clip = "off") +
  geom_text_repel(data= filter(don, GENE!= ''), aes(x= BPcum, y= LOG10P, label= GENE),
                  colour= c(colorBlindBlack8[6]),
                  alpha= 1,
                  size= 6/ .pt,
                  force_pull= 0, # do not pull toward data points
                  force= 0.1,
                  direction    = "both",
                  hjust        = 1,
                  vjust=  0.5,
	            	  box.padding= 0,
  		            angle= 0,
                  segment.size = 0.1,
                  segment.square= TRUE,
                  segment.inflect= FALSE,
                  segment.colour= colorBlindBlack8[8],
                  segment.linetype = 4,
                  ylim = c(0, round(max(don$LOG10P))),
                  xlim = c(0, Inf)) +
  theme(legend.position= 'none',
	plot.margin = unit(c(t= 0, r=0, b= 0, l=0), 'cm'),
        text= element_text(family="Roboto", size= 9),
	axis.line= element_line(size= 0.1),
	axis.ticks= element_line(size= 0.1))

save_plot(snakemake@output[[1]], plot= p1, base_height= 90, base_width= 180, units= 'mm', dpi= 300, type= 'cairo')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
library("dplyr")
library("cowplot")
library("data.table")
library('showtext')
library(ggplot2)
library(broom)
options(warn=-1)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d= fread(snakemake@input[[1]], header = T)

m1= glm(jaundice~ chr2_234638006_A_G_h1 + chr2_234638006_A_G_h2 + chr2_234638006_A_G_h3 + chr2_234638006_A_G_h4 + cohort + KJONN, 
        d, family= 'binomial')

ci= data.frame(confint(m1))
ci$term= row.names(ci)
names(ci)= c('lo95', 'up95', 'term')

m1= tidy(m1) %>% filter(grepl('chr2', term)) %>% inner_join(., ci, by= 'term')

m1$term= factor(m1$term, levels= rev(c("chr2_234638006_A_G_h1", "chr2_234638006_A_G_h2", "chr2_234638006_A_G_h3",  
                                       "chr2_234638006_A_G_h4")), labels= rev(c('Maternal\ntransmitted', 'Maternal\nnon-transmitted', 
                                                                                 'Paternal\ntransmitted', 'Paternal\nnon-transmitted')))

p1= ggplot(m1, aes(x = term, y = estimate)) + 
  geom_hline(aes(yintercept = 0), size = .2, linetype = "dashed") +
  geom_hline(yintercept = log(setdiff(seq(0.0, 1.3, 0.2), 1)), size = .1, linetype = "dashed", colour= 'grey') + 
  geom_errorbar(aes(ymin = lo95, ymax = up95), size = .5, width = 0, color = colorBlindBlack8[2]) +
  theme_cowplot(font_size= 10) + 
  geom_point(size = 1, color = colorBlindBlack8[2]) +
  coord_trans(y = scales:::exp_trans()) +
  scale_y_continuous(breaks = log(seq(0.0000001, 1.4000002, 0.2)), labels = seq(0.0, 1.4, 0.2), limits = log(c(0.040, 1.4000002)),
                     expand= expansion(add=0)) +
  ylab('Odds Ratio') +
  theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2))

save_plot(snakemake@output[[1]], plot= p1, base_height= 60, base_width= 90, units= 'mm', dpi= 300)

m1= glm(jaundice~ chr2_234522619_A_C_h1 + chr2_234522619_A_C_h2 + chr2_234522619_A_C_h3 + chr2_234522619_A_C_h4 + cohort + KJONN, 
        d, family= 'binomial')

ci= data.frame(confint(m1))
ci$term= row.names(ci)
names(ci)= c('lo95', 'up95', 'term')

m1= tidy(m1) %>% filter(grepl('chr2', term)) %>% inner_join(., ci, by= 'term')

m1$term= factor(m1$term, levels= rev(c("chr2_234522619_A_C_h1", "chr2_234522619_A_C_h2", "chr2_234522619_A_C_h3",  
                                       "chr2_234522619_A_C_h4")), labels= rev(c('Maternal\ntransmitted', 'Maternal\nnon-transmitted', 
                                                                                'Paternal\ntransmitted', 'Paternal\nnon-transmitted')))

p2= ggplot(m1, aes(x = term, y = estimate)) + 
  geom_hline(aes(yintercept = 0), size = .2, linetype = "dashed") +
  geom_hline(yintercept = log(setdiff(seq(0.0, 1.3, 0.2), 1)), size = .1, linetype = "dashed", colour= 'grey') + 
  geom_errorbar(aes(ymin = lo95, ymax = up95), size = .5, width = 0, color = colorBlindBlack8[2]) +
  theme_cowplot(font_size= 10) + 
  geom_point(size = 1, color = colorBlindBlack8[2]) +
  coord_trans(y = scales:::exp_trans()) +
  scale_y_continuous(breaks = log(seq(0.0000001, 1.4000002, 0.2)), labels = seq(0.0, 1.4, 0.2), limits = log(c(0.070, 1.4000002)),
                     expand= expansion(add=0)) +
  ylab('Odds Ratio') +
  theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2))

save_plot(snakemake@output[[2]], plot= p2, base_height= 60, base_width= 90, units= 'mm', dpi= 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
library(data.table)
library(dplyr)
library(broom)
library(ggplot2)
library(showtext)
library(cowplot)
library(ggh4x)
library(ggrepel)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

SNIN2 = "~/Documents/results/nj/eqtls/UGT1_fets.txt"
SNIN3 = "resources/bilirubin/delivery/total-bilirubin-GWAS.txt.gz"
SNIN4 = "results/LD/delivery/UGT1A.ld"


# -----------------------
# Plot locusZoom for fetal vs adult scores
# and frequency of jaundice and 95%CI by polygenic score


# raw gwas results:
resR = fread(snakemake@input[[1]])
resR = filter(resR, CHR== 2, POS > 234.45e6, POS < 234.75e6) %>% filter(!duplicated(ID))
minpos = min(resR$POS)
maxpos = max(resR$POS)

# adult summaries:
resA = fread(snakemake@input[[2]])
resA = filter(resA, CHROM==2, POS > minpos, POS < maxpos)

# convert string p-vals:
#pvals = strsplit(resA$pvalue, split="e")
#pvals = lapply(pvals, function(x) if(length(x)==1) log10(as.numeric(x)) 
#               else log10(as.numeric(x[[1]])) + as.numeric(x[[2]]))
resA$LOG10P= -1*(pnorm(-abs(resA$BETA / resA$SE),log.p = T)*1/log(10) + log10(2))
#resA$LOG10P = -as.numeric(pvals)

resA = filter(resA, POS %in% resR$POS)

topsnpA = resA$RSID[which.max(resA$LOG10P)]  # rs887829

res = bind_rows("Adult"=resA, "Neonatal"=resR, .id="FA")

# LD
ld = read.table(snakemake@input[[3]], h=T)
ld = filter(ld, SNP_A==topsnpA | SNP_B==topsnpA)
ld$POS = ifelse(ld$SNP_A==topsnpA, ld$BP_B, ld$BP_A)
ld = ld[,c("POS", "R2")]
ld = bind_rows(ld, data.frame(POS=resA$POS[resA$RSID==topsnpA], R2=1))

res = inner_join(res, ld, by="POS")

res = filter(res, !is.na(R2))
# plot
panel_labels = group_by(res, FA) %>% summarize(LOG10P=pmax(max(LOG10P),65)*0.95)
panel_scales = list(
  scale_y_continuous(),
  scale_y_continuous(limits=c(0, 65))
)

# snps showing LD w/ rs887829
p_FA = ggplot(res, aes(x=POS/1e6, y=LOG10P)) + 
  geom_point(aes(col=R2), size= 0.6) +
  geom_point(data=filter(res, POS==resA$POS[resA$RSID==topsnpA]),
             col="purple", pch=18, size=2.5) +
  facet_grid(FA~., scales="free_y") +
  geom_text(data=panel_labels, aes(label=FA, x=234.47), hjust=0) + 
  scale_color_gradient(low="#5782AD", high="#ED1330", name=expression(R^2)) +
  scale_x_continuous(expand = c(0,0), limits=c(234.46, 234.72)) +
  facetted_pos_scales(y=panel_scales) + 
  theme_bw() + xlab("Position, Mbp") + ylab(expression(-log[10]~p)) + 
  theme(text= element_text(family= "Arial", size= 10),
	panel.grid.major.x=element_blank(), 
        panel.grid.minor=element_blank(),
        panel.background = element_rect(fill=NA, colour="grey60"),
        axis.ticks = element_line(colour="grey30", linewidth = 0.1),
        strip.text = element_blank(),
        axis.line.x=element_line(colour="grey30", linewidth= 0.4),
	legend.key.size= unit(4, 'mm'),
        legend.title = element_text(size= 6), #change legend title font size
        legend.text = element_text(size=6),
	axis.text= element_text(color= 'black'))

# ---- PGS subfigure ----

d= fread(snakemake@input[[4]])

d$PGS_cat= ntile(d$fets_jaundice, 10)
d$PGS_cat_nochr2= ntile(d$fets_jaundice_nochr2, 10)

x= group_by(d, PGS_cat) %>%
  summarize(p= mean(jaundice, na.rm= T), lo95= binom.test(sum(jaundice), n())$conf.int[1], 
            up95= binom.test(sum(jaundice), n())$conf.int[2])

names(x)[1]= 'PGS'

x$PGS= factor(x$PGS)
x$chr= 'Full'

x2= group_by(d, PGS_cat_nochr2) %>%
  summarize(p= mean(jaundice, na.rm= T), lo95= binom.test(sum(jaundice), n())$conf.int[1], 
            up95= binom.test(sum(jaundice), n())$conf.int[2])

names(x2)[1]= 'PGS'
x2$chr= 'No UGT1A*'

x2$PGS= factor(x2$PGS)

x= rbind(x, x2)

p2= ggplot(data= x, aes(PGS, p*100, colour= chr, group= chr)) +
  geom_point(position= position_dodge2(width= 0.5)) +
  geom_linerange( aes(x= PGS, ymin= lo95*100, ymax= up95*100, colour= chr), alpha= 1, size= 0.7, 
                  position= position_dodge2(width= 0.5)) +
  scale_color_manual(values= colorBlindBlack8[c(6, 2)], name=NULL, labels= c('Full', expression(Excluding~italic('UGT1A*')~region))) +
  theme_cowplot(font_size= 10) + 
  xlab('Adult bilirubin polygenic\nscore decile') + 
  ylab('Jaundice prevalence, %') +
  scale_y_continuous(limits= c(0, 12), expand= expansion(add= c(0, 0.0002))) +
  theme(text= element_text(size= 10),
        axis.ticks.x= element_blank(),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2),
        legend.position = c(0.1, 0.1),
	legend.text.align = 0,
	axis.text= element_text(color= 'black'))

p3= plot_grid(p_FA, p2, labels="AUTO", rel_widths = c(6,4), rel_heights= c(6, 4))

save_plot(snakemake@output[[1]], plot= p3, base_height= 80, base_width= 180, units= 'mm', dpi= 300)


# -----------------
# Plot density of polygenic score

p1= ggplot() +
  geom_density(data= d, aes(fets_jaundice), fill= colorBlindBlack8[6], size= 0.1, alpha= 0.6, color= 'black') +
  geom_density(data= d, aes(fets_jaundice_nochr2), fill= colorBlindBlack8[2], size= 0.1, alpha= 0.6, color= 'black') +
  theme_cowplot(font_size= 10) + 
  xlab('Polygenic score of adult bilirubin levels') + 
  ylab('Denstiy') +
  scale_y_continuous(limits= c(0, 5.5), expand= expansion(add= c(0, 0.0002))) +
  theme(axis.text.x= element_text(size= 8),
        axis.ticks.x= element_blank(),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2))

save_plot(snakemake@output[[2]], plot= p1, base_height= 60, base_width= 110, units= 'mm', dpi= 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
library("dplyr")
library(ggplot2)
library(cowplot)
library("data.table")
library('showtext')
options(warn=-1)


d= fread(snakemake@input[[1]], h= T, select= c('LOG10P', 'ID'))


colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


d= arrange(d, desc(LOG10P))
d= d[!duplicated(d$ID), ]

df= mutate(d, exp1= -log10(1:length(LOG10P)/length(LOG10P)))

chisq= qchisq(1-10**-df$LOG10P, 1)

lambda_gc= median(chisq)/qchisq(0.5,1)

p1= ggplot(df, aes(exp1, LOG10P)) +
  geom_point(size= 0.4, color= colorBlindBlack8[2]) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
labs(colour="") +
theme_cowplot(font_size= 10) +
xlab(expression(Expected~-log[10]~pvalue)) +
ylab(expression(Observed~-log[10]~pvalue)) +
geom_text(aes(6, 0), label= paste("lambda", "==", round(lambda_gc, 2)), size= 10/.pt, parse= T)

ggsave(snakemake@output[[1]], plot= p1, dpi = 300, width= 60, height= 60, units= 'mm')
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggh4x)
library(data.table)

# all in hg19

bed = read.table(gzfile(snakemake@input[[1]]), sep="\t", h=F, quote="")

colnames(bed) = c("CHR", "predictor", "type", "start", "end", "V6", "strand", "V8", "ANN")

bed = filter(bed, end>234.45e6, start<234.75e6)

nrow(bed)

bed = filter(bed, type %in% c("gene", "exon", "mRNA"))

# assign exons to genes
bed_exons = filter(bed, type=="exon")
bed_tran = filter(bed, type=="mRNA")
bed_genes = filter(bed, type=="gene")
bed_exons$transcript = gsub(".*transcript:(.*?);.*", "\\1", bed_exons$ANN)
bed_tran$transcript = gsub(".*transcript_id=(.*?);.*", "\\1", bed_tran$ANN)
bed_tran$gene = gsub(".*Parent=gene:(.*?);.*", "\\1", bed_tran$ANN)
bed_genes$gene = gsub(".*ID=gene:(.*?);.*", "\\1", bed_genes$ANN)
bed_genes$name = gsub(".*Name=(.*?);.*", "\\1", bed_genes$ANN)
bed_genes = filter(bed_genes, name!="AC114812.8")

bed_genes$y = seq_along(bed_genes$name)
bed_genes$y[bed_genes$name=="USP40"] = 2
bed_genes$y[bed_genes$name=="HJURP"] = 3
bed_genes$y[bed_genes$name=="MROH2A"] = 2
bed_genes$y = -2*bed_genes$y
#bed_genes$y_label = ifelse(bed_genes$name=="UGT1A1", bed_genes$y, bed_genes$y-3)
bed_genes$y_label = ifelse(grepl('UGT', bed_genes$name), bed_genes$y -4, bed_genes$y)
#bed_genes$x_label = ifelse(bed_genes$name=="UGT1A1", bed_genes$start+50e3, bed_genes$start)/1e6
bed_genes$x_label = ifelse(bed_genes$name=="UGT1A1", bed_genes$start, bed_genes$start)/1e6
bed_genes$x_label[bed_genes$name=="MROH2A"] = bed_genes$x_label[bed_genes$name=="MROH2A"]

# Note: exons from aberrant transcripts etc are dropped here
bed = inner_join(bed_exons[,c("type", "start", "end", "transcript")],
                 bed_tran[,c("transcript", "gene")], by="transcript")
bed = inner_join(bed, bed_genes[,c("gene", "y", "y_label")], by="gene")
bed$source = "Neonatal\njaundice"
bed_genes$source = "Neonatal\njaundice"

# load actual data
pgwas = fread(snakemake@input[[2]])
pgwas= pgwas %>% filter(substr(MarkerName, 1, 2) == "2:")

pgwas = separate(pgwas, MarkerName, into= c('CHR', 'POS'), sep= ':')
names(pgwas)[11]= 'pvalue'
pgwas$LOG10P= -log10(pgwas$pvalue)

pgwas$CHR= as.numeric(pgwas$CHR)
pgwas$POS= as.numeric(pgwas$POS)

nrow(pgwas)

ld = read.table(snakemake@input[[3]], h=T)
ld= filter(ld, BP_A== 234627536 | BP_B == 234627536)
ld$BP_A= ifelse(ld$BP_A== 234627536, ld$BP_B, ld$BP_A)

ld= select(ld, CHR_A, BP_A, R2)

ld= rbind(ld, data.frame(CHR_A= 2, BP_A= 234627536, R2= 1))

names(ld)= c('CHR', 'POS', 'R2')
pall = bind_rows("Neonatal\njaundice"=pgwas, .id="source") %>%
  filter(POS > 234.45e6, POS < 234.75e6)

pall = left_join(pall, ld[, c("POS", "R2")])


# plot
panel_labels = group_by(pall, source) %>% summarize(LOG10P=max(LOG10P)*0.9)
pos_break_fn = function(x) if(max(x)<10) { seq(0,10,2) } else { seq(0, 15, 5) }

p1= pall %>%
filter(!is.na(R2)) %>%
  ggplot(aes(x= POS/1e6, y=LOG10P)) +
    geom_point(size= 0.6, aes(col= R2)) +
  geom_point(data= filter(pall, POS== 234627536), col="purple", pch=18, size= 3) +
  facet_grid2(source~., scales="free_y", axes="all", remove_labels="x") +
  geom_segment(data= bed, aes(x=start/1e6, xend=end/1e6, y=y, yend=y), size=3, col="darkblue") +
  geom_segment(data= bed_genes, aes(x=start/1e6, xend=end/1e6, y=y, yend=y), size=0.5, col="darkblue") +
  geom_text(data= panel_labels, aes(x=234.44, label=source), hjust=0, size=3.5) +
  geom_text(data= filter(bed_genes, !name %in% c("UGT1A5", "UGT1A6", "UGT1A7", "UGT1A3", "UGT1A10")),
            aes(x= pmax(x_label, 234.44), label= name, y= y, vjust= 1.7-0.9 * grepl("UGT", name),
                hjust= -0.1+1.2*grepl("UGT", name)), size= 2.5, col="grey30", fontface=3) +
  coord_cartesian(xlim= c(234.45, 234.72)) +
  scale_color_gradient(low= "#5782AD", high= "#ED1330", name= expression(R^2)) +
  scale_y_continuous(breaks= pos_break_fn, name= expression(-log[10]~pvalue)) +
  theme_minimal() + 
  xlab("Position, Mbp") +
  theme(text= element_text(family= "Roboto", size= 10),
	panel.grid.major.x=element_blank(), 
        panel.grid.minor=element_blank(),
        panel.background = element_rect(fill=NA, colour="grey60"),
        axis.ticks = element_line(colour="grey30", linewidth = 0.1),
        strip.text = element_blank(),
        axis.line.x=element_line(colour="grey30", linewidth= 0.4),
	legend.key.size= unit(4, 'mm'),
	legend.title = element_text(size= 6), #change legend title font size
        legend.text = element_text(size=6),
	plot.margin = unit(c(t= 0, r=0, b= 0, l=0), 'cm'),
	axis.text=element_text(colour="black"))

ggsave(snakemake@output[[1]], plot= p1, width= 180, height= 80, units= 'mm', dpi= 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
library(ggplot2)
library(dplyr)
library(tidyr)
library(data.table)
library(ggrepel)
library(ggh4x)
library('showtext')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


d= fread(snakemake@input[[1]])
pph= fread(snakemake@input[[2]])
eqtls= filter(pph, PP.H4.abf > 0.9) %>% pull(eqtl_data)
eqtls= c(eqtls, 'GTEx_ge_liver')

d= filter(d, gene== 'ENSG00000241635', eqtl_data %in% eqtls)

link= fread(snakemake@input[[3]]) %>% select(ID, POS, rsid)

link= separate(link, ID, into= c('CHR', 'POS2', 'REF', 'EFF'), sep= ':')

d= inner_join(d, link, by= c('snp'= 'POS'))

d$POS2= as.numeric(d$POS2)
d= filter(d, POS2 > 234.45e6, POS2 < 234.75e6)

ld = read.table(snakemake@input[[4]], h=T)
ld= filter(ld, BP_A== 234627536 | BP_B == 234627536)
ld$BP_A= ifelse(ld$BP_A== 234627536, ld$BP_B, ld$BP_A)

ld= select(ld, CHR_A, BP_A, R2)

ld= rbind(ld, data.frame(CHR_A= 2, BP_A= 234627536, R2= 1))

names(ld)= c('CHR', 'POS2', 'R2')

pall = left_join(d, ld[,c("POS2", "R2")]) %>% filter(!is.na(R2))

pall$z.df2= with(pall, ifelse(z.df1<0, z.df2* -1, z.df2))
pall$z.df1= with(pall, ifelse(z.df1<0, z.df1* -1, z.df1))

pall$label= ifelse(pall$SNP.PP.H4> 0.1 & pall$eqtl_data!= 'GTEx_ge_liver', paste0(pall$rsid, ' (', round(pall$SNP.PP.H4, 2), ')'), NA)

pall$eqtl_data= factor(pall$eqtl_data, levels= c("CEDAR_microarray_transverse_colon", "GTEx_ge_colon_transverse", "GTEx_ge_liver"),
                       labels= c("Tranverse colon\n(microarray)", "Tranverse colon\n(RNA-seq)", "Liver\n(RNA-seq)") )

p1= ggplot(pall, aes(z.df1, z.df2, colour= R2)) +
  geom_point(size= 0.5) +
  facet_grid(vars(eqtl_data)) +
  geom_point(data= filter(pall, POS2==234627536), col="purple", pch=18, size= 3) +
  geom_text_repel(aes(label= label), direction= 'y', hjust= 0, nudge_x= 1) +
  theme_minimal() +
  geom_hline(yintercept= 0) +
  geom_vline(xintercept= 0) +
  scale_color_gradient(low= "#5782AD", high= "#ED1330", name= expression(R^2) ) +
  xlab(expression(Z-score~on~neonatal~jaundice)) +
  xlim(0, 18) +
  ylab(expression(Z-score~on~italic(UGT1A1)~cis-eQTLS)) + 
  theme(text= element_text(family= "Roboto", size= 10, colour= 'black'),
        plot.margin = unit(c(t= 0, r=0, b= 0, l=0), 'cm'),
        legend.key.size= unit(4, 'mm'),
        legend.title = element_text(size= 6), #change legend title font size
        legend.text = element_text(size=6),
        axis.text=element_text(colour="black"))

ggsave(snakemake@output[[1]], plot= p1, width= 180, height= 180, units= 'mm', dpi= 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
library(data.table)
library(dplyr)
library(broom)
library(ggplot2)
library(showtext)
library(cowplot)
library(ggrepel)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)
d= fread(snakemake@input[[1]], h= T)

#d= fread('/mnt/work/pol/neo-jaundice/results/UGT-missense/delivery/jaundice.txt')
d= select(d, -names(d)[which(duplicated(names(d)))])

d= filter(d, SVLEN_UL_DG<308, SVLEN_UL_DG> 154)

d$cat_GA= with(d, ifelse(SVLEN_UL_DG< 259, 'Preterm',
                         ifelse(SVLEN_UL_DG< 273, 'Early term',
                                ifelse(SVLEN_UL_DG< 280, 'Term', 
                                       'Post-term'))))

fitted_models= d %>% nest_by(cat_GA) %>% mutate(model= list(glm(jaundice ~ fets_UGT_P24T, data = data, family= 'binomial')))

betas= fitted_models %>% summarize(tidy(model))
cis= (fitted_models %>% summarize(confint(model)))
cis= cbind(data.frame(cis[[1]]), data.frame(term = row.names(cis[[2]]), cis[[2]]))

names(cis)= c('cat_GA', 'term', 'lo95', 'up95')

models= inner_join(betas, cis, by= c('cat_GA', 'term')) %>% filter(grepl('UGT', term))

names(models) = c('cat_GA', 'term', 'estimate', 'se', 'stat', 'pvalue', 'lo95', 'up95')

models$cat_GA= factor(models$cat_GA, levels= c('Preterm', 'Early term', 'Term', 'Post-term'))

p1= ggplot(models, aes(x = cat_GA, y = estimate)) +
    geom_hline(yintercept = log(setdiff(seq(0.0, 1.3, 0.2), 1)), size = .1, linetype = "dashed", colour= 'grey') +
  geom_hline(aes(yintercept = 0), size = .2, linetype = "dashed") +
  geom_errorbar(aes(ymin = lo95, ymax = up95), size = .2, width = 0, color = colorBlindBlack8[2]) +
  theme_cowplot(font_size= 10) +
  geom_point(size = 1.2, color = colorBlindBlack8[2]) +
  coord_trans(y = scales:::exp_trans()) +
  scale_y_continuous(breaks = log(seq(0.0000001, 1.2000002, 0.2)), labels = seq(0.0, 1.2, 0.2), limits = log(c(0.001, 1.2000002)),
                     expand= expansion(add=0)) +
  ylab('Odds Ratio') +
  theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2))

save_plot(snakemake@output[[1]], plot= p1, base_height= 60, base_width= 90, units= 'mm', dpi= 300)

d$fets_UGT_P24T_cat= factor(round(d$fets_UGT_P24T), levels= rev(c(0, 1, 2)), labels= rev(c('CC', 'CA', 'AA')))

dat_text= data.frame(label= c('CC', 'CA', 'AA'), fets_UGT_P24T_cat= c('CC', 'CA', 'AA'), SVLEN_UL_DG= c(156, 156, 156))

p2= ggplot(data= d, aes(SVLEN_UL_DG, group= factor(jaundice),  fill= factor(jaundice))) + 
    scale_colour_manual(values= colorBlindBlack8[c(2, 6)], guide= 'none') +
  scale_fill_manual(values= colorBlindBlack8[c(2, 6)], guide= 'none') +
  facet_grid(vars(fets_UGT_P24T_cat)) +
  geom_density(size= 0.1, alpha= 0.6, color= 'black') + 
  geom_hline(aes(yintercept = 0), size = .2,  colour= 'black') +
  theme_cowplot(font_size= 10) +
  ylab('Density') +
  xlab('Gestational duration, days') +
    scale_x_continuous(limits= c(154, 308), expand= expansion(add= 1)) +
  scale_y_continuous(limits= c(0, 0.055), expand= expansion(add= 0)) +
    theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.text.y= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2),
        strip.background = element_blank(),
        strip.text.y = element_blank()) +
  geom_text_repel(data= dat_text, aes(x= -Inf, y= Inf, label= label),  inherit.aes = FALSE, hjust= 1)

save_plot(snakemake@output[[2]], plot= p2, base_height= 80, base_width= 110, units= 'mm', dpi= 300)

fitted_models= d %>% nest_by(ABO_incompatibility) %>% mutate(model= list(glm(jaundice ~ fets_UGT_P24T, data = data, family= 'binomial')))

betas= fitted_models %>% summarize(tidy(model)) 
cis= (fitted_models %>% summarize(confint(model)))
cis= cbind(data.frame(cis[[1]]), data.frame(term = row.names(cis[[2]]), cis[[2]]))

names(cis)= c('ABO_incompatibility', 'term', 'lo95', 'up95')

models= inner_join(betas, cis, by= c('ABO_incompatibility', 'term')) %>% filter(grepl('UGT', term))

names(models) = c('ABO', 'term', 'estimate', 'se', 'stat', 'pvalue', 'lo95', 'up95')

models$ABO= factor(models$ABO, levels = c(0, 1), labels= c('ABO\ncompatible', 'ABO\nincompatible'))

p3= ggplot(models, aes(x = ABO, y = estimate)) +
  geom_hline(yintercept = log(setdiff(seq(0.0, 1.3, 0.2), 1)), size = .1, linetype = "dashed", colour= 'grey') +
  geom_hline(aes(yintercept = 0), size = .2, linetype = "dashed") +
  geom_errorbar(aes(ymin = lo95, ymax = up95), size = .2, width = 0, color = colorBlindBlack8[2]) +
  theme_cowplot(font_size= 10) +
  geom_point(size = 1.2, color = colorBlindBlack8[2]) +
  coord_trans(y = scales:::exp_trans()) +
  scale_y_continuous(breaks = log(seq(0.0000001, 1.2000002, 0.2)), labels = seq(0.0, 1.2, 0.2), limits = log(c(0.001, 1.2000002)),
                     expand= expansion(add=0)) +
  ylab('Odds Ratio') +
  theme(axis.title.x= element_blank(),
        axis.ticks.x= element_blank(),
        axis.text.x= element_text(size= 8),
        axis.line = element_line(color = "black", size = 0.2, lineend = "square"),
        axis.ticks.y = element_line(color = "black", size = 0.2))

save_plot(snakemake@output[[3]], plot= p3, base_height= 60, base_width= 90, units= 'mm', dpi= 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggh4x)
library(data.table)

# all in hg19

bed = read.table(gzfile(snakemake@input[[1]]), sep="\t", h=F, quote="")

colnames(bed) = c("CHR", "predictor", "type", "start", "end", "V6", "strand", "V8", "ANN")

bed = filter(bed, end>234.45e6, start<234.75e6)

nrow(bed)

bed = filter(bed, type %in% c("gene", "exon", "mRNA"))

# assign exons to genes
bed_exons = filter(bed, type=="exon")
bed_tran = filter(bed, type=="mRNA")
bed_genes = filter(bed, type=="gene")
bed_exons$transcript = gsub(".*transcript:(.*?);.*", "\\1", bed_exons$ANN)
bed_tran$transcript = gsub(".*transcript_id=(.*?);.*", "\\1", bed_tran$ANN)
bed_tran$gene = gsub(".*Parent=gene:(.*?);.*", "\\1", bed_tran$ANN)
bed_genes$gene = gsub(".*ID=gene:(.*?);.*", "\\1", bed_genes$ANN)
bed_genes$name = gsub(".*Name=(.*?);.*", "\\1", bed_genes$ANN)
bed_genes = filter(bed_genes, name!="AC114812.8")

bed_genes$y = seq_along(bed_genes$name)
bed_genes$y[bed_genes$name=="USP40"] = 2
bed_genes$y[bed_genes$name=="HJURP"] = 3
bed_genes$y[bed_genes$name=="MROH2A"] = 2
bed_genes$y = -5*bed_genes$y
#bed_genes$y_label = ifelse(bed_genes$name=="UGT1A1", bed_genes$y, bed_genes$y-3)
bed_genes$y_label = ifelse(grepl('UGT', bed_genes$name), bed_genes$y -4, bed_genes$y)
#bed_genes$x_label = ifelse(bed_genes$name=="UGT1A1", bed_genes$start+50e3, bed_genes$start)/1e6
bed_genes$x_label = ifelse(bed_genes$name=="UGT1A1", bed_genes$start, bed_genes$start)/1e6
bed_genes$x_label[bed_genes$name=="MROH2A"] = bed_genes$x_label[bed_genes$name=="MROH2A"]

# Note: exons from aberrant transcripts etc are dropped here
bed = inner_join(bed_exons[,c("type", "start", "end", "transcript")],
                 bed_tran[,c("transcript", "gene")], by="transcript")
bed = inner_join(bed, bed_genes[,c("gene", "y", "y_label")], by="gene")
bed$source = "Neonatal\njaundice"
bed_genes$source = "Neonatal\njaundice"

# load actual data
pgwas = fread(snakemake@input[[2]])
peqc = fread(snakemake@input[[3]])
peqc= filter(peqc, gene_id == 'ENSG00000241635')

peql = fread(snakemake@input[[4]])
peql= filter(peql, gene_id == 'ENSG00000241635')

link= fread(snakemake@input[[5]], select= c('ID', 'rsid'))
link= separate(link, ID, into= c('CHR', 'POS', 'REF', 'EFF'), sep= ':')

peql= inner_join(peql, link, by= 'rsid')
peqc= inner_join(peqc, link, by= 'rsid')


colnames(peqc) = c("gene_id", "RSID", "position", "MAF", "N", "REF", "EFF", "LOG10P", "BETA", "SE", "CHR", "POS", "ref", "eff")
colnames(peql) = c("gene_id", "RSID", "position", "MAF", "N", "REF", "EFF", "LOG10P", "BETA", "SE", "CHR", "POS", "ref", "eff")

peqc$LOG10P= -log10(peqc$LOG10P)
peql$LOG10P= -log10(peql$LOG10P)

peql$CHR= as.numeric(peql$CHR)
peql$POS= as.numeric(peql$POS)

peqc$CHR= as.numeric(peqc$CHR)
peqc$POS= as.numeric(peqc$POS)

nrow(peqc)
nrow(peql)
nrow(pgwas)

ld = read.table(snakemake@input[[6]], h=T)
ld= filter(ld, BP_A== 234627536 | BP_B == 234627536)
ld$BP_A= ifelse(ld$BP_A== 234627536, ld$BP_B, ld$BP_A)

ld= select(ld, CHR_A, BP_A, R2)

ld= rbind(ld, data.frame(CHR_A= 2, BP_A= 234627536, R2= 1))

names(ld)= c('CHR', 'POS', 'R2')
pall = bind_rows("Neonatal\njaundice"=pgwas, "eQTL colon"=peqc, "eQTL liver"=peql, .id="source") %>%
  filter(POS>234.45e6, POS<234.75e6)

pall = left_join(pall, ld[,c("POS", "R2")])

dashed_line= data.frame(source= c('Neonatal\njaundice', 'eQTL colon', 'eQTL liver'), x_inter= c(234627536, NA, NA))

# plot
panel_labels = group_by(pall, source) %>% summarize(LOG10P=max(LOG10P)*0.9)
pos_break_fn = function(x) if(max(x)<10) { seq(0,10,2) } else { seq(0, max(x), 20) }

p1= pall %>%
filter(!is.na(R2)) %>%
  ggplot(aes(x= POS/1e6, y=LOG10P)) +
    geom_point(size= 0.6, aes(col=R2)) +
  geom_point(data= filter(pall, POS==234627536), col="purple", pch=18, size= 3) +
  facet_grid2(source~., scales="free_y", axes="all", remove_labels="x") +
  geom_segment(data= bed, aes(x=start/1e6, xend=end/1e6, y=y, yend=y), size=3, col="darkblue") +
  geom_segment(data= bed_genes, aes(x=start/1e6, xend=end/1e6, y=y, yend=y), size=0.5, col="darkblue") +
  geom_text(data= panel_labels, aes(x=234.44, label=source), hjust=0, size=3.5) +
  geom_text(data= filter(bed_genes, !name %in% c("UGT1A5", "UGT1A6", "UGT1A7", "UGT1A3", "UGT1A10")),
            aes(x= pmax(x_label, 234.44), label= name, y= y, vjust= 1.7-0.9 * grepl("UGT", name),
                hjust= -0.1+1.2*grepl("UGT", name)), size= 2.5, col="grey30", fontface=3) +
  #geom_segment(data=filter(bed_genes, name %in% c("UGT1A8", "UGT1A9", "UGT1A4")),
  #             aes(x=x_label, xend=start/1e6-0.001, y=y, yend=y), size=0.3, col="grey30") +
  #geom_segment(data=filter(bed_genes, name=="UGT1A1"),
  #             aes(x=x_label-0.029, xend= end/1e6+0.001, y=y-1, yend=y), size=0.3, col="grey30") +
  coord_cartesian(xlim= c(234.45, 234.72)) +
  scale_color_gradient(low= "#5782AD", high= "#ED1330", name= expression(R^2)) +
  scale_y_continuous(breaks= pos_break_fn, name= expression(-log[10]~pvalue)) +
  force_panelsizes(rows= c(1,1,2)) +
  theme_minimal() + 
  xlab("Position, Mbp") +
  theme(text= element_text(family= "Roboto", size= 10),
	panel.grid.major.x=element_blank(), 
        panel.grid.minor=element_blank(),
        panel.background = element_rect(fill=NA, colour="grey60"),
        axis.ticks = element_line(colour="grey30", linewidth = 0.1),
        strip.text = element_blank(),
        axis.line.x=element_line(colour="grey30", linewidth= 0.4),
	legend.key.size= unit(4, 'mm'),
	legend.title = element_text(size= 6), #change legend title font size
        legend.text = element_text(size=6),
	plot.margin = unit(c(t= 0, r=0, b= 0, l=0), 'cm'),
	axis.text=element_text(colour="black"))  +
  geom_vline(data= dashed_line, aes(xintercept= x_inter/1e6), linetype= 'dashed', color= 'grey30', linewidth= 0.1)

ggsave(snakemake@output[[1]], plot= p1, width= 180, height= 120, units= 'mm', dpi= 300)
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
library(data.table)
library(dplyr)

# This code implements an exact SNP test of Hardy-Weinberg Equilibrium as described in
# Wigginton, JE, Cutler, DJ, and Abecasis, GR (2005) A Note on Exact Tests of 
# Hardy-Weinberg Equilibrium. American Journal of Human Genetics. 76: 000 - 000  

# NOTE: return code of -1.0 signals an error condition

SNPHWE <- function(obs_hets, obs_hom1, obs_hom2)
   {
   print(paste(obs_hets, obs_hom1, obs_hom2))
   if (obs_hom1 < 0 || obs_hom2 < 0 || obs_hets < 0)
      return(-1.0)

   # total number of genotypes
   N <- obs_hom1 + obs_hom2 + obs_hets

   # rare homozygotes, common homozygotes
   obs_homr <- min(obs_hom1, obs_hom2)
   obs_homc <- max(obs_hom1, obs_hom2)

   # number of rare allele copies
   rare  <- obs_homr * 2 + obs_hets

   # Initialize probability array
   probs <- rep(0, 1 + rare)

   # Find midpoint of the distribution
   mid <- floor(rare * ( 2 * N - rare) / (2 * N))
   if ( (mid %% 2) != (rare %% 2) ) mid <- mid + 1

   probs[mid + 1] <- 1.0
   mysum <- 1.0

   # Calculate probablities from midpoint down 
   curr_hets <- mid
   curr_homr <- (rare - mid) / 2
   curr_homc <- N - curr_hets - curr_homr

   while ( curr_hets >=  2)
      {
      probs[curr_hets - 1]  <- probs[curr_hets + 1] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0)  * (curr_homc + 1.0))
      mysum <- mysum + probs[curr_hets - 1]

      # 2 fewer heterozygotes -> add 1 rare homozygote, 1 common homozygote
      curr_hets <- curr_hets - 2
      curr_homr <- curr_homr + 1
      curr_homc <- curr_homc + 1
      }    

   # Calculate probabilities from midpoint up
   curr_hets <- mid
   curr_homr <- (rare - mid) / 2
   curr_homc <- N - curr_hets - curr_homr

   while ( curr_hets <= rare - 2)
      {
      probs[curr_hets + 3] <- probs[curr_hets + 1] * 4.0 * curr_homr * curr_homc / ((curr_hets + 2.0) * (curr_hets + 1.0))
      mysum <- mysum + probs[curr_hets + 3]

      # add 2 heterozygotes -> subtract 1 rare homozygtote, 1 common homozygote
      curr_hets <- curr_hets + 2
      curr_homr <- curr_homr - 1
      curr_homc <- curr_homc - 1
      }    

    # P-value calculation
    target <- probs[obs_hets + 1]

    #plo <- min(1.0, sum(probs[1:obs_hets + 1]) / mysum)

    #phi <- min(1.0, sum(probs[obs_hets + 1: rare + 1]) / mysum)

    # This assignment is the last statement in the fuction to ensure 
    # that it is used as the return value
    p <- min(1.0, sum(probs[probs <= target])/ mysum)
}

format_haps= function(hap){
variants= paste('X', hap$chr, hap$pos, hap$ref, hap$eff, sep ='_')
ids= names(hap)[5:ncol(hap)]
hap= as.data.frame(t(hap[, 5:ncol(hap)]))
names(hap)=  variants
hap$IID= ids

return(hap)
}


fets= fread(snakemake@input[[1]])

fets= format_haps(fets)

pheno= fread(snakemake@input[[2]])

fets= filter(fets, IID %in% pheno$IID)
fets= data.frame(fets)

df_list= lapply(names(fets)[grepl('X_', names(fets))], function(snp) {
hets  <- max(table(round(fets[, snp]))[2], 0, na.rm= T)
hom_1 <- max(table(round(fets[, snp]))[1], 0, na.rm= T)
hom_2 <- max(table(round(fets[, snp]))[3], 0, na.rm= T)

p_value= SNPHWE(hets, hom_1, hom_2)

temp_df= data.frame(SNP= snp, hwe_pvalue= p_value)
return(temp_df)

}

)

d= do.call('rbind', df_list)

fwrite(d, snakemake@output[[1]], sep= '\t')
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
library(data.table)
library(dplyr)
library(tidyr)

format_haps= function(hap){
variants= paste(hap$chr, hap$pos, hap$ref, hap$eff, sep =':')
ids= names(hap)[5:ncol(hap)]
hap= as.data.frame(t(hap[, 5:ncol(hap)]))
names(hap)= variants
hap$PREG_ID_1724= as.numeric(ids)
return(hap)
}

h1= fread(snakemake@input[[1]])
h2= fread(snakemake@input[[2]])
h3= fread(snakemake@input[[3]])
h4= fread(snakemake@input[[4]])

h1= format_haps(h1)
h2= format_haps(h2)
h3= format_haps(h3)
h4= format_haps(h4)

pheno= fread(snakemake@input[[5]])

pheno$PREG_ID_1724= as.numeric(pheno$PREG_ID_1724)

print(nrow(pheno))
write( paste('snp', 'n', 'beta_h1', 'se_h1', 'pvalue_h1', 'beta_h2', 'se_h2', 'pvalue_h2', 'beta_h3', 'se_h3', 'pvalue_h3', 'beta_h4', 'se_h4', 'pvalue_h4', sep= '\t'), snakemake@output[[1]], append= T)

results_list= lapply(names(h1)[1:(length(names(h1))-1)], function(snp) {

print(snp)

h1_temp= h1[, c('PREG_ID_1724', snp)]
h2_temp= h2[, c('PREG_ID_1724', snp)]
h3_temp= h3[, c('PREG_ID_1724', snp)]
h4_temp= h4[, c('PREG_ID_1724', snp)]

names(h1_temp)= c('PREG_ID_1724', 'h1')
names(h2_temp)= c('PREG_ID_1724', 'h2')
names(h3_temp)= c('PREG_ID_1724', 'h3')
names(h4_temp)= c('PREG_ID_1724', 'h4')

d= inner_join(pheno, h1_temp, by= 'PREG_ID_1724') %>% inner_join(., h2_temp, by= 'PREG_ID_1724') %>% inner_join(., h3_temp, by= 'PREG_ID_1724') %>% inner_join(., h4_temp, by= 'PREG_ID_1724')

if (grepl('X', snp)) {

df= filter(d, KJONN== 0)

m1= glm(jaundice~ h1 + h2 + h3 + cohort + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, df, family= binomial)

n= length(resid(m1))
coefs= summary(m1)$coefficients[2:5,]
beta_h1= coefs[1,1]
se_h1= coefs[1,2]
pvalue_h1= coefs[1,4]
beta_h2= coefs[2,1]
se_h2= coefs[2,2]
pvalue_h2= coefs[2,4]
beta_h3= coefs[3,1]
se_h3= coefs[3,2]
pvalue_h3= coefs[3,4]
beta_h4= ''
se_h4= ''
pvalue_h4= ''

results= paste(snp, n, beta_h1, se_h1, pvalue_h1, beta_h2, se_h2, pvalue_h2, beta_h3, se_h3, pvalue_h3, beta_h4, se_h4, pvalue_h4, sep= '\t')

write(results, file= snakemake@output[[1]], append=TRUE)

df= filter(d, KJONN== 1)

m1= glm(jaundice~ h1 + h2 + h4 + cohort + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, df, family= binomial)

n= length(resid(m1))
coefs= summary(m1)$coefficients[2:5,]
beta_h1= coefs[1,1]
se_h1= coefs[1,2]
pvalue_h1= coefs[1,4]
beta_h2= coefs[2,1]
se_h2= coefs[2,2]
pvalue_h2= coefs[2,4]
beta_h3= ''
se_h3= ''
pvalue_h3= '' 
beta_h4= coefs[3, 1]
se_h4= coefs[3, 2]
pvalue_h4= coefs[3,4]

results= paste(snp, n, beta_h1, se_h1, pvalue_h1, beta_h2, se_h2, pvalue_h2, beta_h3, se_h3, pvalue_h3, beta_h4, se_h4, pvalue_h4, sep= '\t')

write(results, file= snakemake@output[[1]], append=TRUE)


} else {

m1= glm(jaundice~ h1 + h2 + h3 + h4 + KJONN + cohort + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, d, family= binomial)

n= length(resid(m1))
coefs= summary(m1)$coefficients[2:5,]
beta_h1= coefs[1,1]
se_h1= coefs[1,2]
pvalue_h1= coefs[1,4]
beta_h2= coefs[2,1]
se_h2= coefs[2,2]
pvalue_h2= coefs[2,4]
beta_h3= coefs[3,1]
se_h3= coefs[3,2]
pvalue_h3= coefs[3,4]
beta_h4= coefs[4,1]
se_h4= coefs[4,2]
pvalue_h4= coefs[4,4]

results= paste(snp, n, beta_h1, se_h1, pvalue_h1, beta_h2, se_h2, pvalue_h2, beta_h3, se_h3, pvalue_h3, beta_h4, se_h4, pvalue_h4, sep= '\t')
write(results, file= snakemake@output[[1]], append=TRUE)


}


}

)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
library(dplyr)
library(data.table)


h1= fread(snakemake@input[[1]])
h2= fread(snakemake@input[[2]])
h3= fread(snakemake@input[[3]])
h4= fread(snakemake@input[[4]])

pheno= fread(snakemake@input[[5]], select= c('PREG_ID', 'jaundice', 'FID_y', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'cohort', 'KJONN', 'Child', 'Father', 'Mother'))

pheno= inner_join(pheno, h1, by= 'PREG_ID') %>% inner_join(., h2, by= c('PREG_ID')) %>% inner_join(., h3, by= c('PREG_ID')) %>% inner_join(., h4, by= c('PREG_ID'))

m1= glm(jaundice~ h1_jaundice + h2_jaundice + h3_jaundice + h4_jaundice + cohort + KJONN + PC1 + PC2 + PC3 + PC4 + PC5 + PC6, pheno, family= binomial)

n= length(resid(m1))
coefs= data.frame(summary(m1)$coefficients[2:5,])
names(coefs)= c('beta', 'se', 'z', 'pvalue')

coefs$haplotype= rownames(coefs)
coefs$n= n

fwrite(coefs, snakemake@output[[1]], sep= '\t')
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import pandas as pd
import numpy as np

PREG_ID= 'PREG_ID'
Sentrix= 'IID'

def format_df(df):
        df.columns= d.PREG_ID_1724
        df[['chr', 'pos', 'ref', 'eff']]= varnames.str.split(':', expand= True)
        cols = list(df.columns.values)
        cols= cols[-4:] + cols[:-4]
        df= df[cols]
        return df

def phase_by_transmission(MT_list, PT_list):
        # Credit goes to Alistair Miles - code adapted from scikit-allel

        n_variants = len(varnames)
        n_samples = fets.shape[1]
        n_progeny = 1
        max_allele = 1

        # setup intermediates
        mac = np.zeros(max_allele + 1, dtype='u1')  # maternal allele counts
        pac = np.zeros(max_allele + 1, dtype='u1')  # paternal allele counts

        # setup outputs

        for index, row in d.iterrows():
                mat1= np.array([int(i[0]) for i in moms[row['Mother']]])
                mat2= np.array([int(i[2]) for i in moms[row['Mother']]])
                pat1= np.array([int(i[0]) for i in dads[row['Father']]])
                pat2= np.array([int(i[2]) for i in dads[row['Father']]])
                MT= np.array([int(i[0]) for i in fets[row['Child']]])
                PT= np.array([int(i[2]) for i in fets[row['Child']]])
                for i in range(n_variants):
                        ma1 = mat1[i]  # maternal allele 1
                        ma2 = mat2[i]  # maternal allele 2
                        pa1 = pat1[i]  # paternal allele 1
                        pa2 = pat2[i]  # paternal allele 2
                        # check for any missing calls in parents
                        if ma1 < 0 or ma2 < 0 or pa1 < 0 or pa2 < 0:
                                continue
                        # parental allele counts
                        mac[:] = 0  # reset to zero
                        pac[:] = 0  # reset to zero
                        mac[ma1] = 1
                        mac[ma2] = 1
                        pac[pa1] = 1
                        pac[pa2] = 1
                        a1 = MT[i]
                        a2 = PT[i]
                        if a1 < 0 or a2 < 0:  # child is missing
                                continue
                        elif a1 == a2:  # child is homozygous
                                if mac[a1] > 0 and pac[a1] > 0:  # Mendelian consistent
                                        # trivially phase the child
                                        continue
                        else:  # child is heterozygous
                                if mac[a1] > 0 and pac[a1] == 0 and pac[a2] > 0:
                                # allele 1 is unique to mother, no need to swap
                                        continue
                                elif mac[a2] > 0 and pac[a2] == 0 and pac[a1] > 0:
                                        # allele 2 is unique to mother, swap child alleles
                                        MT[i] = a2
                                        PT[i] = a1
                                elif pac[a1] > 0 and mac[a1] == 0 and mac[a2] > 0:
                                        # allele 1 is unique to father, swap child alleles
                                        MT[i] = a2
                                        PT[i] = a1
                                elif pac[a2] > 0 and mac[a2] == 0 and mac[a1] > 0:
                                        # allele 2 is unique to father, no need to swap
                                        continue
                                else:
                                        MT[i] = a2
                                        PT[i] = a1
                MT_list.append(pd.Series(MT))
                PT_list.append(pd.Series(PT))
        return MT_list, PT_list


d= pd.read_csv(snakemake.input[2], sep= '\t', header= 0)
d.dropna(axis= 0, inplace= True)

fets= pd.read_csv(snakemake.input[0], sep='\t', header= 0)
moms= pd.read_csv(snakemake.input[1], sep= '\t', header= 0)
dads= pd.read_csv(snakemake.input[3], sep='\t', header= 0)

d= d.loc[d.Child.isin(fets.columns), :]
d= d.loc[d.Mother.isin(moms.columns), :]
d= d.loc[d.Father.isin(dads.columns), :]

varnames= fets.chr.map(str) + ':' + fets.pos.map(str) + ':' + fets.ref + ':' + fets.eff

fets= fets[d.Child]
moms= moms[d.Mother]
dads= dads[d.Father]


fets.replace('/', '|', inplace= True, regex= True)
moms.replace('/', '|', inplace= True, regex= True)
dads.replace('/', '|', inplace= True, regex= True)

fets= np.where(fets== '0', '0|0', np.where(fets== '1', '1|1', fets))
moms= np.where(moms== '0', '0|0', np.where(moms== '1', '1|1', moms))
dads= np.where(dads== '0', '0|0', np.where(dads== '1', '1|1', dads))

fets= pd.DataFrame(fets, columns= d.Child)
moms= pd.DataFrame(moms, columns= d.Mother)
dads= pd.DataFrame(dads, columns= d.Father)

moms1= moms.copy()
dads1= dads.copy()

moms= moms.loc[:,~moms.columns.duplicated()]
dads= dads.loc[:,~dads.columns.duplicated()]

MT_list= list()
PT_list= list()

MT_list, PT_list= phase_by_transmission(MT_list, PT_list)
MT= pd.concat(MT_list, axis= 1)
PT= pd.concat(PT_list, axis= 1)

moms1= np.where(moms1== '0|0', 0, np.where((moms1== '0|1') | (moms1== '1|0'), 1, np.where(moms1== '1|1', 2, np.nan)))
dads1= np.where(dads1== '0|0', 0, np.where((dads1== '0|1') | (dads1== '1|0'), 1, np.where(dads1== '1|1', 2, np.nan)))

MnT= moms1 - MT
PnT= dads1 - PT

MT= format_df(MT)
MnT= format_df(MnT)
PT= format_df(PT)
PnT= format_df(PnT)

MT.to_csv(snakemake.output[0], header= True, sep= '\t', index= False)
MnT.to_csv(snakemake.output[1], header= True, sep= '\t', index= False)
PT.to_csv(snakemake.output[2], header= True, sep= '\t', index= False)
PnT.to_csv(snakemake.output[3], header= True, sep= '\t', index= False)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
library(data.table)
library(dplyr)
library(tidyr)

mfr= fread(snakemake@input[[1]])
ids= fread(snakemake@input[[2]])


flag= fread(snakemake@input[[3]])
flag= filter(flag, genotypesOK== T, phenoOK== T)

out= readLines(snakemake@input[[4]])

ids= pivot_longer(ids, c('Child', 'Mother', 'Father'), names_to= 'ROLE', values_to= 'IID')
ids= ids[!duplicated(ids[,c('PREG_ID_1724', 'IID')]), ]

ids= filter(ids, (IID %in% out), IID %in% flag$IID)
ids= group_by(ids, PREG_ID_1724, ROLE) %>% filter(row_number()== 1)

ids= spread(ids, key= ROLE, value= IID)

mfr= inner_join(mfr, ids, by= 'PREG_ID_1724')

mfr$miscarriage= with(mfr, ifelse(is.na(SPABORT_12_5), SPABORT_23_5, ifelse(is.na(SPABORT_23_5), SPABORT_12_5, SPABORT_12_5 + SPABORT_23_5)))

mfr$miscarriage_bin= with(mfr, ifelse(is.na(miscarriage), NA, ifelse(miscarriage> 1, 1, 0)))

mfr2= select(mfr, PREG_ID_1724, miscarriage, PARITET_5, Child, Mother, Father)
mfr2= filter(mfr2, !is.na(miscarriage), !is.na(PARITET_5))
mfr2$miscarriage_resid= lm(miscarriage ~ PARITET_5, mfr2)$resid
mfr2= select(mfr2, PREG_ID_1724, miscarriage_resid, Child, Mother, Father)
mfr= left_join(mfr, mfr2, by= c('PREG_ID_1724', 'Child', 'Mother', 'Father'))

mfr= arrange(mfr, desc(miscarriage))

if (grepl('bin', snakemake@output[[1]])){

moms= filter(mfr, !duplicated(PREG_ID_1724)) %>% select(Mother, miscarriage_bin, PREG_ID_1724, MOR_FAAR, BATCH_moms) %>% filter(!is.na(Mother), !is.na(miscarriage_bin))
fets= filter(mfr, !duplicated(PREG_ID_1724)) %>% select(Child, miscarriage_bin, PREG_ID_1724, MOR_FAAR, BATCH_fets) %>% filter(!is.na(Child), !is.na(miscarriage_bin))
dads= filter(mfr, !duplicated(PREG_ID_1724)) %>% select(Father, miscarriage_bin, PREG_ID_1724, MOR_FAAR, BATCH_dads) %>% filter(!is.na(Father), !is.na(miscarriage_bin))

} else if (grepl('_resid', snakemake@output[[1]])) {

moms= filter(mfr, !duplicated(PREG_ID_1724)) %>% select(Mother, miscarriage_resid, PREG_ID_1724, MOR_FAAR, BATCH_moms) %>% filter(!is.na(Mother), !is.na(miscarriage_resid))
fets= filter(mfr, !duplicated(PREG_ID_1724)) %>% select(Child, miscarriage_resid, PREG_ID_1724, MOR_FAAR, BATCH_fets) %>% filter(!is.na(Child), !is.na(miscarriage_resid))
dads= filter(mfr, !duplicated(PREG_ID_1724)) %>% select(Father, miscarriage_resid, PREG_ID_1724, MOR_FAAR, BATCH_dads) %>% filter(!is.na(Father), !is.na(miscarriage_resid))

} else {

moms= filter(mfr, !duplicated(PREG_ID_1724)) %>% select(Mother, miscarriage, PREG_ID_1724, MOR_FAAR, BATCH_moms) %>% filter(!is.na(Mother), !is.na(miscarriage))
fets= filter(mfr, !duplicated(PREG_ID_1724)) %>% select(Child, miscarriage, PREG_ID_1724, MOR_FAAR, BATCH_fets) %>% filter(!is.na(Child), !is.na(miscarriage))
dads= filter(mfr, !duplicated(PREG_ID_1724)) %>% select(Father, miscarriage, PREG_ID_1724, MOR_FAAR, BATCH_dads) %>% filter(!is.na(Father), !is.na(miscarriage))

}

moms= moms[!duplicated(moms$Mother, incomparables= NA), ]
fets= fets[!duplicated(fets$Child, incomparables= NA), ]
dads= dads[!duplicated(dads$Father, incomparables= NA), ]

names(moms)[ncol(moms)]= 'cohort'
names(fets)[ncol(fets)]= 'cohort'
names(dads)[ncol(dads)]= 'cohort'

moms$cohort= ifelse(grepl('NORM', moms$cohort), 'NORMENT', moms$cohort)
dads$cohort= ifelse(grepl('NORM', dads$cohort), 'NORMENT', dads$cohort)
fets$cohort= ifelse(grepl('NORM', fets$cohort), 'NORMENT', fets$cohort)

names(moms)[1]= 'IID'
names(fets)[1]= 'IID'
names(dads)[1]= 'IID'

fwrite(moms, snakemake@output[[1]], sep= '\t')
fwrite(fets, snakemake@output[[2]], sep= '\t')
fwrite(dads, snakemake@output[[3]], sep= '\t')
ShowHide 161 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/psnavais/VR-StartingGrant-2023
Name: vr-startinggrant-2023
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...