Snakemake workflow: Meta-analysis of GWAS of gestational duration, preterm and post-term deliveries (EGG Consortium)

public public 1yr ago Version: v1.0.0 0 bookmarks

Snakemake workflow: Meta-analysis of GWAS of gestational duration, preterm and post-term deliveries (EGG Consortium)

This repository contains the code for the paper "Genetic effects on the timing of parturition and links to fetal birth weight".

Citation

Contact

Code Snippets

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# ---- Output files ----
# Both paths come from the Snakemake rule. Only the header line is written
# here; funk() appends the data rows afterwards.
pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

cat("nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tlocus\n", file = pph_outfile)

# NOTE(review): the second label is 'V.df' whereas the placeholder rows written
# by funk() name that field 'V.df1'; left untouched in case downstream code
# reads the header -- confirm.
cat("snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tlocus\n", file = results_outfile)

# ---- Priors passed to coloc.abf() as p1, p2 and p12 ----
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# ---- Summary statistics ----
# First trait: upper-case column names are kept.
d <- fread(snakemake@input[[1]], select = c("ID", "CHR", "POS", "TOTALSAMPLESIZE", "BETA", "SE", "pvalue", "EAF"))
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

# Second trait: renamed (by position) to lower-case names so that its columns
# stay distinguishable from the first trait's after the join.
x <- fread(snakemake@input[[2]], select = c("ID", "N", "BETA", "SE", "pvalue", "EAF"))
x$MAF <- ifelse(x$EAF > 0.5, 1 - x$EAF, x$EAF)
names(x) <- c("ID", "N", "beta", "se", "p", "eaf", "maf")

d <- inner_join(d, x, by = "ID")

# When the second trait has no allele frequency at all, reuse the first trait's.
if (all(is.na(d$eaf))) {
  d$maf <- d$MAF
}

# ---- Loci to test ----
z <- fread(snakemake@input[[3]])

# NOTE(review): a non-numeric chromosome label (e.g. 'chrX') becomes NA here --
# confirm whether such loci are expected in this input.
z$CHR <- as.numeric(gsub("chr", "", z$chr))

# seq_len() instead of 1:nrow(z): the latter is c(1, 0) for an empty table and
# makes this assignment fail.
z$locus <- seq_len(nrow(z))


#' Colocalise one locus and append the outcome to the two output files.
#'
#' Reads the script-level objects `z` (loci), `d` (joined summary statistics),
#' `prior1`, `prior2`, `prior12`, `pph_outfile`, `results_outfile` and
#' `snakemake`. When the locus holds no variants, or `coloc.abf()` fails, a
#' placeholder row (zeros, snp = 'none') is appended to BOTH files.
#'
#' The previous version assigned into `pph_list` / `res_list`, which are never
#' created in this script, and called `next` outside a loop; both raise an
#' error inside the worker, so placeholder rows were never written.
#'
#' @param i Row index into `z`.
#' @return `NULL`, invisibly. Called for its side effect of appending rows.
funk <- function(i) {
  locus <- paste0("locus_", i)
  row <- z[i, ]

  # Append one block of rows to each output file.
  append_rows <- function(pph, res) {
    fwrite(pph, pph_outfile, sep = "\t", row.names = FALSE, col.names = FALSE,
           quote = FALSE, append = TRUE)
    fwrite(res, results_outfile, sep = "\t", row.names = FALSE, col.names = FALSE,
           quote = FALSE, append = TRUE)
    invisible(NULL)
  }

  # Rows recorded for a locus that could not be analysed.
  append_placeholder <- function() {
    pph <- data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                      PP.H3.abf = 0, PP.H4.abf = 0, locus = locus)
    res <- data.frame(snp = "none", V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                      V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                      internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)
    append_rows(pph, res)
    print("next")
    invisible(NULL)
  }

  temp_df <- filter(
    d,
    CHR == as.integer(row[["CHR"]]),
    POS >= as.integer(row[["start"]]),
    POS <= as.integer(row[["stop"]])
  )
  if (nrow(temp_df) == 0) {
    return(append_placeholder())
  }

  # Variants with a non-positive standard error in either trait are dropped.
  temp_df <- filter(temp_df, SE > 0, se > 0)

  gwas_file <- snakemake@input[[1]]
  trait_file <- snakemake@input[[2]]

  # First trait: quantitative unless the file path marks it as allPTD or
  # postTerm, which are analysed as case-control with a fixed case fraction.
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2,
                N = temp_df$TOTALSAMPLESIZE, type = "quant",
                snp = temp_df$ID, MAF = temp_df$MAF)
  if (grepl("allPTD", gwas_file)) {
    data1$type <- "cc"
    data1$s <- 0.067
  } else if (grepl("postTerm", gwas_file)) {
    data1$type <- "cc"
    data1$s <- 0.122
  }

  # Second trait: case-control when its path matches one of these patterns (the
  # value is cases / (cases + controls)); otherwise quantitative. If several
  # patterns match, the last one wins, as in the original chain of `if`s.
  case_fraction <- c(
    PCOS = (1184 + 670 + 157 + 658 + 984 + 485 + 462) /
      (1184 + 670 + 157 + 658 + 984 + 485 + 462 +
         5799 + 1379 + 2807 + 6774 + 2963 + 407 + 96172),
    miscarriage = 49996 / (174109 + 49996),
    POP = 7053 / (57407 + 7053),
    endometriosis = 1496 / (192678 + 1496),
    Preeclampsia = 4630 / (4630 + 373345),
    leiomyoma_uterus = 14569 / (85792 + 14569)
  )
  hit <- vapply(names(case_fraction), grepl, logical(1), x = trait_file)
  matched <- names(case_fraction)[hit]

  data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                type = "quant", snp = temp_df$ID, MAF = temp_df$maf)
  if (length(matched) > 0) {
    data2$type <- "cc"
    data2$s <- case_fraction[[tail(matched, 1)]]
  }

  # A failed coloc run is reported on stderr and recorded as a placeholder
  # instead of being swallowed silently.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) {
      message("coloc.abf failed for ", locus, ": ", conditionMessage(e))
      NULL
    }
  )
  if (is.null(myres)) {
    return(append_placeholder())
  }

  # Element 1 is the posterior summary, element 2 the per-variant table.
  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  res <- myres[[2]]
  res$locus <- locus
  append_rows(PPH, res)
}

# seq_len() so that an empty locus table runs no worker at all.
# NOTE(review): three workers append to the same two files concurrently --
# confirm that interleaved writes are not a concern for these outputs.
mclapply(seq_len(nrow(z)), funk, mc.cores = 3)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# ---- Output files ----
# Both paths come from the Snakemake rule. Only the header line is written
# here; funk() appends the data rows afterwards.
pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

cat("nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tlocus\n", file = pph_outfile)

# NOTE(review): the second label is 'V.df' whereas the placeholder rows written
# by funk() name that field 'V.df1'; left untouched in case downstream code
# reads the header -- confirm.
cat("snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tlocus\n", file = results_outfile)

# ---- Priors passed to coloc.abf() as p1, p2 and p12 ----
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# ---- Summary statistics ----
# First trait: upper-case column names are kept.
d <- fread(snakemake@input[[1]], select = c("ID", "CHR", "POS", "TOTALSAMPLESIZE", "BETA", "SE", "pvalue", "EAF"))
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

# Second trait: same source columns, renamed (by position) to lower-case names
# so that they stay distinguishable from the first trait's after the join.
x <- fread(snakemake@input[[2]], select = c("ID", "TOTALSAMPLESIZE", "BETA", "SE", "pvalue", "EAF"))
x$MAF <- ifelse(x$EAF > 0.5, 1 - x$EAF, x$EAF)
names(x) <- c("ID", "N", "beta", "se", "p", "eaf", "maf")

d <- inner_join(d, x, by = "ID")

# When the second trait has no allele frequency at all, reuse the first trait's.
if (all(is.na(d$eaf))) {
  d$maf <- d$MAF
}

# ---- Loci to test ----
z <- fread(snakemake@input[[3]])

# NOTE(review): a non-numeric chromosome label (e.g. 'chrX') becomes NA here --
# confirm whether such loci are expected in this input.
z$CHR <- as.numeric(gsub("chr", "", z$chr))

# seq_len() instead of 1:nrow(z): the latter is c(1, 0) for an empty table and
# makes this assignment fail.
z$locus <- seq_len(nrow(z))


#' Colocalise one locus and append the outcome to the two output files.
#'
#' Reads the script-level objects `z` (loci), `d` (joined summary statistics),
#' `prior1`, `prior2`, `prior12`, `pph_outfile` and `results_outfile`. The
#' first trait is analysed as quantitative and the second as case-control with
#' a case fraction of 0.067. When the locus holds no variants, or
#' `coloc.abf()` fails, a placeholder row (zeros, snp = 'none') is appended to
#' BOTH files.
#'
#' The previous version assigned into `pph_list` / `res_list`, which are never
#' created in this script, and called `next` outside a loop; both raise an
#' error inside the worker, so placeholder rows were never written.
#'
#' @param i Row index into `z`.
#' @return `NULL`, invisibly. Called for its side effect of appending rows.
funk <- function(i) {
  locus <- paste0("locus_", i)
  row <- z[i, ]

  # Append one block of rows to each output file.
  append_rows <- function(pph, res) {
    fwrite(pph, pph_outfile, sep = "\t", row.names = FALSE, col.names = FALSE,
           quote = FALSE, append = TRUE)
    fwrite(res, results_outfile, sep = "\t", row.names = FALSE, col.names = FALSE,
           quote = FALSE, append = TRUE)
    invisible(NULL)
  }

  # Rows recorded for a locus that could not be analysed.
  append_placeholder <- function() {
    pph <- data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                      PP.H3.abf = 0, PP.H4.abf = 0, locus = locus)
    res <- data.frame(snp = "none", V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                      V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                      internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)
    append_rows(pph, res)
    print("next")
    invisible(NULL)
  }

  temp_df <- filter(
    d,
    CHR == as.integer(row[["CHR"]]),
    POS >= as.integer(row[["start"]]),
    POS <= as.integer(row[["stop"]])
  )
  if (nrow(temp_df) == 0) {
    return(append_placeholder())
  }

  # Variants with a non-positive standard error in either trait are dropped.
  temp_df <- filter(temp_df, SE > 0, se > 0)

  s_pheno <- 0.067
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2,
                N = temp_df$TOTALSAMPLESIZE, type = "quant",
                snp = temp_df$ID, MAF = temp_df$MAF)
  data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                type = "cc", snp = temp_df$ID, s = s_pheno, MAF = temp_df$maf)

  # A failed coloc run is reported on stderr and recorded as a placeholder
  # instead of being swallowed silently.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) {
      message("coloc.abf failed for ", locus, ": ", conditionMessage(e))
      NULL
    }
  )
  if (is.null(myres)) {
    return(append_placeholder())
  }

  # Element 1 is the posterior summary, element 2 the per-variant table.
  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  res <- myres[[2]]
  res$locus <- locus
  append_rows(PPH, res)
}

# seq_len() so that an empty locus table runs no worker at all.
# NOTE(review): three workers append to the same two files concurrently --
# confirm that interleaved writes are not a concern for these outputs.
mclapply(seq_len(nrow(z)), funk, mc.cores = 3)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
library(data.table)
library(dplyr)
library(coloc)

# Colocalisation of two traits over a fixed list of loci; one summary row and
# one per-variant table per locus are collected and written at the end.

# ---- Priors passed to coloc.abf() as p1, p2 and p12 ----
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# ---- Summary statistics ----
d <- fread(snakemake@input[[1]])
d <- select(d, ID, CHR, POS, TOTALSAMPLESIZE, BETA, SE, pvalue, EAF)
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

x <- fread(snakemake@input[[2]])

# The effect sign is flipped wherever REF sorts after EFF.
x$BETA <- ifelse(x$REF > x$EFF, -1 * x$BETA, x$BETA)

x <- select(x, ID, TOTALSAMPLESIZE, BETA, SE, pvalue, EAF)
x$MAF <- ifelse(x$EAF > 0.5, 1 - x$EAF, x$EAF)
# Positional rename to lower-case names, distinct from the first trait's.
names(x) <- c("ID", "N", "beta", "se", "p", "eaf", "maf")

d <- inner_join(d, x, by = "ID")

# When the second trait has no allele frequency at all, reuse the first trait's.
if (all(is.na(d$eaf))) {
  d$maf <- d$MAF
}

# ---- Loci to test ----
# Chromosome X is recoded as 23 before the integer conversion.
z <- fread(snakemake@input[[3]])
z$CHR <- ifelse(z$CHR == "X", "23", z$CHR)
z$CHR <- as.integer(z$CHR)

# From the second locus table only the LRP5 and SCML4 rows are added.
z1 <- fread(snakemake@input[[4]])
z1$CHR <- ifelse(z1$CHR == "X", "23", z1$CHR)
z1$CHR <- as.integer(z1$CHR)
z1 <- filter(z1, nearestGene %in% c("LRP5", "SCML4"))

z <- rbind(z, z1)

# ---- Per-locus colocalisation ----
n_loci <- nrow(z)
pph_list <- vector("list", n_loci)
res_list <- vector("list", n_loci)

# seq_len() rather than 1:nrow(z), so an empty locus table skips the loop
# instead of iterating over c(1, 0).
for (i in seq_len(n_loci)) {
  row <- z[i, ]
  locus <- paste0("chr", row[["CHR"]], "_", row[["nearestGene"]])
  temp_df <- filter(
    d,
    CHR == as.integer(row[["CHR"]]),
    POS >= as.integer(row[["pos1"]]),
    POS <= as.integer(row[["pos2"]])
  )

  # Placeholder rows; kept when the locus is empty or coloc fails.
  locus_pph <- data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                          PP.H3.abf = 0, PP.H4.abf = 0, locus = locus)
  locus_res <- data.frame(snp = "none", V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                          V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                          internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)

  if (nrow(temp_df) > 0) {
    # Variants with a non-positive standard error in either trait are dropped.
    temp_df <- filter(temp_df, SE > 0, se > 0)

    # First trait quantitative; second trait case-control, case fraction 0.067.
    s_pheno <- 0.067
    data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2,
                  N = temp_df$TOTALSAMPLESIZE, type = "quant",
                  snp = temp_df$ID, MAF = temp_df$MAF)
    data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                  type = "cc", snp = temp_df$ID, s = s_pheno, MAF = temp_df$maf)

    # A failed run is reported on stderr instead of being swallowed silently.
    myres <- tryCatch(
      suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
      error = function(e) {
        message("coloc.abf failed for ", locus, ": ", conditionMessage(e))
        NULL
      }
    )

    if (!is.null(myres)) {
      # Element 1 is the posterior summary, element 2 the per-variant table.
      locus_pph <- data.frame(t(myres[[1]]))
      locus_pph$locus <- locus
      locus_res <- myres[[2]]
      locus_res$locus <- locus
    }
  }

  pph_list[[i]] <- locus_pph
  res_list[[i]] <- locus_res
}

# ---- Write results ----
pph <- data.frame(do.call("rbind", pph_list))
res <- data.frame(do.call("rbind", res_list))

write.table(pph, snakemake@output[[1]], sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(res, snakemake@output[[2]], sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
library(data.table)
library(dplyr)
library(coloc)

# Colocalisation of two traits in a window around each lead variant; one
# summary row and one per-variant table per locus are collected and written at
# the end.

# ---- Priors passed to coloc.abf() as p1, p2 and p12 ----
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# Half-width of the window around each lead variant, in base pairs. The
# previous code used -250000 on the left but +25000 on the right, which looks
# like a dropped zero; both sides now use the same value.
window_bp <- 250000

# ---- Summary statistics ----
d <- fread(snakemake@input[[1]], select = c("ID", "CHR", "POS", "TOTALSAMPLESIZE", "BETA", "SE", "pvalue", "EAF"))
d <- select(d, ID, CHR, POS, TOTALSAMPLESIZE, BETA, SE, pvalue, EAF)
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

x <- fread(snakemake@input[[2]], select = c("ID", "REF", "EFF", "BETA", "EAF", "SE", "N", "pvalue"))

# The effect sign is flipped wherever REF sorts after EFF.
x$BETA <- ifelse(x$REF > x$EFF, -1 * x$BETA, x$BETA)

x <- select(x, ID, N, BETA, SE, pvalue, EAF)
x$MAF <- ifelse(x$EAF > 0.5, 1 - x$EAF, x$EAF)
# Positional rename to lower-case names, distinct from the first trait's.
names(x) <- c("ID", "N", "beta", "se", "p", "eaf", "maf")

d <- inner_join(d, x, by = "ID")

# When the second trait has no allele frequency at all, reuse the first trait's.
if (all(is.na(d$eaf))) {
  d$maf <- d$MAF
}

# ---- Loci to test ----
# Chromosome X is recoded as 23 before the integer conversion.
z <- fread(snakemake@input[[3]])
z$CHR <- ifelse(z$CHR == "X", "23", z$CHR)
z$CHR <- as.integer(z$CHR)

# ---- Trait types (depend only on the input paths, so resolved once) ----
gwas_file <- snakemake@input[[1]]
trait_file <- snakemake@input[[2]]

# First trait: quantitative unless the path marks it as allPTD or postTerm,
# which are analysed as case-control with a fixed case fraction.
gwas_s <- NULL
if (grepl("allPTD", gwas_file)) {
  gwas_s <- 0.067
} else if (grepl("postTerm", gwas_file)) {
  gwas_s <- 0.122
}

# Second trait: case-control when its path matches one of these patterns (the
# value is cases / (cases + controls)); otherwise quantitative. If several
# patterns match, the last one wins, as in the original chain of `if`s.
case_fraction <- c(
  PCOS = (1184 + 670 + 157 + 658 + 984 + 485 + 462) /
    (1184 + 670 + 157 + 658 + 984 + 485 + 462 +
       5799 + 1379 + 2807 + 6774 + 2963 + 407 + 96172),
  miscarriage = 49996 / (174109 + 49996),
  POP = 7053 / (57407 + 7053),
  endometriosis = 1496 / (192678 + 1496),
  Preeclampsia = 4630 / (4630 + 373345),
  leiomyoma_uterus = 14569 / (85792 + 14569)
)
hit <- vapply(names(case_fraction), grepl, logical(1), x = trait_file)
matched <- names(case_fraction)[hit]
s_pheno <- if (length(matched) > 0) case_fraction[[tail(matched, 1)]] else NULL

# ---- Per-locus colocalisation ----
n_loci <- nrow(z)
pph_list <- vector("list", n_loci)
res_list <- vector("list", n_loci)

# seq_len() rather than 1:nrow(z), so an empty locus table skips the loop
# instead of iterating over c(1, 0).
for (i in seq_len(n_loci)) {
  row <- z[i, ]
  locus <- paste0("chr", row[["CHR"]], "_", row[["nearestGene"]])
  lead_pos <- as.integer(row[["POS"]])
  temp_df <- filter(
    d,
    CHR == as.integer(row[["CHR"]]),
    POS >= lead_pos - window_bp,
    POS <= lead_pos + window_bp
  )

  # Placeholder rows; kept when the locus is empty or coloc fails.
  locus_pph <- data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                          PP.H3.abf = 0, PP.H4.abf = 0, locus = locus)
  locus_res <- data.frame(snp = "none", V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                          V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                          internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)

  if (nrow(temp_df) > 0) {
    # Variants with a non-positive standard error in either trait are dropped.
    temp_df <- filter(temp_df, SE > 0, se > 0)

    data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2,
                  N = temp_df$TOTALSAMPLESIZE, type = "quant",
                  snp = temp_df$ID, MAF = temp_df$MAF)
    if (!is.null(gwas_s)) {
      data1$type <- "cc"
      data1$s <- gwas_s
    }

    data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                  type = "quant", snp = temp_df$ID, MAF = temp_df$maf)
    if (!is.null(s_pheno)) {
      data2$type <- "cc"
      data2$s <- s_pheno
    }

    # A failed run is reported on stderr instead of being swallowed silently.
    myres <- tryCatch(
      suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
      error = function(e) {
        message("coloc.abf failed for ", locus, ": ", conditionMessage(e))
        NULL
      }
    )

    if (!is.null(myres)) {
      # Element 1 is the posterior summary, element 2 the per-variant table.
      locus_pph <- data.frame(t(myres[[1]]))
      locus_pph$locus <- locus
      locus_res <- myres[[2]]
      locus_res$locus <- locus
    }
  }

  pph_list[[i]] <- locus_pph
  res_list[[i]] <- locus_res
}

# ---- Write results ----
pph <- data.frame(do.call("rbind", pph_list))
res <- data.frame(do.call("rbind", res_list))

write.table(pph, snakemake@output[[1]], sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(res, snakemake@output[[2]], sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
14
15
script:
	'coloc.R'
23
24
25
26
27
28
29
30
31
run:
	df_list= list()
	for i in input:
		d= pd.read_csv(i, sep= '\t', header= 0)
		x= i.split('pph_')[1].replace('.txt', '')
		d['trait']= x
		df_list.append(d)
	d= pd.concat(df_list)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
39
40
41
42
43
44
45
46
47
run:
        df_list= list()
        for i in input:
                d= pd.read_csv(i, sep= '\t', header= 0)
                x= i.split('results_')[1].replace('.txt', '')
                d['trait']= x
                df_list.append(d)
        d= pd.concat(df_list)
        d.to_csv(output[0], sep= '\t', header= True, index= False)
61
62
script:
        'coloc_GA_vs_PTD.R'
75
76
script:
	'coloc_GA_vs_PTD_GW.R'
89
90
script:
        'coloc_BW_GA_GW.R'
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
import numpy as np


def flip_beta(df):
        """Orient every row so that REF does not sort after EFF.

        For rows where REF > EFF (string comparison) the sign of BETA is flipped
        and the REF and EFF values are swapped. Columns used: BETA, REF, EFF.
        The frame is modified in place and also returned.
        """
        swap = df.REF > df.EFF
        # Snapshot both allele columns first: each is overwritten using the other.
        ref_before, eff_before = df.REF.copy(), df.EFF.copy()
        df['BETA'] = np.where(swap, -1 * df.BETA, df.BETA)
        df['REF'] = np.where(swap, eff_before, ref_before)
        df['EFF'] = np.where(swap, ref_before, eff_before)
        return df

def add_ID(x):
	"""Recode indels as I/D, build the variant ID and orient the alleles.

	The longer allele of an indel becomes 'I' and the other one 'D'; alleles of
	equal length are left as they are. ID is 'CHR:POS:a:b' with the two allele
	codes in sorted order. The frame is then passed through flip_beta(), which
	flips BETA and swaps REF/EFF where REF sorts after EFF.

	Columns used: CHR, POS, REF, EFF (and BETA via flip_beta). The frame is
	modified in place and also returned.
	"""
	# Compare the ORIGINAL allele lengths before either column is overwritten.
	# Previously the second comparison ran after REF had already become 'I'
	# (length 1), so a deletion whose EFF allele had two or more bases ended up
	# with REF='D', EFF='I' instead of REF='I', EFF='D'.
	ref_longer= x.REF.str.len() > x.EFF.str.len()
	eff_longer= x.REF.str.len() < x.EFF.str.len()
	x['REF']= np.where(ref_longer, 'I', x.REF)
	x['EFF']= np.where(eff_longer, 'I', x.EFF)
	# Whatever sits opposite an 'I' is the deletion.
	x['REF']= np.where(x.EFF== 'I', 'D', x.REF)
	x['EFF']= np.where(x.REF== 'I', 'D', x.EFF)
	prefix= x.CHR.apply(str) + ':' + x.POS.apply(str) + ':'
	x['ID']= np.where(x.REF> x.EFF, prefix + x.EFF + ':' + x.REF, prefix + x.REF + ':' + x.EFF)
	x= flip_beta(x)
	return x


def format_df(x, reg):
	"""Split a comma-separated haplotype results file into one table per haplotype.

	x is the path of the input file and reg a region table with columns CHR,
	pos1 and pos2. Rows are kept when their position lies inside a region on
	the same chromosome. The h1, h2 and h3 estimates are written, in that
	order, to snakemake.output[0], [1] and [2] as tab-separated files.
	"""
	out_cols= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
	d= pd.read_csv(x, sep= ',', header= 0)
	# Chromosome as text so it can be matched against the region table.
	d['chr']= d.chr.apply(str)
	d= pd.merge(d, reg, left_on= 'chr', right_on= 'CHR')
	inside= (d.pos >= d.pos1) & (d.pos<= d.pos2)
	d= d.loc[inside, :]
	tables= []
	for hap in ('h1', 'h2', 'h3'):
		sub= d.loc[:, ['chr', 'pos', 'ref', 'alt', hap + '.coef', hap + '.se', hap + '.pval']]
		sub.columns= ['CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
		tables.append(sub)
	tables= [add_ID(sub) for sub in tables]
	for k, sub in enumerate(tables):
		sub.to_csv(snakemake.output[k], sep= '\t', header= True, index= False, columns= out_cols)
	print('Completed file:' + x)

# Region table: first Snakemake input, tab-separated with a header row.
regions= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)

# Second Snakemake input is split into the per-haplotype output files,
# keeping only positions that fall inside those regions.
format_df(snakemake.input[1], regions)
29
30
script:
	'format_CCHMC_haplotype.py'
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['contig', 'position', 'testedAllele', 'otherAllele', 'h.Bmnt', 'h.Bmnt.se', 'h.Bmnt.p', 'h.Bmt', 'h.Bmt.se', 'h.Bmt.p', 'h.Bft', 'h.Bft.se', 'h.Bft.p'])[['contig', 'position', 'testedAllele', 'otherAllele', 'h.Bmnt', 'h.Bmnt.se', 'h.Bmnt.p', 'h.Bmt', 'h.Bmt.se', 'h.Bmt.p', 'h.Bft', 'h.Bft.se', 'h.Bft.p']]
	d.columns= ['CHR', 'POS', 'EFF', 'REF', 'h2_beta', 'h2_se', 'h2_pvalue', 'h1_beta', 'h1_se', 'h1_pvalue', 'h3_beta', 'h3_se', 'h3_pvalue']
	h1= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'h1_beta', 'h1_se', 'h1_pvalue']]
	h1.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue']
	h2= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'h2_beta', 'h2_se', 'h2_pvalue']]
	h2.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue']
	h3= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'h3_beta', 'h3_se', 'h3_pvalue']]
	h3.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue']
	h1= add_ID(h1)
	h2= add_ID(h2)
	h3= add_ID(h3)
	h1.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])
	h2.to_csv(output[1], sep= '\t', header= True, index= False, columns= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])
	h3.to_csv(output[2], sep= '\t', header= True, index= False, columns= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])
64
65
66
67
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d[['CHR', 'POS', 'REF', 'EFF']]= d.snp.str.split(':', expand= True)
	h1= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'beta_h1', 'se_h1', 'pvalue_h1']]
90
91
shell:
	'/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
run:
	# Merge three result tables (h1, h2, h3) on MarkerName into one wide table.
	# Only h1 contributes the Direction column.
	h1= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value', 'Direction', 'HetISq', 'HetPVal'])
	h2= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal'])
	h3= pd.read_csv(input[2], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal'])
	# 6 minus the number of '?' characters in Direction.
	# NOTE(review): presumably 6 is the number of contributing cohorts and '?' marks a missing one -- confirm.
	h1['N_cohorts']= 6 - h1['Direction'].apply(lambda x: str.count(x, '?'))
	# Positional rename; this assumes the columns arrive in the order listed in usecols -- TODO confirm against the input files.
	h1.columns= ['MarkerName', 'Allele1', 'Allele2', 'beta_h1', 'se_h1', 'pvalue_h1', 'Direction_h1', 'HetISq_h1', 'HetPval_h1', 'n_cohorts']
	# Effects are negated wherever Allele2 sorts after Allele1 (same rule for h1, h2 and h3).
	h1['beta_h1']= np.where(h1.Allele2> h1.Allele1, -1 * h1.beta_h1, h1.beta_h1)
	h2.columns= ['MarkerName', 'Allele1', 'Allele2', 'beta_h2', 'se_h2', 'pvalue_h2', 'HetISq_h2', 'HetPval_h2']
	h2['beta_h2']= np.where(h2.Allele2> h2.Allele1, -1 * h2.beta_h2, h2.beta_h2)
	h3.columns= ['MarkerName', 'Allele1', 'Allele2', 'beta_h3', 'se_h3', 'pvalue_h3', 'HetISq_h3', 'HetPval_h3']
	h3['beta_h3']= np.where(h3.Allele2> h3.Allele1, -1 * h3.beta_h3, h3.beta_h3)
	# Inner joins: only markers present in all three tables are kept. The allele columns come from h1.
	d= pd.merge(h1, h2[['MarkerName', 'beta_h2', 'se_h2', 'pvalue_h2', 'HetISq_h2', 'HetPval_h2']], on= ['MarkerName'], how= 'inner')
	d= pd.merge(d, h3[['MarkerName', 'beta_h3', 'se_h3', 'pvalue_h3', 'HetISq_h3', 'HetPval_h3']], on= ['MarkerName'], how= 'inner')
	d= d[['MarkerName', 'Allele1', 'Allele2', 'beta_h1', 'se_h1', 'pvalue_h1', 'Direction_h1', 'HetISq_h1', 'HetPval_h1', 'n_cohorts', 'beta_h2', 'se_h2', 'pvalue_h2', 'HetISq_h2', 'HetPval_h2', 'beta_h3', 'se_h3', 'pvalue_h3', 'HetISq_h3', 'HetPval_h3']]
	# Swap the alleles on the rows whose effects were negated above, so Allele1 ends up as the allele that sorts last.
	d['Allele1'], d['Allele2']= np.where(d.Allele2> d.Allele1, [d.Allele2, d.Allele1], [d.Allele1, d.Allele2])
	d.to_csv(output[0], sep= '\t', header= True, index= False)
 9
10
11
12
13
14
15
16
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue'])
	d.drop_duplicates(['ID'], keep= 'first', inplace= True)
	d.sort_values('pvalue', inplace= True, ascending= True)
	d= d.iloc[0:99999, :]
	d= d[['RSID', 'CHR', 'POS', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue']]
	d.columns= ['CHR', 'POS', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'PVALUE']
	d.to_csv(output[0], sep= '\t', header= True, index= False)
25
26
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue'])
45
46
47
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue'])
        d.drop_duplicates(['ID'], keep= 'first', inplace= True)
65
66
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue'])
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	top_list= list()
	non_top_list= list()
	for lname in set(d.locus):
		df_temp= d.loc[d.locus== lname, :]
		df_temp.sort_values(['PP'], ascending= False, inplace= True)
		df_temp['PPcum']= df_temp.PP.cumsum()
		top_vars= df_temp.loc[df_temp.PPcum< 0.95, :]
		non_top= df_temp.loc[df_temp.PPcum>= 0.95, :]
		top_list.append(top_vars)
		non_top_list.append(non_top)
	top= pd.concat(top_list)
	non_top= pd.concat(non_top_list)
	top.to_csv(output[0], sep= '\t', header= True, index= False)
	non_top.to_csv(output[1], sep= '\t', header= True, index= False)
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
run:
	d= pd.read_csv(input[0], sep='\t', header= 0)
	pli= pd.read_csv(input[1], header= 0, sep= '\t', usecols= ['gene_id', 'gene', 'chromosome', 'start_position', 'end_position', 'pLI'])[['gene_id', 'gene', 'chromosome', 'start_position', 'end_position', 'pLI']]
	d= d.loc[d.nearestGene.isin(pli.gene.values), :]
	pli.columns= ['EID', 'gene', 'CHR', 'start', 'end', 'pLI']
	pli.dropna(subset= ['pLI'], inplace= True)
	pli_genes= pli.loc[pli.pLI>= 0.9, 'gene'].values.tolist()
	df= d.loc[d.nearestGene.isin(pli_genes), :]
	b= len(pli_genes) - df.shape[0]
	c= d.shape[0] - df.shape[0]
	d= df.shape[0]
	a= pli.shape[0] - b - d - c
	oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater')
	z= ['pli', a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue]
	with open(output[0], 'w') as file_handler:
		file_handler.write('\t'.join([str(item) for item in z]) + '\n')
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	rna= pd.read_csv(input[1], sep= '\t', header= 0)
	rna['GA']= np.where(rna['Gene name'].isin(d.nearestGene.values), 1, 0)
	rna['NX_rk']= rna.groupby('Gene name')['NX'].rank('average', ascending= True)
	df_list= list()
	for tissue in set(rna.Tissue):
		ilist= rna.loc[((rna.GA== 1) & (rna.Tissue == tissue)), 'NX_rk']
		base= rna.loc[((rna.GA== 0) & (rna.Tissue == tissue)), 'NX_rk']
		mannw_pvalue= st.mannwhitneyu(ilist, base, alternative= 'greater')[1]
		i_median= np.median(ilist)
		base_median= np.median(base)
		df_list.append([tissue, i_median, base_median, mannw_pvalue])
	z= pd.DataFrame.from_records(df_list)
	z.to_csv(output[0], sep= '\t', header= ['tissue', 'i_listmedian', 'base_list_median', 'MannW_pvalue'], index= False)
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
run:
	pheno= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'nearestGene'])
	x= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'start', 'end', 'gene', 'EnsembleID'])
	add= [line.strip() for line in open(input[2], 'r')]
	df= pheno.loc[pheno.nearestGene.isin(add), :]
	b= len(add) - df.shape[0]
	c= pheno.shape[0] - df.shape[0]
	d= df.shape[0]
	a= x.shape[0] - b - d - c
	oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater')
	z= ['dominant', a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue]
	with open(output[0], 'w') as file_handler:
		file_handler.write('\t'.join([str(item) for item in z]) + '\n')
	rec= [line.strip() for line in open(input[3], 'r')]
	df= pheno.loc[pheno.nearestGene.isin(rec), :]
	b= len(rec) - df.shape[0]
	c= pheno.shape[0] - df.shape[0]
	d= df.shape[0]
	a= x.shape[0] - b - d - c
	oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater')
	z= ['recessive', a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue]
	with open(output[0], 'a') as file_handler:
		file_handler.write('\t'.join([str(item) for item in z]) + '\n')
134
135
136
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'nearestGene'])
	stc= pd.read_csv(input[1], header= 0, sep= '\t', usecols= ['geneid', 'log2FoldChange', 'pvalue'])[['geneid', 'log2FoldChange', 'pvalue']]
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
run:
	pheno= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'nearestGene'])
	x= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'start', 'end', 'gene', 'EnsembleID'])
	lab= pd.read_csv(input[2], sep= '\t', header= 0)
	for i in set(lab.Cell_type):
		temp_df= lab.loc[lab.Cell_type== i, :]
		df= pheno.loc[pheno.nearestGene.isin(temp_df.gene_name.to_list()), :]
		b= len(temp_df.gene_name.to_list()) - df.shape[0]
		c= pheno.shape[0] - df.shape[0]
		d= df.shape[0]
		a= x.shape[0] - b - d - c
		oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater')
		z= [i, a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue]
		with open(output[0], 'a') as file_handler:
			file_handler.write('\t'.join([str(item) for item in z]) + '\n')
	df= pheno.loc[pheno.nearestGene.isin(lab.gene_name.to_list()), :]
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# ---- GWAS summary statistics: one row per RSID ----
df <- fread(snakemake@input[[1]], select = c("RSID", "BETA", "SE", "TOTALSAMPLESIZE", "EAF"))

df <- filter(df, !duplicated(RSID))

df$MAF <- ifelse(df$EAF > 0.5, 1 - df$EAF, df$EAF)

# ---- Second dataset, joined on its SNP column ----
z <- fread(snakemake@input[[2]])
# Fixed sample size used for every row of the second dataset.
z$n <- 206
# Minor allele frequency. The condition used to be `Freq < 0.5`, which returned
# the MAJOR allele frequency (values >= 0.5), unlike the MAF line for the GWAS
# data above.
z$maf <- ifelse(z$Freq > 0.5, 1 - z$Freq, z$Freq)
df <- inner_join(df, z, by = c("RSID" = "SNP"))

rm(z)

# ---- Output files ----
# Only the header line is written here; colocalization_eqtl() appends the rows.
pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

cat("nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n", file = pph_outfile)

cat("snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n", file = results_outfile)

# ---- Priors passed to coloc.abf() as p1, p2 and p12 ----
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# Plain data.frame, so that split() below yields one data.frame per gene.
df <- data.frame(df)

#' Colocalise the GWAS signal with one gene and append the outcome to the two
#' output files.
#'
#' Reads the script-level objects `prior1`, `prior2`, `prior12`,
#' `pph_outfile`, `results_outfile` and `snakemake`. When `temp_df` is empty or
#' `coloc.abf()` fails, a placeholder row (zeros, snp = 'none') is appended to
#' both files and 'next' is printed.
#'
#' @param temp_df Rows of the joined table belonging to a single gene.
#' @return The value of the last call made (`print()` or `fwrite()`); the
#'   function is used for its side effects.
colocalization_eqtl <- function(temp_df) {
  protein <- unique(temp_df$Gene)

  # Append a placeholder row to each file and announce it.
  write_placeholder <- function() {
    PPH <- data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                      PP.H3.abf = 0, PP.H4.abf = 0, protein = protein)
    fwrite(PPH, pph_outfile, sep = '\t', row.names = FALSE, col.names = FALSE,
           quote = FALSE, append = TRUE)
    res <- data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                      V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                      internal.sum.lABF = 0, SNP.PP.H4 = 0, protein = protein)
    fwrite(res, results_outfile, sep = '\t', row.names = FALSE, col.names = FALSE,
           quote = FALSE, append = TRUE)
    print('next')
  }

  if (nrow(temp_df) == 0) {
    return(write_placeholder())
  }

  # Variants with a non-positive standard error in either dataset are dropped.
  temp_df <- filter(temp_df, SE > 0, se > 0)

  # GWAS trait: quantitative unless the input path marks it as allPTD or
  # postTerm, which are analysed as case-control with a fixed case fraction.
  gwas_file <- snakemake@input[[1]]
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2,
                N = temp_df$TOTALSAMPLESIZE, type = 'quant', snp = temp_df$RSID)
  if (grepl('allPTD', gwas_file)) {
    data1$type <- 'cc'
    data1$s <- 0.067
  } else if (grepl('postTerm', gwas_file)) {
    data1$type <- 'cc'
    data1$s <- 0.122
  }
  data1$MAF <- temp_df$MAF

  # Second dataset is always treated as quantitative.
  data2 <- list(beta = temp_df$b, varbeta = temp_df$se^2, N = temp_df$n,
                type = 'quant', snp = temp_df$RSID, MAF = temp_df$maf)

  # On error the handler returns the scalar 0, which is told apart from a real
  # result by its length.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) 0
  )
  if (length(myres) == 1) {
    return(write_placeholder())
  }

  # Element 1 is the posterior summary, element 2 the per-variant table.
  PPH <- data.frame(t(myres[[1]]))
  PPH$protein <- protein
  fwrite(PPH, pph_outfile, sep = '\t', row.names = FALSE, col.names = FALSE,
         quote = FALSE, append = TRUE)
  res <- myres[[2]]
  res$protein <- protein
  fwrite(res, results_outfile, sep = '\t', row.names = FALSE, col.names = FALSE,
         quote = FALSE, append = TRUE)
}

# One task per gene, spread over three worker processes.
mclapply(X = split(df, df$Gene), FUN = colocalization_eqtl, mc.cores = 3)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

df= fread(snakemake@input[[1]], select= c('ID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF'))

df= filter(df, !duplicated(ID))

df$MAF= ifelse(df$EAF>0.5, 1 - df$EAF, df$EAF)

z= fread(snakemake@input[[2]], select= c('gene_id', 'ID', 'maf', 'slope', 'slope_se'))


z$n= with(z, ifelse(grepl('Ovary', snakemake@input[[2]]), 167, ifelse(grepl('Uterus', snakemake@input[[2]]), 269, 141)))


df= inner_join(df, z, by= 'ID')

rm(z)

pph_outfile= snakemake@output[[1]]
results_outfile= snakemake@output[[2]]


cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = snakemake@output[[1]])

cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file= snakemake@output[[2]])

prior1= 1 * 10**-4
prior2= 1 * 10**-4
prior12= 5 * 10**-6


df= data.frame(df)

# Colocalization (coloc.abf) between the outcome GWAS and the eQTL statistics
# of a single gene.
#
# Args:
#   temp_df: data.frame with the joined summary statistics of one `gene_id`.
#            Columns read: gene_id, ID, BETA, SE, TOTALSAMPLESIZE, MAF (GWAS)
#            and slope, slope_se, n, maf (eQTL).
#
# Side effects: appends the coloc summary to `pph_outfile` and the per-SNP
# results to `results_outfile`, both tagged with the gene id (column
# `protein`). When there are no rows, or coloc.abf raises an error, an
# all-zero placeholder row is appended to both files instead.
colocalization_eqtl= function(temp_df){
	protein= unique(temp_df$gene_id)
	# An empty input carries no label; keep the placeholder exactly one row long.
	if (length(protein)== 0) protein= NA

	# Append the all-zero placeholder rows used when no coloc result is available.
	write_placeholder= function() {
		PPH= data.frame(nsnps= 0, PP.H0.abf= 0, PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, protein= protein)
		fwrite(PPH, pph_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
		res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, protein= protein)
		fwrite(res, results_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
		print('next')
	}

	if (nrow(temp_df)== 0) return(write_placeholder())

	# Drop variants with a non-positive standard error in either dataset.
	temp_df= filter(temp_df, SE> 0, slope_se> 0)

	# The outcome type is inferred from the GWAS file name: 'allPTD' and
	# 'postTerm' are analysed as case-control, anything else as quantitative.
	gwas_file= snakemake@input[[1]]
	data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N= temp_df$TOTALSAMPLESIZE, type= 'quant', snp= temp_df$ID, MAF= temp_df$MAF)
	if (grepl('allPTD', gwas_file)) {
		data1$type= 'cc'
		data1$s= 0.067  # coloc `s`: proportion of samples that are cases
	} else if (grepl('postTerm', gwas_file)) {
		data1$type= 'cc'
		data1$s= 0.122
	}

	data2= list(beta= temp_df$slope, varbeta= temp_df$slope_se**2, N= temp_df$n, type= 'quant', snp= temp_df$ID, MAF= temp_df$maf)

	myres= tryCatch(
		suppressWarnings(coloc.abf(data1, data2, p1= prior1, p2= prior2, p12= prior12)),
		error= function(e) {
			# Report the failure instead of hiding it, then fall back to the placeholder.
			message('coloc.abf failed for ', paste(protein, collapse= ','), ': ', conditionMessage(e))
			NULL
		})
	if (is.null(myres)) return(write_placeholder())

	# First element: summary vector (nsnps and PP.H0-H4); second: per-SNP results.
	PPH= data.frame(t(myres[[1]]))
	PPH$protein= protein
	fwrite(PPH, pph_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
	res= myres[[2]]
	res$protein= protein
	fwrite(res, results_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
}



mclapply(split(df, df$gene_id), colocalization_eqtl, mc.cores= 3)
11
12
script:
        'coloc_endometrium.R'
20
21
22
23
24
25
26
27
run:
	# Collect the 'ID' column of every input file and write the de-duplicated
	# set of IDs to the first output as a tab-separated file with a header.
	df_list= list()
	for i in input:
		# Only the 'ID' column is read from each tab-separated input.
		d= pd.read_csv(i, header= 0, sep= '\t', usecols= ['ID'])
		df_list.append(d)
	x= pd.concat(df_list)
	# Keep the first occurrence of each ID.
	x.drop_duplicates('ID', keep= 'first', inplace= True)
	x.to_csv(output[0], sep= '\t', header= True, index= False)
36
37
38
39
40
41
42
43
44
45
46
run:
	# Stream the gzipped table in input[0] in chunks, build a variant ID for
	# each row and append the rows whose ID is present in input[1] to output[0].
	x= ['ID', 'gene_id', 'maf', 'slope', 'slope_se']
	# Write the header once; the chunks below are appended without a header.
	with open(output[0], mode="w") as file:
		file.write("\t".join(x) + "\n")
	# Table of IDs to keep (merged on the 'ID' column below).
	d= pd.read_csv(input[1], sep= '\t', header= 0)
	for chunk in pd.read_csv(input[0], sep= '\t', header= 0, chunksize= 500000, compression= 'gzip', usecols= ['gene_id', 'variant_id', 'maf', 'slope', 'slope_se']) :
		# variant_id is split on '_' into five fields: CHR, POS, REF, EFF, build.
		chunk[['CHR', 'POS', 'REF', 'EFF', 'build']]= chunk.variant_id.str.split('_', expand= True)
		# ID is CHR:POS:allele:allele with the two alleles in alphabetical order.
		chunk['ID']= np.where(chunk.REF> chunk.EFF, chunk.CHR + ':' + chunk.POS + ':' + chunk.EFF + ':' + chunk.REF, chunk.CHR + ':' + chunk.POS + ':' + chunk.REF + ':' + chunk.EFF)
		chunk= chunk[['ID', 'gene_id', 'maf', 'slope', 'slope_se']]
		# pd.merge defaults to an inner join, so only IDs found in d are kept.
		chunk= pd.merge(chunk, d, on= 'ID')
		chunk.to_csv(output[0], sep= '\t', header= False, index= False, mode= 'a')
60
61
script:
        'coloc_GTEx.R'
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import numpy as np


def flip_beta(df):
        'Order the alleles alphabetically: where REF > EFF, swap REF and EFF and flip the sign of BETA. Assumed column names: BETA, REF, EFF. Modifies df in place and returns it.'
        swap= df.REF> df.EFF
        df['BETA']= np.where(swap, -1 * df.BETA, df.BETA)
        # Take copies first so the second assignment still sees the original REF.
        ref, eff= df.REF.copy(), df.EFF.copy()
        df['REF']= np.where(swap, eff, ref)
        df['EFF']= np.where(swap, ref, eff)
        return df

def add_ID(x):
        'Recode indels as I/D (the longer allele becomes I, the other D), add an ID column CHR:POS:allele:allele with the alleles in alphabetical order, and align REF/EFF/BETA to that order via flip_beta. Assumed column names: CHR, POS, REF, EFF, BETA. Modifies x in place and returns it.'
        # Measure the allele lengths once, before any recoding. Re-measuring
        # after REF had been overwritten with 'I' mis-coded indels whose
        # shorter allele has more than one base (e.g. REF=ACT, EFF=AC).
        ref_len= x.REF.str.len()
        eff_len= x.EFF.str.len()
        ref_longer= ref_len> eff_len
        eff_longer= ref_len< eff_len
        x['REF']= np.where(ref_longer, 'I', np.where(eff_longer, 'D', x.REF))
        x['EFF']= np.where(eff_longer, 'I', np.where(ref_longer, 'D', x.EFF))
        x['ID']= np.where(x.REF> x.EFF, x.CHR.apply(str) + ':' + x.POS.apply(str) + ':' + x.EFF + ':' + x.REF, x.CHR.apply(str) + ':' + x.POS.apply(str) + ':' + x.REF + ':' + x.EFF)
        x= flip_beta(x)
        return x


def format_df(x):
	'Read the comma-separated haplotype results in x, keep the rows at chr 2, position 113521754, and write one tab-separated file per haplotype (h1, h2, h3) to snakemake.output[0..2].'
	out_cols= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
	raw= pd.read_csv(x, sep= ',', header= 0)
	raw['chr']= raw.chr.apply(str)
	# Single variant of interest.
	at_variant= (raw.chr== '2') & (raw.pos== 113521754)
	raw= raw.loc[at_variant, :]
	# Extract the coefficient, standard error and p-value of each haplotype.
	per_haplotype= []
	for hap in ['h1', 'h2', 'h3']:
		sub= raw.loc[:, ['chr', 'pos', 'ref', 'alt', hap + '.coef', hap + '.se', hap + '.pval']]
		sub.columns= ['CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
		per_haplotype.append(sub)
	# Add the ID column and align alleles before anything is written.
	per_haplotype= [add_ID(sub) for sub in per_haplotype]
	for idx, sub in enumerate(per_haplotype):
		sub.to_csv(snakemake.output[idx], sep= '\t', header= True, index= False, columns= out_cols)
	print('Completed file:' + x)


format_df(snakemake.input[0])
 9
10
script:
        'format_CCHMC_haplotype.py'
20
21
22
23
24
25
run:
	d= pd.read_csv(input[0], sep= ' ', header= 0)
	d['CHR']= 2
	d['POS']= 113521754
	d['REF']= 'C'
	d['EFF']= 'T'
47
48
49
50
51
52
53
run:
        d= pd.read_csv(input[0], sep= ' ', header= 0)
        d[['CHR', 'POS', 'REF', 'EFF']]= d.snp.str.split(':', expand= True)
        h1= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'beta_h1', 'se_h1', 'pvalue_h1']]
        h1.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue']
        h2= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'beta_h2', 'se_h2', 'pvalue_h2']]
        h2.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue']
73
74
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
84
85
86
87
88
run:
	h1= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value'])
	h1.columns= ['ID', 'EFF', 'REF', 'beta_MT', 'se_MT', 'pvalue_MT']
	h1['beta_MT']= np.where(h1.REF > h1.EFF, -1 * h1.beta_MT, h1.beta_MT)
	h2= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value'])
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)




d= fread(snakemake@input[[1]])
d1= fread(snakemake@input[[2]])

d$beta= as.numeric(d$beta)
d$se= as.numeric(d$se)
d$pvalue= as.numeric(d$pvalue)

d1= filter(d1, PP.H4.abf> 0.75)
d= filter(d, pheno %in% d1$pheno_FINNGEN)

mani= fread(snakemake@input[[3]], select= c('phenocode', 'name'))
names(mani)= c('pheno', 'description')

d= inner_join(d, mani, by= 'pheno')

x= fread(snakemake@input[[4]])

x1= fread(snakemake@input[[5]])
x1= filter(x1, PP.H4.abf> 0.75)

x= filter(x, pheno %in% x1$pheno_PAN_UKBB)

mani=fread(snakemake@input[[6]], select= c('phenocode', 'trait_type', 'description'))
mani$pheno= paste(mani$trait_type, mani$phenocode, sep= '_')

x= inner_join(x, mani, by= 'pheno')

d$zscore= d$beta / d$se
x$zscore= x$beta / x$se

d= select(d, pheno, description, zscore, pvalue, trait)
x= select(x, pheno, description, zscore, pvalue, trait)
d= bind_rows(d, x)




d$zscore= ifelse(d$zscore> 10, 10, ifelse(d$zscore< -10, -10, d$zscore))

d$trait= ifelse(d$trait== 'Gestational duration', 'rs28654158 (gestational duration)', 'rs11708067 (birth weight)')

d$trait= factor(d$trait, levels= rev(c('rs28654158 (gestational duration)', 'rs11708067 (birth weight)')))

# Shorten long phenotype descriptions for plotting. Each row is a regular
# expression and the label that replaces any description matching it; rows are
# applied in order.
short_labels= rbind(
  c('Other diabetes', 'Other diabetes'),
  c('Non-insulin-dep', 'Non-insulin dependent diabetes'),
  c('Diabetes, varying def', 'Diabetes, wide'),
  c('Intestinal adhesions', 'Intestinal adhesions'),
  c('Type 2 diabetes, strict', 'Type 2 diabetes'),
  c('Type 2 diabetes with other specified/multiple/unspecified complications', 'Type 2 diabetes with complications'),
  c('Diabetes, insuline treatment', 'Diabetes, insuline treatment'),
  c('Creatinine', 'Creatinine in urine')
)

for (k in seq_len(nrow(short_labels))) {
  matched= grepl(short_labels[k, 1], d$description)
  d$description= ifelse(matched, short_labels[k, 2], d$description)
}

ord <- hclust( dist(d$zscore, method = "euclidean"), method = "ward.D" )$order
d= d[ord, ]
d$description= factor(d$description, levels= unique(d$description))


# Heatmap: one tile per trait x phenotype, filled by the rounded z-score;
# tiles with pvalue >= 5e-6 are drawn more transparent. Legends are hidden
# with guide= 'none' (guide= FALSE is deprecated in ggplot2).
p1= ggplot(d, aes(y= trait, x= description, fill= round(zscore), alpha= factor(as.numeric(pvalue< 5e-6)))) +
geom_tile(colour = "white", size= 1) +
theme_cowplot(font_size= 8) +
scale_alpha_discrete(guide= 'none', range= c(0.3, 1)) +
scale_fill_gradient2(low= colorBlindBlack8[3], high= colorBlindBlack8[8], mid= 'white', guide= 'none') +
theme(  axis.text.x= element_text(hjust= 1, angle= 45),
	axis.text.y= element_text(),
	axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        plot.margin= unit(c(t= 0, r= 0, b= 0, l= 0), units= 'cm'),
	axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3)) +
coord_equal() +
labs(x = NULL, y = NULL)




ggsave(snakemake@output[[1]], p1, height= 100, width= 127, units= 'mm', dpi= 300)

fwrite(d, snakemake@output[[2]], sep= '\t')


# Same heatmap with the fill legend shown and the phenotype labels on top.
# The alpha legend is hidden with guide= 'none' (guide= FALSE is deprecated
# in ggplot2).
p1= ggplot(d, aes(description, trait, fill= round(zscore), alpha= factor(as.numeric(pvalue< 5e-6)))) +
geom_tile(colour = "white", size= 1) +
theme_cowplot(font_size= 8) +
scale_alpha_discrete(guide= 'none') +
scale_fill_gradient2(low= colorBlindBlack8[3], high= colorBlindBlack8[8], mid= 'white') +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        plot.margin= margin(t= 0, r= 0, l= 0, b= 0, unit= 'pt')) +
scale_x_discrete(position = "top")


ggsave(snakemake@output[[3]], p1, height= 100, width= 140, units= 'mm', dpi= 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
library(dplyr)
library(data.table)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(tidyr)
library(showtext)
colorBlindBlack8  <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')
as= 8
as1= 9
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


d= fread(snakemake@input[[1]])
names(d)= c('CHR', 'POS', 'FST_EUR_AFR')

d1= fread(snakemake@input[[2]])
names(d1)= c('CHR', 'POS', 'FST_EUR_EAS')

d2= fread(snakemake@input[[3]])
names(d2)= c('CHR', 'POS', 'FST_AFR_EAS')

d= inner_join(d, d1, by= c('CHR', 'POS')) %>% inner_join(., d2, by= c('CHR', 'POS'))

rm(d1); rm(d2)
d$v_ids= paste(d$CHR, d$POS, sep= ':')

z= fread(snakemake@input[[4]])

zl= gather(z, control_set, v_ids, Set_1:Set_10000)

bw_pos= c(123065778)
ga_pos= c(123112292)

zl= inner_join(zl, d[, c('v_ids', 'FST_EUR_AFR', 'FST_EUR_EAS', 'FST_AFR_EAS')], by= 'v_ids')

zl= filter(zl, Input_SNP== '3:123065778' | Input_SNP== '3:123112292')

zl$haplotype= with(zl, ifelse(Input_SNP== '3:123065778', 'Birth weight', 'Gestational duration'))

zl= zl[!duplicated(zl$v_ids), ]

zl= data.frame(zl)
d= data.frame(d)
# For each pair of ancestries, compare the Fst of the two index variants with
# the Fst of their control variants in `zl` (one-sample Wilcoxon test of the
# controls against the index value, alternative 'less').
# Result: `df_list`, one two-row data.frame per ancestry pair
# (gestational duration first, birth weight second).
index_variants= c('Gestational duration'= '3:123112292', 'Birth weight'= '3:123065778')

# Summary row for one haplotype and one Fst column.
fst_summary= function(hap, fst_col) {
	controls= zl[zl$haplotype== hap, fst_col]
	index_fst= d[d$v_ids== index_variants[[hap]], fst_col]
	data.frame(haplotype= hap, ancestries= fst_col, FST= index_fst,
		FST_mean_controls= mean(controls, na.rm= TRUE),
		FST_median_controls= median(controls, na.rm= TRUE),
		pvalue= wilcox.test(controls, mu= index_fst, alternative= 'less')$p.value)
}

df_list= lapply(c('FST_EUR_AFR', 'FST_AFR_EAS', 'FST_EUR_EAS'), function(i) {
	rbind(fst_summary('Gestational duration', i), fst_summary('Birth weight', i))
})

xp= do.call('rbind', df_list)

xp$enrichment= with(xp, FST / FST_median_controls)

bw= filter(zl, haplotype== 'Birth weight') %>% select(FST_EUR_AFR, FST_EUR_EAS, FST_AFR_EAS)
ga= filter(zl, haplotype== 'Gestational duration') %>% select(FST_EUR_AFR, FST_EUR_EAS, FST_AFR_EAS)

names(bw)= c('FST_EUR_AFR_bw', 'FST_EUR_EAS_bw', 'FST_AFR_EAS_bw')

df1= cbind(bw, ga)

ga_fst= d[d$v_ids== '3:123112292', 'FST_EUR_AFR']
bw_fst= d[d$v_ids== '3:123065778', 'FST_EUR_AFR']
ga_fst_pvalue= xp[xp$haplotype== 'Gestational duration' & xp$ancestries== 'FST_EUR_AFR', 'enrichment']
bw_fst_pvalue= xp[xp$haplotype== 'Birth weight' & xp$ancestries== 'FST_EUR_AFR', 'enrichment']


p1= ggplot(df1, aes(x=x) ) +
  geom_density( aes(x = FST_EUR_AFR, y = ..density..), fill= colorBlindBlack8[4], colour= colorBlindBlack8[4]) +
annotate('text', x=0.6, y= 10, label="Gestational \nduration", color= colorBlindBlack8[4], size= as1/ .pt, fontface = 'bold') +
annotate('text', x=0.6, y= -10 - 0.5, label="Birth weight", color= colorBlindBlack8[2], size= as1/ .pt, fontface = 'bold') +
  annotate('text', x=ga_fst, y=5 + 0.5, label="rs28654158", color= colorBlindBlack8[4], size= as/ .pt) +
  annotate('text', x=bw_fst, y= -10 - 0.5, label="rs11708067", color= colorBlindBlack8[2], hjust= 0, size= as/ .pt) +
  annotate('text', x= 0.6, y= 1, label= paste0('Enrichment x', round(ga_fst_pvalue, 1)), color= colorBlindBlack8[4], size= as/ .pt) +
  annotate('text', x= 0.6, y= -1, label= paste0('Enrichment x', round(bw_fst_pvalue, 1)), color= colorBlindBlack8[2], size= as/ .pt) +
  geom_density(aes(x = FST_EUR_AFR_bw, y = -..density..), fill= colorBlindBlack8[2], colour= colorBlindBlack8[2]) +
  theme_cowplot(font_size = 8) +
scale_x_continuous(expand= c(0, 0)) +
scale_y_continuous(limits= c(-11, 11), breaks= c(-10, -5, 0, 5, 10), labels= c(10, 5, 0, 5, 10)) +
  xlab("Fst Africans - Europeans") +
ylab('Density') +
geom_segment(aes(x = ga_fst, y = 0, xend = ga_fst, yend = 5)) +
geom_segment(aes(x = bw_fst, y = 0, xend = bw_fst, yend = -10))+
geom_hline(yintercept= 0, colour= 'grey') +
theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3))



ggsave(snakemake@output[[1]], p1, width= 63, height= 63, units= 'mm', dpi= 300)

ga_fst= d[d$v_ids== '3:123112292', 'FST_EUR_EAS']
bw_fst= d[d$v_ids== '3:123065778', 'FST_EUR_EAS']
ga_fst_pvalue= xp[xp$haplotype== 'Gestational duration' & xp$ancestries== 'FST_EUR_EAS', 'enrichment']
bw_fst_pvalue= xp[xp$haplotype== 'Birth weight' & xp$ancestries== 'FST_EUR_EAS', 'enrichment']

p1= ggplot(df1, aes(x=x) ) +
  geom_density( aes(x = FST_EUR_EAS, y = ..density..), fill= colorBlindBlack8[4], colour= colorBlindBlack8[4]) +
annotate('text', x=0.57, y= 9, label="Gestational \nduration", color= colorBlindBlack8[4], size= as1/ .pt, fontface = 'bold') +
annotate('text', x=0.57, y= -10, label="Birth weight", color= colorBlindBlack8[2], size= as1/ .pt, fontface = 'bold') +
  annotate('text', x=ga_fst, y= 10 + 0.5, label="rs28654158", color= colorBlindBlack8[4], hjust= 0, size= as/ .pt) +
  annotate('text', x=bw_fst, y= -5 - 0.5, label="rs11708067", color= colorBlindBlack8[2], size= as/ .pt) +
  annotate('text', x= 0.6, y= 1, label= paste0('Enrichment x', round(ga_fst_pvalue, 1)), color= colorBlindBlack8[4], size= as/ .pt) +
  annotate('text', x= 0.6, y= -1, label= paste0('Enrichment x', round(bw_fst_pvalue, 1)), color= colorBlindBlack8[2], size= as/ .pt) +
  geom_density( aes(x = FST_EUR_EAS_bw, y = -..density..), fill= colorBlindBlack8[2], colour= colorBlindBlack8[2]) +
scale_x_continuous(expand= c(0, 0)) +
scale_y_continuous(limits= c(-11, 11), breaks= c(-10, -5, 0, 5, 10), labels= c(10, 5, 0, 5, 10)) +
  theme_cowplot(font_size = 8) +
  xlab("Fst East Asians - Europeans") +
ylab('Density') +
geom_segment(aes(x = ga_fst, y = 0, xend = ga_fst, yend = 10)) +
geom_segment(aes(x = bw_fst, y = 0, xend = bw_fst, yend = -5)) +
geom_hline(yintercept= 0, colour= 'grey') +
theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3))


ggsave(snakemake@output[[2]], p1, width= 63, height= 63, units= 'mm', dpi= 300)

# Fst (AFR - EAS) of the two index variants and their enrichment over the
# median of the controls. The column is spelled `v_ids` in full: `d$v_id`
# only resolved through partial matching of `$` on the data.frame.
# Note: the *_pvalue variables hold the `enrichment` column, not a p-value.
ga_fst= d[d$v_ids== '3:123112292', 'FST_AFR_EAS']
bw_fst= d[d$v_ids== '3:123065778', 'FST_AFR_EAS']
ga_fst_pvalue= xp[xp$haplotype== 'Gestational duration' & xp$ancestries== 'FST_AFR_EAS', 'enrichment']
bw_fst_pvalue= xp[xp$haplotype== 'Birth weight' & xp$ancestries== 'FST_AFR_EAS', 'enrichment']

p1= ggplot(df1, aes(x=x) ) +
geom_density( aes(x = FST_AFR_EAS, y = ..density..), fill= colorBlindBlack8[4], colour= colorBlindBlack8[4]) +
annotate('text', x=0.72, y=7, label="Gestational \nduration", color= colorBlindBlack8[4], size= as1/ .pt, fontface = 'bold') + 
annotate('text', x=0.72, y= -7, label="Birth weight", color= colorBlindBlack8[2], size= as1/ .pt, fontface = 'bold') +
annotate('text', x=ga_fst, y=5 + 0.5, label="rs28654158", color= colorBlindBlack8[4], size= as/ .pt) +
annotate('text', x=bw_fst, y= -5 - 0.5, label="rs11708067", color= colorBlindBlack8[2], hjust= 0, size= as/ .pt) +
annotate('text', x= 0.75, y= 1, label= paste0('Enrichment x', round(ga_fst_pvalue, 1)), color= colorBlindBlack8[4], size= as/ .pt) +
annotate('text', x= 0.75, y= -1, label= paste0('Enrichment x', round(bw_fst_pvalue, 1)), color= colorBlindBlack8[2], size= as/ .pt) +
geom_density( aes(x = FST_AFR_EAS_bw, y = -..density..), fill= colorBlindBlack8[2], colour= colorBlindBlack8[2]) +
scale_x_continuous(expand= c(0, 0)) +
theme_cowplot(font_size = 8) +
xlab("Fst Africans - East Asians") +
scale_y_continuous(limits= c(-11, 11), breaks= c(-10, -5, 0, 5, 10), labels= c(10, 5, 0, 5, 10)) +
ylab('Density') +
geom_segment(aes(x = ga_fst, y = 0, xend = ga_fst, yend = 5)) +
geom_segment(aes(x = bw_fst, y = 0, xend = bw_fst, yend = -5))+
geom_hline(yintercept= 0, colour= 'grey') +
theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3))



ggsave(snakemake@output[[3]], p1, width= 63, height= 63, units= 'mm', dpi= 300)

fwrite(df1, snakemake@output[[4]], sep= '\t')
fwrite(xp, snakemake@output[[5]], sep= '\t')
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
library(dplyr)
library(data.table)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(tidyr)
library(showtext)
colorBlindBlack8  <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


d= fread(snakemake@input[[1]])
names(d)[8]= 'phenocode'
mani= fread(snakemake@input[[2]])

trait_list= c('biomarkers', 'continuous', 'icd10')
mani= mani[mani$trait_type %in% trait_list, ]

mani= filter(mani, saige_heritability_EUR> 0.01)
mani= mani[order(mani$saige_heritability_EUR, decreasing= TRUE), ]
mani= mani[!duplicated(mani$phenocode), ]

mani$phenocode= paste(mani$trait_type, mani$phenocode, sep= '_')
mani= mani[, c('phenocode', 'description')]
mani= mani[!duplicated(mani$description), ]

d= inner_join(d, mani[, c('description', 'phenocode')], by= 'phenocode')
d$cohort= 'UKBB'

x= fread(snakemake@input[[3]])
names(x)[8]= 'phenocode'
mani= fread(snakemake@input[[4]])
mani= mani[, c('phenocode', 'name')]
names(mani)= c('phenocode', 'description')
mani= mani[!duplicated(mani$description), ]

x= inner_join(x, mani, by= 'phenocode')
x$cohort= 'FINNGEN'

d= rbind(d, x)
d= d[order(d$PP.H4.abf, decreasing= F), ]
d= filter(d, PP.H4.abf> 0.01, PP.H4.abf + PP.H3.abf> 0.75)

d$preg_trait= factor(d$preg_trait)
empty_bar <- 5
to_add <- data.frame( matrix(NA, empty_bar*nlevels(d$preg_trait), ncol(d)) )
colnames(to_add) <- colnames(d)
to_add$preg_trait <- rep(levels(d$preg_trait), each=empty_bar)
d <- rbind(d, to_add)
d <- d %>% arrange(preg_trait)


d$id= seq(1, nrow(d))

label_data= d
number_of_bar <- nrow(label_data)
angle <-  90 - 360 * (label_data$id-0.5) /number_of_bar
label_data$hjust<-ifelse( angle < -90, 1, 0)


label_data$angle<-ifelse(angle < -90, angle+180, angle)

#d$id= factor(d$id, levels= d$id[order(d$PP.H4.abf)])

base_data= d %>%
  group_by(preg_trait) %>%
  filter(is.na(PP.H4.abf)) %>%
  summarize(start=min(id), end=max(id) ) %>%
  rowwise() %>%
  mutate(title=mean(c(start, end)))

arc100= rep(1, 2)
arc75= rep(0.75, 2)
arc50= rep(0.50, 2)
arc25= rep(0.25, 2)

# Shorten long phenotype descriptions used as bar labels. Each row is a
# regular expression and the label that replaces any description matching it;
# rows are applied in order.
short_labels= rbind(
  c('Other diabetes', 'Other diabetes'),
  c('Non-insulin-dep', 'Non-insulin dependent diabetes'),
  c('Diabetes, varying def', 'Diabetes, wide'),
  c('Intestinal adhesions', 'Intestinal adhesions'),
  c('Type 2 diabetes, strict', 'Type 2 diabetes'),
  c('Type 2 diabetes with other specified/multiple/unspecified complications', 'Type 2 diabetes with complications'),
  c('and lymph nodes, not elsewhere classified', 'Diseases of veins'),
  c('Diabetes, insuline treatment', 'Diabetes, insuline treatment'),
  c('Creatinine', 'Creatinine in urine')
)

for (k in seq_len(nrow(short_labels))) {
  matched= grepl(short_labels[k, 1], label_data$description)
  label_data$description= ifelse(matched, short_labels[k, 2], label_data$description)
}

# Circular bar plot of PP.H4 per phenotype, coloured by pregnancy trait, with
# grey reference arcs at 0.25/0.50/0.75/1 drawn over the empty separator bars
# and text labels for phenotypes with PP.H4 > 0.75. Legends are hidden with
# guide= 'none' (guide= FALSE is deprecated in ggplot2).
p1= ggplot(d, aes(as.factor(id), PP.H4.abf, fill= preg_trait, alpha= PP.H4.abf)) +
geom_bar(stat="identity", colour= NA) +
scale_alpha_continuous(range= c(0.4, 1), guide= 'none') +
geom_segment(data=base_data, aes(x = end, y = arc100, xend = start, yend = arc100), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) +
  geom_segment(data=base_data, aes(x = end, y = arc75, xend = start, yend = arc75), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) +
  geom_segment(data=base_data, aes(x = end, y = arc50, xend = start, yend = arc50), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) +
  geom_segment(data=base_data, aes(x = end, y = arc25, xend = start, yend = arc25), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) +
  annotate("text", x = ((base_data$end[1] + base_data$start[1]) / 2), y = c((0.25 + 0.075) , (0.50 + 0.075), (0.75 + 0.075) , (1 + 0.075)), label = c("0.25", "0.50", "0.75", "1") , color="grey", size=2.5 , angle=13, fontface="bold", hjust= 0.5) +
   annotate("text", x = ((base_data$end[2] + base_data$start[2]) / 2), y = c((0.25 + 0.075) , (0.50 + 0.075), (0.75 + 0.075) , (1 + 0.075) ), label = c("0.25", "0.50", "0.75", "1") , color="grey", size=2.5, angle=13, fontface="bold", hjust=0.5) +
ylim(-0.2, 2) + # Limits of the plot = very important. The negative value controls the size of the inner circle, the positive one is useful to add size over each bar
theme_cowplot() +
scale_fill_manual(values=colorBlindBlack8[c(2,4)], guide= 'none') +
scale_colour_manual(values=colorBlindBlack8[c(2,4)], guide= 'none') +
#    plot.margin = margin(t= -200, r= -40, b= -200, l=-70, unit= 'mm')   ) +
labs(x=NULL, y=NULL)  +
  coord_polar(start = 0) +
geom_text(data=filter(label_data, PP.H4.abf> 0.75), aes(x= factor(id), y=PP.H4.abf + 0.01, label=description, hjust=hjust), color="black", fontface="bold",alpha=0.6, size=6/ .pt, angle= filter(label_data, PP.H4.abf> 0.750)$angle, inherit.aes = FALSE) +
theme(axis.line=element_blank(),axis.text.x=element_blank(),
          axis.text.y=element_blank(),axis.ticks=element_blank(),
          axis.title.x=element_blank(),
          axis.title.y=element_blank(),legend.position="none",
          panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
          panel.grid.minor=element_blank(),plot.background=element_blank(),
axis.ticks.length = unit(0, "mm"))

ggsave(snakemake@output[[1]], plot= p1, width= 127, height= 127, dpi= 300, units= 'mm')

fwrite(d, snakemake@output[[2]], sep= '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
library(scales)
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)
library(fmsb)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

flist= snakemake@input #list.files('/mnt/hdd/common/pol/metaGWAS/colocalization/GAraw/', 'pph_BW_', full.names=T)

# Read one coloc summary file and return, per locus, PP.H4 and PP.H3 + PP.H4.
#
# Args:
#   x: path to a file named pph_<label>.txt with columns PP.H0.abf-PP.H4.abf
#      and `locus`.
# Returns: a table with columns <label>, <label>_sloc and locus.
funk= function(x){
  d= fread(x)
  # Drop rows in which every posterior probability is zero.
  d= filter(d, PP.H1.abf + PP.H2.abf + PP.H3.abf + PP.H4.abf + PP.H0.abf> 0)
  # Label from the file name itself (not a fixed position in the path); the
  # 'pph_' and '.txt' parts are removed as literal strings, not as regex.
  fname= gsub('.txt', '', gsub('pph_', '', basename(x), fixed= TRUE), fixed= TRUE)
  # `locus` is split at '_' and only the second part is kept as the locus.
  d= separate(d, locus, into= c('chrom', 'locus'), sep= '_')
  d$sloc= d$PP.H4.abf + d$PP.H3.abf
  d= select(d, PP.H4.abf, sloc, locus)

  names(d)= c(fname, paste0(fname, '_sloc'), 'locus')
  return(d)
}

d= lapply(flist, funk)

d= reduce(d, full_join, by = "locus")

d= arrange(d, BW_maternal_effect)

# Spider plot maternal

x= as.data.frame(matrix(d$BW_maternal_effect, ncol= nrow(d)))
x=rbind(x, as.data.frame(matrix(d$BW_maternal_effect_sloc, ncol= nrow(d))))
names(x)= d$locus

rownames(x)= c('BW maternal effect', 'BW maternal effect ')

x= rbind(rep(1,nrow(d)) , rep(0,nrow(d)) , x)


png(snakemake@output[[1]], width= 60, height= 60, res= 300, units= 'mm')
par(mar=c(0,0,0,0))

radarchart(x, axistype= 0, 

    #custom polygon
    pcol= c(colorBlindBlack8[4], colorBlindBlack8[2]) , pfcol= c(alpha(colorBlindBlack8[4], 0.4), alpha(colorBlindBlack8[2], 0.4)) , plwd=1, pty= 32, plty= 1,
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="#525252", caxislabels= seq(0, 1, 0.25), caxisoffset= 0.1, cglwd=0.8, calcex= 0.4,

    #custom labels
    vlcex= 0.43
    )

dev.off()


# Spider plot fetal

x= as.data.frame(matrix(d$BW_fetal_effect, ncol= nrow(d)))
x=rbind(x, as.data.frame(matrix(d$BW_fetal_effect_sloc, ncol= nrow(d))))
names(x)= d$locus

rownames(x)= c('BW fetal effect', 'BW fetal effect ')

x= rbind(rep(1,nrow(d)) , rep(0,nrow(d)) , x)


png(snakemake@output[[2]], width= 60, height= 60, res= 300, units= 'mm')
par(mar=c(0,0,0,0))

radarchart(x, axistype= 0,

    #custom polygon
    pcol= c(colorBlindBlack8[4], colorBlindBlack8[2]) , pfcol= c(alpha(colorBlindBlack8[4], 0.4), alpha(colorBlindBlack8[2], 0.4)) , plwd=1, pty= 32, plty= 1,
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="#525252", caxislabels= seq(0, 1, 0.25), caxisoffset= 0.1, cglwd=0.8, calcex= 0.4,

    #custom labels
    vlcex= 0.43
    )

dev.off()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

as= 8
as1= 8

z= fread(snakemake@input[[1]])
z$chr= as.numeric(gsub('chr', '', z$chr))
z$chr= as.character(z$chr)
z$locus= 1:nrow(z)


# Read one conditional-analysis result file and return the variant with the
# smallest p-value within each locus of `z`.
#
# Args:
#   infile: path to the result file; 'BW_maternal_effect_GA' in the path marks
#           the maternal analysis, anything else is labelled as fetal.
# Returns: one row per locus with `b` oriented to be positive (`bC` flipped
#          accordingly), the locus columns from `z` and a `GWAS` label.
funk= function(infile){
  d= fread(infile)
  # Columns 1-11 take the names of columns 2-12 and only those 11 are kept
  # (presumably the header is offset by one column - TODO confirm).
  names(d)[1:11]= names(d)[2:12]
  d= d[, 1:11]

  d= filter(d, p<5e-6)

  # Orient effects so that the unconditional effect `b` is positive.
  d$bC= ifelse(d$b< 0, -1 * d$bC, d$bC)
  d$b= ifelse(d$b< 0, -1 * d$b, d$b)

  d= separate(d, SNP, into= c('chr', 'POS', 'REF', 'EFF'), sep= ':')

  d$POS= as.numeric(d$POS)
  d$chr= as.character(d$chr)
  d$GWAS= ifelse(grepl('BW_maternal_effect_GA', infile), 'BW_maternal_GA', 'BW_fetal_GA')
  # dplyr joins take the key through `by`; `on=` is not an argument of
  # inner_join, so the key was not being applied.
  d= inner_join(d, z, by= 'chr')
  d= d %>% filter(POS>= start, POS< stop)

  # Top variant per locus; ungroup so the result does not stay grouped.
  d= group_by(d, locus) %>% arrange(p) %>% filter(row_number()== 1) %>% ungroup()

  return(d)
}

# Load every input whose path mentions 'BW' through funk() and stack the
# per-file tables into a single data frame.
bw_files <- snakemake@input[grepl('BW', snakemake@input)]
d <- do.call('rbind', lapply(bw_files, funk))

# Relative change of the effect estimate: (bC - b) / b.
d$beta_dif <- (d$bC - d$b) / d$b

# One vector of relative differences per GWAS label.
maternal_dif <- d %>% filter(GWAS == 'BW_maternal_GA') %>% pull(beta_dif)
fetal_dif <- d %>% filter(GWAS == 'BW_fetal_GA') %>% pull(beta_dif)

# Pieces shared by both figures.
maternal_col <- colorBlindBlack8[3]
fetal_col <- colorBlindBlack8[8]
label_size <- as1 / .pt
x_title <- "Relative difference in effect size on \nbirth weight after conditioning"
thin_axes <- theme(axis.line.x = element_line(size = 0.3),
                   axis.line.y = element_line(size = 0.3),
                   axis.ticks = element_line(size = 0.3))

# Figure 1: mirrored densities (maternal above zero, fetal below zero).
mirror_plot <- ggplot() +
  geom_density(aes(x = maternal_dif, y = ..density..),
               fill = maternal_col, colour = maternal_col) +
  annotate('text', x = 0.1, y = 3, label = "Maternal",
           color = maternal_col, size = label_size, fontface = 'bold') +
  annotate('text', x = 0.1, y = -15, label = "Fetal",
           color = fetal_col, size = label_size, fontface = 'bold') +
  geom_density(aes(x = fetal_dif, y = -..density..),
               fill = fetal_col, colour = fetal_col) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  labs(x = x_title, y = 'Density') +
  geom_hline(yintercept = 0, colour = 'grey') +
  thin_axes

ggsave(snakemake@output[[1]], plot = mirror_plot,
       width = 70, height = 70, units = 'mm', dpi = 300)

# Figure 2: both densities overlaid, semi-transparent fill, no legend.
overlay_plot <- ggplot(d, aes(beta_dif, group = GWAS, fill = GWAS)) +
  geom_hline(yintercept = 0, colour = 'black') +
  geom_density(color = NA) +
  annotate('text', x = -0.75, y = 1, label = "Maternal",
           color = maternal_col, size = label_size, fontface = 'bold') +
  annotate('text', x = 0.1, y = 15, label = "Fetal",
           color = fetal_col, size = label_size, fontface = 'bold') +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5), guide = 'none') +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.5)) +
  labs(x = x_title, y = 'Density') +
  thin_axes

ggsave(snakemake@output[[3]], plot = overlay_plot,
       width = 70, height = 70, units = 'mm', dpi = 300)

# Table behind the figures.
fwrite(d, snakemake@output[[2]], sep = '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Eight-colour palette used for all figures in this script.
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Register Arial (regular + bold) and render text through showtext at 300 dpi.
font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Font sizes in points (converted with .pt where they are used).
as <- 8
as1 <- 8

# Variant table; build a chr:pos:allele:allele key with the two alleles in
# alphabetical order.
z <- fread(snakemake@input[[1]])

alleles_reversed <- z$ref > z$eff
z$SNP <- ifelse(alleles_reversed,
                paste(z$chr, z$pos, z$eff, z$ref, sep = ':'),
                paste(z$chr, z$pos, z$ref, z$eff, sep = ':'))

funk <- function(infile) {
  # Read one result file, orient effects so that `b` is non-negative, tag the
  # rows with the GWAS they come from and keep only the variants listed in the
  # global table `z` for the matching origin.
  #
  # infile: path to a single result file; a path containing
  #         'BW_maternal_effect_GA' is treated as the maternal analysis,
  #         anything else as the fetal one.
  # Returns the filtered table with an added `GWAS` column.
  res <- fread(infile)

  # Column names are shifted one position to the left and only the first
  # eleven columns are kept.
  names(res)[1:11] <- names(res)[2:12]
  res <- res[, 1:11]

  # Flip bC together with b wherever b is negative (mask taken before b changes).
  negative_b <- res$b < 0
  res$bC <- ifelse(negative_b, -1 * res$bC, res$bC)
  res$b <- ifelse(negative_b, -1 * res$b, res$b)

  if (grepl('BW_maternal_effect_GA', infile)) {
    res$GWAS <- 'BW_maternal_GA'
    var <- 'Maternal Only'
  } else {
    res$GWAS <- 'BW_fetal_GA'
    var <- 'Fetal Only'
  }

  # Variants of the matching origin in the global lookup table.
  temp_z <- z[z$origin == var, ]

  filter(res, SNP %in% temp_z$SNP)
}

# Load every input whose path mentions 'BW' through funk() and stack the
# per-file tables into a single data frame.
bw_files <- snakemake@input[grepl('BW', snakemake@input)]
d <- do.call('rbind', lapply(bw_files, funk))

# Relative change of the effect estimate: (bC - b) / b.
d$beta_dif <- (d$bC - d$b) / d$b

# One vector of relative differences per GWAS label.
maternal_dif <- d %>% filter(GWAS == 'BW_maternal_GA') %>% pull(beta_dif)
fetal_dif <- d %>% filter(GWAS == 'BW_fetal_GA') %>% pull(beta_dif)

# Pieces shared by both figures.
maternal_col <- colorBlindBlack8[3]
fetal_col <- colorBlindBlack8[8]
label_size <- as1 / .pt
x_title <- "Relative difference in effect size on \nbirth weight after conditioning"
thin_axes <- theme(axis.line.x = element_line(size = 0.3),
                   axis.line.y = element_line(size = 0.3),
                   axis.ticks = element_line(size = 0.3))

# Figure 1: mirrored densities (maternal above zero, fetal below zero).
mirror_plot <- ggplot() +
  geom_density(aes(x = maternal_dif, y = ..density..),
               fill = maternal_col, colour = maternal_col) +
  annotate('text', x = 0.1, y = 3, label = "Maternal",
           color = maternal_col, size = label_size, fontface = 'bold') +
  annotate('text', x = 0.1, y = -15, label = "Fetal",
           color = fetal_col, size = label_size, fontface = 'bold') +
  geom_density(aes(x = fetal_dif, y = -..density..),
               fill = fetal_col, colour = fetal_col) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  labs(x = x_title, y = 'Density') +
  geom_hline(yintercept = 0, colour = 'grey') +
  thin_axes

ggsave(snakemake@output[[1]], plot = mirror_plot,
       width = 70, height = 70, units = 'mm', dpi = 300)

# Figure 2: both densities overlaid, semi-transparent fill, no legend.
overlay_plot <- ggplot(d, aes(beta_dif, group = GWAS, fill = GWAS)) +
  geom_hline(yintercept = 0, colour = 'black') +
  geom_density(color = NA) +
  annotate('text', x = -0.55, y = 1, label = "Maternal",
           color = maternal_col, size = label_size, fontface = 'bold') +
  annotate('text', x = 0.1, y = 10, label = "Fetal",
           color = fetal_col, size = label_size, fontface = 'bold') +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5), guide = 'none') +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.5)) +
  labs(x = x_title, y = 'Density') +
  thin_axes

ggsave(snakemake@output[[3]], plot = overlay_plot,
       width = 70, height = 70, units = 'mm', dpi = 300)

# Table behind the figures.
fwrite(d, snakemake@output[[2]], sep = '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Input 1: keep rows whose p1 contains 'GAraw' and whose p2 contains 'BW'.
d= fread(snakemake@input[[1]])
d= filter(d, grepl('GAraw', p1), grepl('BW', p2))
d$p1= 'Gestational duration (maternal)'

# Input 2: keep rows whose p1 contains 'GA_fetal' and whose p2 contains 'BW'.
x= fread(snakemake@input[[2]])
x= filter(x, grepl('GA_fetal', p1), grepl('BW', p2))
x$p1= 'Gestational duration (fetal)'

d= rbind(d, x)

# Reduce the p2 path to the trait name: take what follows 'LDSC/' and drop the
# file suffix. The suffix is matched literally (fixed= TRUE) so that its dots
# are not treated as regex wildcards.
after_ldsc= vapply(strsplit(d$p2, 'LDSC/', fixed= TRUE), function(parts) parts[2], character(1))
d$p2= gsub('.txt.sumstats.gz', '', after_ldsc, fixed= TRUE)

# Display label for each trait name. Any name missing from the table is
# labelled 'Endometriosis' (the final fallback of the original nested ifelse);
# a missing trait name stays NA.
trait_labels= c(
  miscarriage= 'Miscarriage',
  GA_fetal= 'GA fetal effect',
  BW_maternal= 'Maternal',
  AFB= 'Age at first birth',
  AMenarche= 'Age at menarche',
  AMenopause= 'Age at menopause',
  NLB= 'Number of live births',
  Testosterone_fem= 'Testosterone (women)',
  SHBG_fem= 'SHBG (women)',
  SHBG_male= 'SHBG (men)',
  CBAT_fem= 'CBAT (women)',
  CBAT_male= 'CBAT (men)',
  Oestradiol_fem= 'Oestradiol (women)',
  POP= 'Pelvic Organ Prolapse',
  Testosterone_male= 'Testosterone (men)',
  leiomyoma_uterus= 'Leiomyoma uterus',
  BW_fetal= 'Fetal',
  BW_fetal_effect= 'Fetal \nonly',
  Preeclampsia= 'Pre-eclampsia',
  BW_maternal_effect= 'Maternal \nonly',
  PCOS= 'Polycystic ovary syndrome')

d$trait= ifelse(is.na(d$p2), NA_character_,
                ifelse(d$p2 %in% names(trait_labels), unname(trait_labels[d$p2]), 'Endometriosis'))

# Point estimate with a 1.96 * se interval per trait, dodged by the p1 label.
# guide= 'none' replaces the deprecated guide= FALSE. The aesthetic `p1` below
# is the data column, which is why the plot object gets a different name.
rg_plot= ggplot(d, aes(trait, rg, colour= p1)) +
geom_pointrange(aes(ymin= rg - se * 1.96, ymax= rg + se * 1.96), position = position_dodge(0.3), width = 1/10, size= 0.4, fatten= 0.6) +
scale_colour_manual(values= colorBlindBlack8[c(8,3)], guide= 'none') +
theme_cowplot(font_size= 8) +
scale_y_continuous(limits= c(-0.2, 0.8), breaks= seq(-0.2, 0.8, 0.2)) +
ylab('Genetic correlation') +
xlab('Effect on birth weight') +
geom_hline(yintercept= 0, size= 0.3) +
geom_hline(yintercept= c(-0.2, seq(0.2, 0.8, 0.2)), colour= 'grey', linetype= 'dashed', alpha= 0.5, size= 0.2) +
theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3))


ggsave(snakemake@output[[1]], plot= rg_plot, width= 60, height= 60, units= 'mm', dpi= 300)

fwrite(d, snakemake@output[[2]], sep= '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(data.table)
library(dplyr)
library(cowplot)
library(ggrepel)
library('showtext')


colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# Join the two input tables on the 'Name' column.
d= fread(snakemake@input[[1]])
x= fread(snakemake@input[[2]])
d= inner_join(d, x, by= 'Name')

# Shuffle the rows, then sort by Category.
# NOTE(review): no seed is set, so the row order inside a category can differ between runs.
d= d[sample(nrow(d)),]

d= d[order(d$Category, decreasing= FALSE), ]

# Freeze the current row order as the x-axis order.
d$Name= factor(d$Name, levels= unique(d$Name))

# Display name: text after the last '.', underscores replaced by spaces.
d$Name2= gsub('_', ' ', gsub("^.*\\.","", d$Name))
d$Name2= factor(d$Name2, levels= unique(d$Name2))

# Shared plot. Points with Coefficient_P_value < 0.05 get a text label.
# show.legend= FALSE is passed to the layer itself: the original placed the
# deprecated `show_guide` inside aes() for one plot and used the deprecated
# argument name for the other.
enrichment_base= ggplot(d, aes(Name2, -log10(Coefficient_P_value), colour= Category, fill= Category)) +
geom_point(size= 2, shape= 21, stroke= 0.1) +
xlab('Tissues') +
ylab('-log10(Enrichment)') +
theme_cowplot(font_size= 8) +
geom_hline(yintercept= -log10(0.05), colour= '#d9d9d9') +
theme(axis.text.x = element_blank(),
        axis.ticks= element_blank(),
        panel.grid.major= element_line(colour= 'grey', size= 0.05),
        panel.grid.major.x= element_blank()) +
geom_text_repel(data= filter(d, Coefficient_P_value< 0.05), aes(Name2, -log10(Coefficient_P_value), colour= Category, label= Name2), show.legend= FALSE)

# Output 1: without legend.
p1= enrichment_base + theme(legend.position="none")

ggsave(snakemake@output[[1]], plot= p1, width= 120, height= 90, units= 'mm', dpi= 300)

# Output 2: same figure with the Category legend.
p2= enrichment_base

ggsave(snakemake@output[[2]], plot= p2, width= 120, height= 90, units= 'mm', dpi= 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(ggtern)
options(warn=-1)



colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# Two tables of posterior probabilities (columns PP.H0.abf ... PP.H4.abf).
shbg= fread(snakemake@input[[1]])

testo= fread(snakemake@input[[2]])


#shbg$locus= gsub("^.*\\_","", shbg$locus)
#testo$locus= gsub("^.*\\_","", testo$locus)


# Axis colours for the top, right and left corners of the triangle.
colT= colorBlindBlack8[4]
colR= colorBlindBlack8[1]
colL= colorBlindBlack8[2]

# Build the ternary plot for one posterior-probability table.
#
# pph: table with columns PP.H0.abf, PP.H1.abf, PP.H2.abf, PP.H3.abf, PP.H4.abf.
# Returns a ggtern plot with one point per row, positioned by
#   PP.H0 + PP.H1 + PP.H2 (labelled "locus not shared"),
#   PP.H4 (labelled "shared causal variant") and
#   PP.H3 (labelled "shared locus (distinct causal variant)").
# Reads colT, colL, colR and colorBlindBlack8 from the enclosing script.
coloc_ternary= function(pph) {
  pph$One_or_Other= pph$PP.H0.abf + pph$PP.H1.abf + pph$PP.H2.abf
  pph$coloc= pph$PP.H4.abf
  pph$shared_locus= pph$PP.H3.abf

  ggtern(pph, aes(One_or_Other, coloc, shared_locus)) +
    geom_point(colour= colorBlindBlack8[8], fill= colorBlindBlack8[8], shape= 21) +
    # guide= 'none' replaces the deprecated (and reassignable) guide= F
    scale_alpha_continuous(range= c(0.6, 1), guide= 'none') +
    scale_size_continuous(range= c(.001, 10), guide= 'none') +
    theme_custom(tern.plot.background = NULL, tern.panel.background = 'white', col.T = colT, col.L = colL, col.R = colR, col.grid.minor = "white") +
    Tarrowlab("Probability of shared causal variant") +
    Larrowlab("Probability of locus not shared") +
    Rarrowlab("Probability of shared locus (distinct causal variant)")  +
    theme_showarrows()  +
    theme_notitles() +
    theme(text=element_text(family="arial", size= 10),
          tern.axis.arrow.T = element_blank(),
          tern.axis.arrow.L = element_blank(),
          tern.axis.arrow.R = element_blank(),
          tern.axis.text.T = element_text(color = colT),
          tern.axis.text.L = element_text(color = colL),
          tern.axis.text.R = element_text(color = colR),
          tern.axis.arrow.text.T = element_text(color = colT),
          plot.margin = margin(0, 0, 0, 0, "cm"),
          tern.axis.arrow.text.L = element_text(color = colL),
          tern.axis.arrow.text.R = element_text(color = colR),
          tern.panel.grid.major = element_line(linetype = 6, size = 0.3))
}

p1= coloc_ternary(shbg)

p2= coloc_ternary(testo)

ggsave(snakemake@output[[1]], plot= p1, width= 95, height= 95, units= 'mm', dpi= 300)

ggsave(snakemake@output[[2]], plot= p2, width= 95, height= 95, units= 'mm', dpi= 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(ggdendro)
library(gridExtra)
library(dendextend)
library(plyr)
library(ggtree)
library(scales)

# Heat map of per-haplotype effect sizes (beta_MT, beta_MNT, beta_PT) for each
# variant, written as two figures (outputs 1 and 2) plus the long-format table
# behind them (output 3).
# NOTE(review): plyr is attached after dplyr at the top of this script, so the
# unqualified arrange() and desc() used below presumably resolve to plyr's
# versions - confirm before restructuring this code.
colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Input 1: per-variant table (uses rsid, beta_MT, beta_MNT, beta_PT, class_name, probability).
d= fread(snakemake@input[[1]])


# Input 2: nearest-gene annotation, joined on rsid == RSID.
x= fread(snakemake@input[[2]], select= c('nearestGene', 'RSID'))

d= inner_join(d, x, by= c('rsid'= 'RSID'))

# Four genes are given a combined two-gene label.
d$GENE= d$nearestGene
d$GENE= with(d, ifelse(GENE== 'CDC42', 'CDC42/ WNT4', ifelse(GENE== 'HIVEP3', 'HIVEP3/ EDN2', ifelse(GENE== 'TET3', 'TET3/ DGUOK-AS1', ifelse(GENE== 'TCEA2', 'TCEA2/ OPRL1', GENE)))))
d$nearestGene= d$GENE

# Hard-coded gene labels for specific rsids override the annotation.
d$nearestGene= with(d, ifelse(rsid== 'rs3129768', 'HLA-DQA1', ifelse(rsid== 'rs5991030', 'AGTR2', ifelse(rsid== 'rs5930554', 'RAP2C', nearestGene)))) 

d$nearestGene= with(d, ifelse(rsid== 'rs6780427', 'KCNAB1', nearestGene))
d$nearestGene= with(d, ifelse(rsid== 'rs6879092', 'EBF1', nearestGene))


# Axis label "rsid (GENE)": spaces are stripped from the gene label so that the
# label contains exactly one space, which the strsplit() further down relies on.
d$nearestGene= gsub(' ', '', d$nearestGene)
d$nearestGene= paste0("(", d$nearestGene, ")")
d$rsid_lab= with(d, paste(rsid, nearestGene))

# Orient every variant so that beta_MT is non-negative. beta_PT and beta_MNT
# are flipped first because they test the sign of the still-unmodified beta_MT.
d$beta_PT= with(d, ifelse(beta_MT<0, -1 * beta_PT, beta_PT))
d$beta_MNT= with(d, ifelse(beta_MT<0, -1 * beta_MNT, beta_MNT))
d$beta_MT= with(d, ifelse(beta_MT<0, -1 * beta_MT, beta_MT))

# Long format: one row per variant and haplotype.
d= gather(d, haplotype, beta, c('beta_MT', 'beta_MNT', 'beta_PT'))

# Range of the effect sizes, used as the limits of the fill scale.
max_beta= max(d$beta)
min_beta= min(d$beta)


d$haplotype= with(d, ifelse(haplotype== 'beta_MT', 'Maternal\ntransmitted', ifelse(haplotype== 'beta_MNT', 'Maternal\nnon-transmitted', 'Paternal\ntransmitted')))
d$rsid_lab= factor(d$rsid_lab, levels= unique(d$rsid_lab))


# Fixed order of the classes; rows are then sorted by class and by decreasing
# probability, and that row order becomes the x-axis order.
d$class_name= factor(d$class_name, levels= c("Maternal", "MF SD", "MF OD", "Fetal MatT", "Fetal"))

d= d %>% arrange(class_name, desc(probability)) %>% ungroup()
d$rsid_lab= factor(d$rsid_lab, levels= unique(d$rsid_lab))

# Plotmath axis labels: the rsid as-is followed by the gene label in italics.
labs <- sapply(
  strsplit(levels(d$rsid_lab), " "), 
  function(x) parse(text = paste0(x[1], "~italic('", x[2], "')"))
)

# Figure 1 (output 1): heat map without a fill legend, x labels rotated by 45
# degrees, and the rounded probability of each variant printed at y= 4.
p1= ggplot(d, aes(rsid_lab, haplotype, fill= beta)) +
  theme_cowplot(8) +
  geom_tile() +
  #scale_fill_gradient2(low= colorBlindBlack8[4], high= colorBlindBlack8[2], mid= 'white', limits= c(min_beta, max_beta), guide= 'none', midpoint= 0) +
scale_fill_gradientn(colours=c(colorBlindBlack8[4], 'white', colorBlindBlack8[2]), values= rescale(c(min_beta, 0, max_beta)), limits= c(min_beta, max_beta), guide= 'none') +
  coord_equal() +
  scale_x_discrete(labels= labs) +
  theme(axis.title= element_blank(),
        axis.ticks= element_blank(),
        plot.margin = margin(0, 0, 0, 0, "mm"),
        text= element_text(size= 9/ .pt),
        axis.text.y= element_text(hjust= 0.5),
	axis.text.x= element_text(angle= 45, hjust= 1),
        axis.line = element_line(colour = 'black', size = 0.2)) +
  geom_text_repel(data= filter(d, haplotype== 'Paternal\ntransmitted'), aes(x= rsid_lab, y= 4,
                label= round(probability, 2)),  direction= 'y', size= 8/ .pt, box.padding = 0.01)

ggsave(snakemake@output[[1]], plot= p1, width= 180, height= 60, units= 'mm', dpi= 300)

# Figure 2 (output 2): same heat map with the 'Effect size' legend at the
# bottom, unrotated x labels, and the probabilities printed at y= -0.05.
p1= ggplot(d, aes(rsid_lab, haplotype, fill= beta)) +
  theme_cowplot(8) +
  geom_tile() +
  scale_fill_gradientn(colours=c(colorBlindBlack8[4], 'white', colorBlindBlack8[2]), values= rescale(c(min_beta, 0, max_beta)), limits= c(min_beta, max_beta), name= 'Effect size') +
  coord_equal() +
scale_x_discrete(labels= labs) +  
theme(axis.title= element_blank(),
        axis.ticks= element_blank(),
        plot.margin = margin(0, 9, 0,0, "mm"),
        text= element_text(size= 9/ .pt),
        axis.text.y= element_text(hjust= 0.5),
        axis.line = element_line(colour = 'black', size = 0.2),
	legend.position= 'bottom') +
  geom_text_repel(data= filter(d, haplotype== 'Paternal\ntransmitted'), aes(x= rsid_lab, y= -0.05,
                                                                                label= round(probability, 2)), direction= "y" ,
                  size= 6.5/ .pt) 
ggsave(snakemake@output[[2]], plot= p1, width= 180, height= 100, units= 'mm', dpi= 300)

# Output 3: the long-format table used for both figures.
fwrite(d, snakemake@output[[3]], sep= '\t')
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(ggtern)
options(warn=-1)


# Input 1: per-variant table; the columns used below are rsid, Maternal,
# MF_OD, MF_SD, Fetal_MatT and Fetal.
x= fread(snakemake@input[[1]], header= TRUE)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Two chrX variants are identified by position in the input; give them their rsid.
x$rsid= with(x, ifelse(rsid== 'chrX:116013571', 'rs5991030', ifelse(rsid== 'chrX:132178061', 'rs5930554', rsid)))

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Input 2: nearest-gene annotation, joined on rsid == RSID.
d= fread(snakemake@input[[2]], header= TRUE, select= c('RSID', 'ID', 'nearestGene'))

x= inner_join(x, d, by= c('rsid'= 'RSID'))

# Hard-coded gene labels for specific rsids override the annotation.
x$nearestGene= with(x, ifelse(rsid== 'rs3129768', 'HLA-DQA1', ifelse(rsid== 'rs5991030', 'AGTR2', ifelse(rsid== 'rs5930554', 'RAP2C', nearestGene))))

x$nearestGene= with(x, ifelse(rsid== 'rs6780427', 'KCNAB1', nearestGene))

x$nearestGene= with(x, ifelse(rsid== 'rs6879092', 'EBF1', nearestGene))

# Axis colours for the top, right and left corners of the triangle.
colT= colorBlindBlack8[4]
colR= colorBlindBlack8[1]
colL= colorBlindBlack8[2]

# Collapse the five columns into the three plotted ones:
# MF = MF_OD + MF_SD, Fet = Fetal_MatT + Fetal, Maternal is used as-is.
x$MF= x$MF_OD + x$MF_SD
x$Fet= x$Fetal_MatT + x$Fetal

# One point per variant. guide= 'none' replaces the deprecated guide= F.
p1= ggtern(x, aes(Maternal, Fet, MF)) +
geom_point(colour= colorBlindBlack8[8], fill= colorBlindBlack8[8], shape= 21) +
scale_alpha_continuous(range= c(0.6, 1), guide= 'none') +
scale_size_continuous(range= c(.001, 10), guide= 'none') +
theme_custom(tern.plot.background = NULL, tern.panel.background = 'white', col.T = colT, col.L = colL, col.R = colR, col.grid.minor = "white") +
Tarrowlab("Fetal only effect") +
Larrowlab("Maternal only effect") +
Rarrowlab("Maternal and fetal effect")  +
theme_showarrows()  +
theme_notitles() +
theme(text=element_text(family="arial", size= 10),
        tern.axis.arrow.T = element_blank(),
        tern.axis.arrow.L = element_blank(),
        tern.axis.arrow.R = element_blank(),
        tern.axis.text.T = element_text(color = colT),
        tern.axis.text.L = element_text(color = colL),
        tern.axis.text.R = element_text(color = colR),
        tern.axis.arrow.text.T = element_text(color = colT),
        plot.margin = margin(0, 0, 0, 0, "cm"),
        tern.axis.arrow.text.L = element_text(color = colL),
        tern.axis.arrow.text.R = element_text(color = colR),
        tern.panel.grid.major = element_line(linetype = 6, size = 0.3))

# An earlier p-value based ternary plot used to be kept here inside a print()
# of a string literal, which dumped the dead code to the log on every run; it
# has been removed, together with an unused `d= select(x, ...)` assignment.

ggsave(snakemake@output[[1]], plot= p1, width= 95, height= 95, units= 'mm', dpi= 300)

# The full joined table (all columns) is written.
fwrite(x, snakemake@output[[2]], sep= '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


# Heat map of z-scores per annotation (columns) and nearest gene (rows); tiles
# with pvalue.x < 0.05 are marked with an asterisk.
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# Four lead variants are renamed to another position on the same chromosome
# before the join.
lead_snp_recode <- c('1:50958027' = '1:50959262',
                     '9:116929327' = '9:116935764',
                     '5:157896786' = '5:157895049',
                     '1:22511594' = '1:22462111')
needs_recode <- d$lead_snp %in% names(lead_snp_recode)
d$lead_snp[needs_recode] <- unname(lead_snp_recode[d$lead_snp[needs_recode]])

# Second input is keyed by CHR:POS.
x <- fread(snakemake@input[[2]])
x$lead_snp <- paste(x$CHR, x$POS, sep = ':')

d <- inner_join(d, x, by = 'lead_snp')

# Cap z-scores at 3.5.
d$z_score <- with(d, ifelse(z_score > 3.5, 3.5, z_score))

# Combined labels for four genes.
gene_alias <- c(CDC42 = 'CDC42/ WNT4',
                HIVEP3 = 'HIVEP3/ EDN2',
                TET3 = 'TET3/ DGUOK-AS1',
                TCEA2 = 'TCEA2/ OPRL1')
has_alias <- d$nearestGene %in% names(gene_alias)
d$nearestGene[has_alias] <- unname(gene_alias[d$nearestGene[has_alias]])

# Two annotations are excluded from the figure.
d <- d %>% filter(!(annotation %in% c('B2', 'geva_allele_age')))

# Display names for the annotations; anything not listed is labelled
# 'XPEHH EAS-EUR', a missing annotation stays NA.
annotation_labels <- c(argweave = 'ARGWEAVE',
                       betascore = 'Beta score',
                       B2 = '',
                       fst_eas_afr = 'Fst AFR-EAS',
                       fst_eur_afr = 'Fst AFR-EUR',
                       fst_eur_eas = 'Fst EAS-EUR',
                       gerp = 'GERP',
                       geva_allele_age = 'Alelle age',
                       iES_Sabeti = 'iES',
                       linsigh = 'LINSIGHT',
                       phastCon100 = 'phastCONS100',
                       phyloP100 = 'PhyloP',
                       xpehh_afr2_eas = 'XPEHH AFR-EAS',
                       xpehh_afr2_eur = 'XPEHH AFR-EUR')
is_listed <- d$annotation %in% names(annotation_labels)
d$annotation <- ifelse(is.na(d$annotation), NA,
                       ifelse(is_listed, unname(annotation_labels[d$annotation]), 'XPEHH EAS-EUR'))

# Rows flagged with an asterisk.
flagged <- filter(d, pvalue.x < 0.05)

heatmap_plot <- ggplot(d, aes(annotation, nearestGene, fill = z_score)) +
  geom_tile(colour = "white", size = 1) +
  theme_cowplot(font_size = 9) +
  scale_fill_gradient2(low = colorBlindBlack8[2], high = colorBlindBlack8[4],
                       mid = 'white', limits = c(-2, 4)) +
  scale_x_discrete(position = "top") +
  geom_text(data = flagged, aes(annotation, nearestGene, label = '*'), size = 8 / .pt) +
  theme(axis.text.x = element_text(angle = 45, hjust = 0),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA, size = 1),
        plot.margin = unit(c(0, 1, 0, 0), "cm"),
        axis.line = element_blank(),
        axis.text.y = element_text(face = "italic")) +
  coord_equal()

ggsave(snakemake@output[[1]], plot = heatmap_plot,
       width = 140, height = 120, units = 'mm', dpi = 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


colorBlindBlack8  <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Input 1: per-cohort results. Input 3: locus windows (CHR, pos1, pos2, nearestGene).
d= fread(snakemake@input[[1]])

z= fread(snakemake@input[[3]])

# Input 2: meta-analysis results, renamed to match the per-cohort columns and
# restricted to the variants present in the per-cohort table.
df= fread(snakemake@input[[2]], select= (c('MarkerName', 'Effect', 'StdErr', 'HetISq', 'HetPVal', 'TOTALSAMPLESIZE', 'P-value', 'Allele1', 'Allele2')))
names(df)= c('SNP', 'BETA', 'SE', 'HetISq', 'HetPval', 'N', 'pvalue', 'A1', 'A2')
df= filter(df, SNP %in% d$SNP)

df= separate(df, SNP, into= c('CHR', 'POS', 'Ax1', 'Ax2', 'ID'), sep= ':', remove= FALSE)
# Flip the effect when Allele2 sorts after Allele1.
df$BETA= ifelse(df$A2 > df$A1, -1 * df$BETA, df$BETA)
# Chromosome X is coded as 23 so that CHR can be an integer.
df$CHR= ifelse(df$CHR== 'X','23', df$CHR)
df$CHR= as.integer(df$CHR)
df$POS= as.integer(df$POS)
df= select(df, -c(A1, A2, ID, Ax1, Ax2))

df$cohort= 'Meta-analysis'
d= bind_rows(d, df)

z$CHR= ifelse(z$CHR== 'X','23', z$CHR)
z$CHR= as.integer(z$CHR)

# Keep the variants that fall strictly inside a locus window on the same chromosome.
d= inner_join(d, z, by= 'CHR') %>% filter(POS> pos1, POS< pos2)

d$locus= paste0('Chr ', d$CHR,': ', d$nearestGene)

d= filter(d, !(cohort %in% c('PGPII', 'PGPIII', 'BIB', 'DNBCPTD', 'STORK', 'STORKGROR')))

d$cohort= paste0(d$cohort, ' (n= ', d$N, ')')

# Rows of the locus requested through the `prev_locus` wildcard, largest sample first.
prev_locus= snakemake@wildcards[['prev_locus']]

temp_df= d[d$nearestGene== prev_locus, ]

temp_df= temp_df[order(temp_df$N, decreasing= TRUE), ]

# rsid shown in the title; any locus not listed falls back to 'rs28654158'.
rsid= switch(prev_locus,
             EEFSEC= 'rs2659685',
             WNT4= 'rs12037376',
             EBF1= 'rs2963463',
             AGTR2= 'rs5991030',
             'rs28654158')

# Cohorts are drawn in the row order set above. levels= unique(cohort) keeps
# that order and, unlike levels= factor(cohort), does not fail when the same
# label occurs on more than one row. Colour and shape separate rows that have a
# HetISq value from those that do not; guide= 'none' replaces the deprecated
# guide= F.
p1= ggplot(temp_df, aes(x=factor(cohort, levels= unique(cohort)), y=BETA, ymin= BETA - 1.96 * SE, ymax= BETA + 1.96 * SE, colour= !is.na(HetISq), shape= !is.na(HetISq))) +
 geom_pointrange(size= 0.4) +
scale_shape_manual(values= c(15, 18), guide= 'none') +
 geom_hline(yintercept = 0, linetype=2) +
scale_y_continuous(sec.axis = dup_axis()) +
ggtitle(parse(text = paste0(rsid, ' - ', "~italic('", unique(temp_df$nearestGene), "')"))) +
coord_flip() +
scale_colour_manual(values= c(colorBlindBlack8[3], colorBlindBlack8[4]), guide= 'none') +
theme_cowplot(8) +
 xlab('') +
    ylab('Beta [95% CI]') +
geom_vline(xintercept= 0, linetype= "dotted", colour= 'grey')

# Figure height grows with the number of rows drawn.
ggsave(snakemake@output[[1]], plot= p1, width= 140, height= 30.5  + 50/13 * nrow(temp_df), units= 'mm', dpi= 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Point-range figure of `estimate` with its lo95-up95 interval for each `term`,
# one panel per `outcome`.
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# 'fetal_effect_PGS' is shown as 'Fetal'; every other term as 'Maternal'.
d$term <- ifelse(d$term == 'fetal_effect_PGS', 'Fetal', 'Maternal')
# Drop the ' PGS' suffix from the panel titles.
d$outcome <- gsub(' PGS', '', d$outcome)

pgs_plot <- ggplot(d, aes(term, estimate, colour = term)) +
  geom_pointrange(aes(ymin = lo95, ymax = up95)) +
  facet_wrap(~ outcome) +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(2, 4)]) +
  theme_cowplot(10) +
  geom_hline(yintercept = 0, colour = 'grey', size = 0.5, linetype = 'dashed') +
  theme(strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA)) +
  labs(x = 'Birth weight genetic score',
       y = 'Effect on gestational duration \ngenetic score (95% CI), days')

ggsave(snakemake@output[[1]], plot = pgs_plot,
       width = 180, height = 100, units = 'mm', dpi = 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
library(MendelianRandomization)
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


# Scatter of the effect on gestational duration (BETA) against the effect on
# preterm delivery (BETA_ptd) for the top variants of both analyses, with
# +/- 1 SE error bars. Written twice: without and with the fill legend.
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
                      "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Top variants of the first analysis plus two hard-coded IDs, and the top
# variants of the second analysis.
top_ga <- fread(snakemake@input[[1]])
top_ga <- c(pull(top_ga, ID), '5:158058432:G:T', '3:156697097:A:G')

top_ptd <- fread(snakemake@input[[2]])
top_ptd <- pull(top_ptd, ID)

top <- unique(c(top_ga, top_ptd))

# Effect and standard error of those variants in each set of summary statistics.
ga <- fread(snakemake@input[[3]], select = c('ID', 'BETA', 'SE')) %>%
  filter(ID %in% top)

ptd <- fread(snakemake@input[[4]], select = c('ID', 'BETA', 'SE')) %>%
  filter(ID %in% top) %>%
  select(ID, BETA, SE)
names(ptd) <- c('ID', 'BETA_ptd', 'SE_ptd')

d <- inner_join(ga, ptd, by = 'ID')

# Which analysis each variant was a top hit in; one ID is hard-coded as a hit in both.
d$GWAS <- ifelse(d$ID == '5:157895049:C:T', 'Both phenotypes',
                 ifelse(d$ID %in% top_ptd, 'Preterm delivery', 'Gestational duration'))

gwas_colours <- colorBlindBlack8[c(4, 2, 1)]

# Everything the two figures share; only the fill scale differs.
scatter_base <- ggplot(d, aes(BETA, BETA_ptd, colour = GWAS, fill = GWAS)) +
  geom_errorbarh(aes(xmin = BETA - SE, xmax = BETA + SE, colour = GWAS, fill = GWAS),
                 size = 0.1, alpha = 0.7) +
  geom_errorbar(aes(ymin = BETA_ptd - SE_ptd, ymax = BETA_ptd + SE_ptd, colour = GWAS, fill = GWAS),
                size = 0.1, alpha = 0.7) +
  geom_point(size = 2, shape = 21, stroke = 0.1, alpha = 0.7) +
  scale_colour_manual(values = gwas_colours, guide = 'none') +
  labs(x = 'Maternal effect on gestational duration, days',
       y = 'Maternal effect on preterm delivery, log(OR)') +
  theme_cowplot(font_size = 8) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(axis.line.x = element_blank(),
        axis.line.y = element_blank(),
        axis.ticks = element_blank(),
        panel.grid.major = element_line(colour = 'grey', size = 0.05))

# Output 1: no legend at all.
plot_no_legend <- scatter_base +
  scale_fill_manual(values = gwas_colours, guide = 'none')

ggsave(snakemake@output[[1]], plot = plot_no_legend,
       width = 70, height = 70, units = 'mm', dpi = 300)

# Output 2: fill legend shown.
plot_with_legend <- scatter_base +
  scale_fill_manual(values = gwas_colours)

ggsave(snakemake@output[[2]], plot = plot_with_legend,
       width = 70, height = 70, units = 'mm', dpi = 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)


# The original script started by reading input 1 into `d`; that table was
# overwritten by the join below before ever being used, so the duplicate read
# has been dropped (the same file is read into `pph`).

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# Input 1: colocalization summary (uses protein, PP.H4.abf).
pph= fread(snakemake@input[[1]])
supp_table= pph
# Input 2: gene-based results (uses Gene, Pvalue).
geneb= fread(snakemake@input[[2]])

# Input 3: gene coordinates; the identifier is cut at its first '.' to give EID.
gene_dict= fread(snakemake@input[[3]])

names(gene_dict)= c('CHR', 'POS1', 'POS2', 'Gene', 'EnsembleID')

gene_dict$EID= vapply(strsplit(as.character(gene_dict$EnsembleID), ".", fixed= TRUE), '[[', character(1), 1)

d= inner_join(pph, gene_dict, by= c('protein'= 'EID')) %>% inner_join(., geneb, by= 'Gene')

# Input 5: per-SNP colocalization results; keep, for each protein, the SNP with
# the highest SNP.PP.H4.
z= fread(snakemake@input[[5]], select= c('z.df1', 'z.df2', 'SNP.PP.H4', 'protein', 'snp'))
z= arrange(z, desc(SNP.PP.H4))

z= group_by(z, protein) %>% filter(row_number()==1)

d= left_join(d, z, by= c('protein'))

d= separate(d, snp, into= c('CHR', 'POS', 'REF', 'EFF'), sep= ':', remove= FALSE)

# Direction of effect at that SNP: same sign of the two z-scores or not.
# (An earlier variant of this step polarised the z-scores by ancestral allele;
# it was commented out and has been removed.)
d$direction= with(d, ifelse((z.df1 * z.df2)>0, 'Same direction', 'Opposite'))

# Significance threshold for the gene-based test: 0.05 divided by the number of rows in `geneb`.
gene_p_threshold= 0.05 / nrow(geneb)

# Four groups from PP.H4.abf > 0.9 and Pvalue < threshold. The third branch
# uses >= so that a Pvalue exactly equal to the threshold with PP.H4.abf > 0.9
# is labelled 'Colocalize' instead of falling through to the last label.
d$gene_group= with(d, ifelse(PP.H4.abf> 0.9 & Pvalue< gene_p_threshold, 'Colocalize and gene-based significant',
	ifelse(Pvalue< gene_p_threshold & PP.H4.abf<= 0.9, 'Gene based significant',
	ifelse(PP.H4.abf> 0.9 & Pvalue>= gene_p_threshold, 'Colocalize', 'No colocalize and not significant'))))

# Input 4: effect size of the selected SNP, joined on snp == ID.
ga= fread(snakemake@input[[4]], select= c('ID', 'BETA'))

d= inner_join(d, ga, by= c('snp'= 'ID'))

p1= ggplot(d, aes(-log10(Pvalue), PP.H4.abf, fill= direction, alpha= (1 + PP.H4.abf) * -log10(Pvalue))) +
geom_point(shape=21, colour= 'black', size= 4) +
theme_cowplot(font_size= 10) +
scale_alpha_continuous(guide= F) +
scale_size_continuous(range = c(.001, 10), guide= F) +
scale_fill_manual(values= c(colorBlindBlack8[c(2, 4)]), guide= F) +
geom_text_repel(data= filter(d, PP.H4.abf> 0.9 | Pvalue< 0.05 / nrow(geneb)), aes(label= Gene), max.overlaps= 20, colour= 'black', size= 6/ .pt, max.time= 10, alpha= 1) +
geom_hline(yintercept= 0.9, colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
geom_vline(xintercept= -log10(0.05/nrow(geneb)), colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
scale_y_continuous(breaks= c(seq(0, 1, 0.25), 0.9), limits= c(0, 1), expand= expansion(mult= c(0.05,0))) +
ylab('Posterior probability of colocalization') +
xlab('-log10(Gene based p-value)')

ggsave(snakemake@output[[1]], plot= p1, width= 95, height= 95, units= 'mm', dpi= 300)

d= select(d, Gene, BETA, direction, Pvalue, PP.H4.abf, Pvalue, z.df1, z.df2)

fwrite(d, snakemake@output[[2]], sep= '\t')

p1= ggplot(d, aes(-log10(Pvalue), PP.H4.abf, fill= direction, alpha= (1 + PP.H4.abf) * -log10(Pvalue))) +
geom_point(shape=21, colour= 'black', size= 4) +
theme_cowplot(font_size= 10) +
scale_alpha_continuous('Legend') +
scale_size_continuous('Legend', range = c(.001, 10)) +
scale_fill_manual('Legend', values= c(colorBlindBlack8[c(2, 4)])) +
geom_text_repel(data= filter(d, PP.H4.abf> 0.9 | Pvalue< 0.05 / nrow(geneb)), aes(label= Gene), max.overlaps= 20, colour= 'black', size= 6/ .pt, max.time= 10, alpha= 1) +
geom_hline(yintercept= 0.9, colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
geom_vline(xintercept= -log10(0.05/nrow(geneb)), colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
scale_y_continuous(breaks= c(seq(0, 1, 0.25), 0.9), limits= c(0, 1), expand= expansion(mult= c(0.05,0))) +
ylab('Posterior probability of colocalization') +
xlab('-log10(Gene based p-value)')

ggsave(snakemake@output[[3]], plot= p1, width= 90, height= 90, units= 'mm', dpi= 300)

fwrite(supp_table, snakemake@output[[4]], sep= '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
library(scales)
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)
library(fmsb)

# Point-and-interval plot of genetic correlations (rg, +/- 1.96 * se) read
# from snakemake@input[[1]]; the figure is saved to snakemake@output[[1]].

# Eight-colour palette (black first) shared by the figures.
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

rg_table <- fread(snakemake@input[[1]])

# Relabel the second phenotype; anything that is neither post-term nor
# preterm is tagged 'GAnrm' and removed straight away.
d <- rg_table %>%
  mutate(
    p1 = 'Gestational\nduration',
    p2 = case_when(
      grepl('postTerm', p2) ~ 'Post-term\ndelivery',
      grepl('allPTD', p2) ~ 'Preterm\ndelivery',
      TRUE ~ 'GAnrm'
    )
  ) %>%
  filter(p2 != 'GAnrm')

# The y axis always spans at least [-1, 1] and widens to fit the intervals.
y_lower <- min(-1, min(d$rg - 1.96*d$se))
y_upper <- max(1, max(d$rg + 1.96 * d$se))

pheno_colours <- colorBlindBlack8[c(8,3,2)]

p1 <- ggplot(d, aes(p2, rg, colour = p2)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = I(rg - 1.96*se), ymax = (rg + 1.96 * se)),
    width = .2,
    position = position_dodge(.9)
  ) +
  theme_cowplot(font_size = 9) +
  scale_fill_manual(values = pheno_colours, guide = 'none') +
  scale_colour_manual(guide = 'none', values = pheno_colours) +
  xlab('Phenotype') +
  ylab('Genetic correlation [95% CI]') +
  theme(legend.position = 'none') +
  ylim(y_lower, y_upper) +
  geom_hline(yintercept = 0, linetype = 'dashed', colour = 'grey', size = 0.5)

ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 80, units = 'mm', dpi = 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# Point-and-interval plot of SNP heritability (h2, +/- 1.96 * se) per
# phenotype. Reads snakemake@input[[1]], writes snakemake@output[[1]].
options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# The table is read once (the earlier duplicate read was discarded unused).
d <- fread(snakemake@input[[1]])

d <- filter(d, pheno != 'GAnrm')

# Display labels: every phenotype other than 'GAraw' and 'allPTD' falls
# through to the post-term label.
d$pheno <- with(d, ifelse(pheno == 'GAraw', 'Gestational\nduration',
  ifelse(pheno == 'allPTD', 'Preterm\ndelivery', 'Post-term\ndelivery')))

p1 <- ggplot(d, aes(pheno, h2, colour = pheno)) +
  geom_point() +
  geom_errorbar(aes(ymin = I(h2 - 1.96*se), ymax = (h2 + 1.96 * se)), width = .2, position = position_dodge(.9)) +
  theme_cowplot(font_size = 9) +
  scale_fill_manual(values = colorBlindBlack8[c(8,3,2)], guide = 'none') +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(8,3,2)]) +
  xlab('Phenotype') +
  ylab('Common SNP heritability [95% CI]') +
  theme(legend.position = 'none',
        axis.text.x = element_text(angle = 45, hjust = 1))

ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 80, units = 'mm', dpi = 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# Per-cohort SNP heritability (h2, +/- 1.96 * se) for two traits, drawn as
# one facet per trait. Reads snakemake@input[[1]] and [[2]], writes
# snakemake@output[[1]].
options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# One table per trait; tag each with its facet label, then stack them.
duration_h2 <- fread(snakemake@input[[1]], header = TRUE)
duration_h2$trait <- 'Gestational\nduration'

preterm_h2 <- fread(snakemake@input[[2]], header = TRUE)
preterm_h2$trait <- 'Preterm delivery'

d <- rbind(duration_h2, preterm_h2)

cohort_colours <- colorBlindBlack8[c(8,3,2,6,7, 4, 1)]

p1 <- ggplot(d, aes(cohort, h2, colour = cohort)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = I(h2 - 1.96*se), ymax = (h2 + 1.96 * se)),
    width = .2,
    position = position_dodge(.9)
  ) +
  theme_cowplot(font_size = 9) +
  facet_wrap(vars(trait), ncol = 1) +
  scale_fill_manual(values = cohort_colours, guide = 'none') +
  scale_colour_manual(guide = 'none', values = cohort_colours) +
  xlab('Cohort') +
  ylab('Common SNP heritability [95% CI]') +
  theme(
    legend.position = 'none',
    strip.background = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 120, units = 'mm', dpi = 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
library(dplyr)
library(data.table)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(tidyr)
library(showtext)
# Circular bar plot of colocalization results (PP.H4.abf) between pregnancy
# traits and phenotypes from two resources, labelled 'UKBB' and 'FINNGEN'.
#
# Inputs:  [[1]] results + [[2]] phenotype manifest for the first resource,
#          [[3]] results + [[4]] phenotype manifest for the second resource.
# Outputs: [[1]] figure, [[2]] the plotted table (including spacer rows).
colorBlindBlack8  <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# ---- First resource ('UKBB') ----
d <- fread(snakemake@input[[1]])
names(d)[8] <- 'phenocode'
mani <- fread(snakemake@input[[2]])

trait_list <- c('biomarkers', 'continuous', 'icd10')
mani <- mani[mani$trait_type %in% trait_list, ]

# Keep phenotypes with heritability above 0.01; when a phenocode occurs more
# than once, the entry with the highest heritability is retained.
mani <- filter(mani, saige_heritability_EUR > 0.01)
mani <- mani[order(mani$saige_heritability_EUR, decreasing = TRUE), ]
mani <- mani[!duplicated(mani$phenocode), ]

mani$phenocode <- paste(mani$trait_type, mani$phenocode, sep = '_')
mani <- mani[, c('phenocode', 'description')]
mani <- mani[!duplicated(mani$description), ]

d <- inner_join(d, mani[, c('description', 'phenocode')], by = 'phenocode')
d$cohort <- 'UKBB'

# ---- Second resource ('FINNGEN') ----
x <- fread(snakemake@input[[3]])
names(x)[8] <- 'phenocode'
mani <- fread(snakemake@input[[4]])
mani <- mani[, c('phenocode', 'name')]
names(mani) <- c('phenocode', 'description')
mani <- mani[!duplicated(mani$description), ]

x <- inner_join(x, mani, by = 'phenocode')
x$cohort <- 'FINNGEN'

d <- rbind(d, x)
d <- d[order(d$PP.H4.abf, decreasing = FALSE), ]
d <- filter(d, PP.H4.abf > 0.01, PP.H4.abf + PP.H3.abf > 0.75)

# Append `empty_bar` all-NA rows per pregnancy trait; they become the gaps
# between groups in which the reference arcs and their labels are drawn.
d$preg_trait <- factor(d$preg_trait)
empty_bar <- 6
to_add <- data.frame(matrix(NA, empty_bar * nlevels(d$preg_trait), ncol(d)))
colnames(to_add) <- colnames(d)
to_add$preg_trait <- rep(levels(d$preg_trait), each = empty_bar)
d <- rbind(d, to_add)
d <- d %>% arrange(preg_trait)


d$id <- seq(1, nrow(d))

# Label angle follows the bar's position on the circle; labels on the left
# half are rotated by 180 degrees and right-aligned so they stay readable.
label_data <- d
number_of_bar <- nrow(label_data)
angle <- 90 - 360 * (label_data$id - 0.5) / number_of_bar
label_data$hjust <- ifelse(angle < -90, 1, 0)
label_data$angle <- ifelse(angle < -90, angle + 180, angle)

# Start / end position of each group's spacer rows.
base_data <- d %>%
  group_by(preg_trait) %>%
  filter(is.na(PP.H4.abf)) %>%
  summarize(start = min(id), end = max(id)) %>%
  rowwise() %>%
  mutate(title = mean(c(start, end)))

# Bars that receive a text label.
labelled <- filter(label_data, PP.H4.abf > 0.75)

# Arc heights are passed as constants so the segments are drawn for however
# many groups `base_data` contains.
p1 <- ggplot(d, aes(as.factor(id), PP.H4.abf, fill = preg_trait, alpha = PP.H4.abf)) +
  geom_bar(stat = "identity", colour = NA) +
  scale_alpha_continuous(range = c(0.4, 1), guide = 'none') +
  geom_segment(data = base_data, aes(x = end, y = 1, xend = start, yend = 1), colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE) +
  geom_segment(data = base_data, aes(x = end, y = 0.75, xend = start, yend = 0.75), colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE) +
  geom_segment(data = base_data, aes(x = end, y = 0.50, xend = start, yend = 0.50), colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE) +
  geom_segment(data = base_data, aes(x = end, y = 0.25, xend = start, yend = 0.25), colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE) +
  annotate("text", x = ((base_data$end[1] + base_data$start[1]) / 2), y = c((0.25 + 0.05), (0.50 + 0.05), (0.75 + 0.05), (1 + 0.05)), label = c("0.25", "0.50", "0.75", "1"), color = "grey", size = 3, angle = 0, fontface = "bold", hjust = 0.5) +
  annotate("text", x = ((base_data$end[2] + base_data$start[2]) / 2), y = c((0.25 + 0.05), (0.50 + 0.05), (0.75 + 0.05), (1 + 0.05)), label = c("0.25", "0.50", "0.75", "1"), color = "grey", size = 3, angle = 15, fontface = "bold", hjust = 0.5) +
  ylim(-0.2, 2) + # Limits of the plot = very important. The negative value controls the size of the inner circle, the positive one is useful to add size over each bar
  theme_cowplot() +
  scale_fill_manual(values = colorBlindBlack8[c(2,4)], guide = 'none') +
  scale_colour_manual(values = colorBlindBlack8[c(2,4)], guide = 'none') +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.margin = unit(rep(-2,4), "cm")    ) +
  coord_polar(start = 0) +
  geom_text(data = labelled, aes(x = factor(id), y = PP.H4.abf + 0.01, label = description, hjust = hjust), color = "black", fontface = "bold", alpha = 0.6, size = 2.5, angle = labelled$angle, inherit.aes = FALSE) +
  theme(panel.grid = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank())

save_plot(snakemake@output[[1]], p1, base_width = 8, base_height = 8)

fwrite(d, snakemake@output[[2]], sep = '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
library(data.table)
library(dplyr)
library(cowplot)
library(ggrepel)
library('showtext')


# Three scatter plots of enrichment p-values read from snakemake@input[[1]]:
#   output[[1]] -log10(p) against enrichment,
#   output[[2]] -log10(p) against gene-set size, legend hidden,
#   output[[3]] the same plot with the legend shown.
# Point size encodes whether the p-value passes 0.05 / (nrow(d) - 1);
# categories with p < 0.05 are labelled.
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# Freeze the category order as it appears in the file.
d$Category <- factor(d$Category, levels = unique(d$Category))

# Rows that get a text label in every plot.
nominal_hits <- filter(d, Enrichment_p< 0.05)

grid_theme <- theme(panel.grid.major = element_line(colour = 'grey', size = 0.05))
hide_legend <- theme(legend.position = "none")

# ---- Enrichment on the x axis ----
p1 <- ggplot(d, aes(Enrichment, -log10(Enrichment_p))) +
  geom_point(aes(size= Enrichment_p< 0.05/ (nrow(d)-1)), shape = 21, stroke = 0.1, fill = colorBlindBlack8[4]) +
  xlab('Heritability enrichment') +
  ylab('-log10(P-value)') +
  theme_cowplot(font_size = 8) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  grid_theme +
  hide_legend +
  geom_text_repel(
    data = nominal_hits,
    aes(Enrichment, -log10(Enrichment_p), label = Category),
    size = 8/.pt
  )

ggsave(snakemake@output[[1]], plot = p1, width = 120, height = 90, units = 'mm', dpi = 300)

# ---- Gene-set size on the x axis (shared by the next two figures) ----
size_plot <- ggplot(d, aes(n_genes, -log10(Enrichment_p))) +
  geom_point(aes(size= Enrichment_p< 0.05/ (nrow(d)-1)), shape = 21, stroke = 0.1, fill = colorBlindBlack8[4]) +
  xlab('Size of gene set') +
  ylab('-log10(Enrichment)') +
  theme_cowplot(font_size = 8) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  grid_theme +
  geom_text_repel(
    data = nominal_hits,
    aes(n_genes, -log10(Enrichment_p), label = Category),
    size = 8/.pt
  )

p2 <- size_plot + hide_legend

ggsave(snakemake@output[[2]], plot = p2, width = 90, height = 90, units = 'mm', dpi = 300)

p3 <- size_plot

ggsave(snakemake@output[[3]], plot = p3, width = 90, height = 90, units = 'mm', dpi = 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')



# Scatter plots comparing the maternal-genome effect (BETA) with the effects
# of maternal non-transmitted (MNT), paternal transmitted (PT) and maternal
# transmitted (MT) alleles.
#
# Inputs:  [[1]] per-variant haplotype effects (beta_* / se_* columns,
#          `class_name`, `rsid`); [[2]] table providing `RSID` and `BETA`.
# Outputs: [[1]] MNT plot, [[2]] PT plot, [[3]] MT plot without legend,
#          [[4]] MNT plot with a horizontal legend at the bottom,
#          [[5]] the merged table.
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

x <- fread(snakemake@input[[2]], select = c('RSID', 'BETA'))


d <- inner_join(d, x, by = c('rsid' = 'RSID'))

# Orient every variant so that BETA is non-negative. The haplotype effects
# must be flipped first, while BETA still carries its original sign.
d$beta_MNT <- with(d, ifelse(BETA < 0, -1 * beta_MNT, beta_MNT))
d$beta_PT <- with(d, ifelse(BETA < 0, -1 * beta_PT, beta_PT))
d$beta_MT <- with(d, ifelse(BETA < 0, -1 * beta_MT, beta_MT))
d$BETA <- with(d, ifelse(BETA < 0, -1 * BETA, BETA))

# 95% confidence limits (written to the output table; not drawn).
d$lo95_MT <- d$beta_MT - 1.96 * d$se_MT
d$up95_MT <- d$beta_MT + 1.96 * d$se_MT

d$lo95_MNT <- d$beta_MNT - 1.96 * d$se_MNT
d$up95_MNT <- d$beta_MNT + 1.96 * d$se_MNT

d$lo95_PT <- d$beta_PT - 1.96 * d$se_PT
d$up95_PT <- d$beta_PT + 1.96 * d$se_PT

# Expand the short class codes; unrecognised codes become ''.
d$class_name <- with(d, ifelse(class_name == 'MF SD', 'Maternal and fetal (same direction)', ifelse(class_name == 'Fetal MatT', 'Fetal effect, maternal transmitted only', ifelse(class_name == 'Maternal', 'Maternal', ifelse(class_name == 'Fetal', 'Fetal', ifelse(class_name == 'MF OD', 'Maternal and fetal (opposite direction)', ''))))))

# Shared scatter-plot recipe.
#   mapping       aes() giving x, y and colour
#   x_label       x-axis title
#   colour_guide  guide for the colour scale ('legend' or 'none')
effect_scatter <- function(mapping, x_label, colour_guide = 'legend') {
  ggplot(d, mapping) +
    geom_point(size = 0.5) +
    theme_cowplot(font_size = 8) +
    scale_colour_manual(values = c('grey', colorBlindBlack8[c(8, 2, 4, 3)]), guide = colour_guide) +
    geom_vline(xintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
    geom_hline(yintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
    xlab(x_label) +
    ylab('Effect size maternal genome, days')
}

p1 <- effect_scatter(
  aes(beta_MNT, BETA, colour = class_name),
  'Effect size maternal \nnon-transmitted alleles, days'
)

ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 60, units = 'mm', dpi = 300)

print('plot1')
p1 <- effect_scatter(
  aes(beta_PT, BETA, colour = class_name),
  'Effect size paternal \ntransmitted alleles, days'
)

ggsave(snakemake@output[[2]], plot = p1, width = 60, height = 60, units = 'mm', dpi = 300)

print('plot2')
p1 <- effect_scatter(
  aes(beta_MT, BETA, colour = class_name),
  'Effect size maternal \ntransmitted alleles, days',
  colour_guide = 'none'
)

ggsave(snakemake@output[[3]], plot = p1, width = 60, height = 60, units = 'mm', dpi = 300)

# Wide version of the MNT plot. The legend theme is now actually added to the
# plot: previously a missing `+` left it as a separate, discarded expression.
p1 <- effect_scatter(
  aes(beta_MNT, BETA, colour = class_name),
  'Effect size maternal \nnon-transmitted alleles, days'
) +
  theme(legend.direction = "horizontal", legend.position = "bottom")

ggsave(snakemake@output[[4]], plot = p1, width = 120, height = 60, units = 'mm', dpi = 300)
fwrite(d, snakemake@output[[5]], sep = '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# Horizontal bar chart of enrichment p-values for gene sets stacked from
# snakemake@input[[1]] and [[2]]. Writes the figure to output[[1]] and the
# annotated table to output[[2]].
options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

first_set <- fread(snakemake@input[[1]])
second_set <- fread(snakemake@input[[2]])

d <- rbind(first_set, second_set)

# Columns are renamed by position.
names(d) <- c('Name', 'no_no', 'no_yes', 'yes_no', 'yes_yes', 'candidate_gene', 'rest_genes', 'OR', 'pvalue')
d$enrichment <- d$candidate_gene / d$rest_genes

# Largest p-value first; the factor levels below freeze this order for the plot.
d <- arrange(d, desc(pvalue))

is_pli <- d$Name == 'pli'
is_dominant <- d$Name == 'dominant'
d$description <- ifelse(is_pli, 'Loss-of-function intolerant',
  ifelse(is_dominant, 'Dominant', 'Recessive'))

d$description <- factor(d$description, levels = unique(d$description))

# Bonferroni line: 0.05 divided by the number of gene sets plotted.
bonferroni_line <- -log10(0.05/nrow(d))

p1 <- ggplot(data = d, aes(x = description, y = -log10(pvalue))) +
  geom_col(fill = colorBlindBlack8[2], alpha = 0.6) +
  theme_cowplot(font_size = 10) +
  ylab('Enrichment -log10(pvalue)') +
  theme(axis.title.y = element_blank()) +
  geom_hline(yintercept = bonferroni_line, linetype = 'dashed', colour = 'grey') +
  coord_flip()

ggsave(snakemake@output[[1]], plot = p1, height = 35, width = 90, dpi = 300, units = 'mm')

fwrite(d, snakemake@output[[2]], sep = '\t')
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# Manhattan plot of one set of GWAS summary statistics.
#
# Inputs:  [[1]] summary statistics (uses ID, CHR, POS, pvalue, nearestGene),
#          [[2]] top-hit table (uses ID, nearestGene, CHR, POS, BETA).
# Output:  [[1]] the figure.
# Colour coding: 0 = not significant, 1 = within `lims` bp of one of the
# hard-coded positions in `ge`, 2 = genome-wide significant or within `lims`
# bp of a top hit.
options(warn = -1)


d <- fread(snakemake@input[[1]], header = TRUE)


colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Hard-coded positions that are coloured as 'Previous discovery'.
ge <- data.frame(CHR = c(5, 3, 1, 23, 3, 23), pos_ge = c(157895049, 127881613, 22470407, 115164770, 123068359, 131300571))

df <- arrange(d, pvalue)

dg <- fread(snakemake@input[[2]])
dg$GENE <- dg$nearestGene


don <- df %>%
    group_by(CHR)      %>%
    summarise(chr_len = max(POS)) %>%
    mutate(tot = cumsum(as.numeric(chr_len)) - chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(df, ., by = 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate(BPcum = POS + tot) %>%
    ungroup()

# Tick positions: the midpoint of every chromosome on the cumulative axis.
axisdf <- don %>% group_by(CHR) %>% summarize(center = (max(BPcum) + min(BPcum)) / 2)
names(axisdf) <- c('CHR', 'center')
HC <- -log10(5*10**-8)
dg <- dg %>% ungroup() %>% select(ID, GENE, CHR, POS, BETA)

don$disc <- ifelse(don$pvalue > 5*10**-8, 0, 2)

# GENE is filled only for rows whose ID is present in the top-hit table.
don <- left_join(don, select(dg, ID, GENE), by = 'ID')
names(dg) <- c('ID', 'GENE', 'CHR', 'POS_new', 'BETA')

lims <- 250000

don <- data.frame(don)
dg <- data.frame(dg)

# Set `disc` to `code` for every SNP on chromosome `chr` lying within
# `lims` bp of `pos`; all other rows keep their current value.
mark_window <- function(snps, chr, pos, code) {
  inside <- snps$CHR == as.integer(chr) &
    snps$POS >= as.integer(pos) - lims &
    snps$POS <= as.integer(pos) + lims
  snps$disc <- ifelse(inside, code, snps$disc)
  snps
}

for (i in seq_len(nrow(dg))) {
  don <- mark_window(don, dg[i, 'CHR'], dg[i, 'POS_new'], 2)
}

# Applied last, so the hard-coded regions override the marking above.
for (i in seq_len(nrow(ge))) {
  don <- mark_window(don, ge[i, 'CHR'], ge[i, 'pos_ge'], 1)
}

# Draw order: non-significant points first, new discoveries on top.
don <- don[order(don$disc, decreasing = FALSE, na.last = TRUE), ]
don$disc <- factor(don$disc, levels = c(0, 1, 2), labels = c('Not significant', 'Previous discovery', 'New discovery'))

cols <- c('Not significant' = 'grey', 'Previous discovery' = colorBlindBlack8[3], 'New discovery' = colorBlindBlack8[8])

# For matched rows the label is taken from the summary-statistics
# `nearestGene` column; unmatched rows stay NA.
don$GENE <- ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


don$GENE <- with(don, ifelse(GENE == 'CDC42', 'CDC42/ WNT4', ifelse(GENE == 'HIVEP3', 'HIVEP3/ EDN2', ifelse(GENE == 'TET3', 'TET3/ DGUOK-AS1', ifelse(GENE == 'TCEA2', 'TCEA2/ OPRL1', GENE)))))

don$logpval <- -log10(don$pvalue)

# Rows that receive a gene label (filter() also drops NA labels).
label_rows <- filter(don, GENE != '')

p1 <- ggplot(data = don, aes(x = BPcum, y = logpval, colour = disc)) +
  geom_point(size = 0.07) +   # Show all points
  theme_cowplot(font_size = 9) +
  scale_colour_manual(values = cols, guide = 'none') +
  scale_x_continuous(labels = c(1:19, '', 21,'', 'X'), breaks = axisdf$center, expand = c(0.03, 0.03)) +
  scale_y_continuous(expand = c(0, 0), limits = c(min(don$logpval) - 2, max(don$logpval) + 2), breaks = seq(0, 10, 5), labels = c(abs(seq(0, 10, 5)))) +
  ylab('-log10(pvalue)') +
  xlab('Chromosome') +
  geom_hline(yintercept = 0, size = 0.25, colour = 'black') +
  geom_hline(yintercept = c(HC, -HC), size = 0.2, linetype = 2, colour = '#878787') +
  coord_cartesian(clip = "off") +
  geom_text_repel(data = label_rows, aes(x = BPcum, y = logpval, label = GENE),
                  size = 6/ .pt,
                  force_pull = 0, # do not pull toward data points
                  force = 0.1,
                  nudge_y = ifelse(label_rows$logpval > 0, 1, -1),
                  direction = "both",
                  hjust = 0,
                  vjust = 0.5,
                  box.padding = 0.1,
                  angle = 0,
                  segment.size = 0.1,
                  segment.square = TRUE,
                  segment.inflect = FALSE,
                  segment.colour = colorBlindBlack8[8],
                  colour = ifelse(label_rows$disc == 'New discovery', colorBlindBlack8[8], colorBlindBlack8[3]),
                  segment.linetype = 4,
                  ylim = c(-Inf, 50),
                  xlim = c(-Inf, Inf)) +
  theme(legend.position = 'none',
        plot.margin = unit(c(t = 0, r = 0, b = 0, l = 0), 'cm'),
        text = element_text(family = "arial", size = 9),
        axis.line = element_line(size = 0.1))

save_plot(snakemake@output[[1]], plot = p1, base_height = 90, base_width = 185, units = 'mm', dpi = 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# Miami plot: phenotype 'GAraw' is drawn upwards (-log10 p) and 'allPTD'
# downwards (log10 p) on a shared chromosome axis.
#
# Inputs:  [[1]] summary statistics and [[2]] top hits for 'GAraw',
#          [[3]] summary statistics and [[4]] top hits for 'allPTD'.
# Output:  [[1]] the figure.
# Colour coding: 0 = not significant, 1 = within `lims` bp of one of the
# hard-coded positions in `ge`, 2 = genome-wide significant or within `lims`
# bp of a top hit of the same phenotype.
options(warn = -1)


d <- fread(snakemake@input[[1]], header = TRUE, select = c('ID', 'CHR', 'POS', 'pvalue', 'nearestGene'))
d$pheno <- 'GAraw'
x <- fread(snakemake@input[[3]], header = TRUE, select = c('ID', 'CHR', 'POS', 'pvalue', 'nearestGene'))
x$pheno <- 'allPTD'

d <- rbind(d, x)

rm(x)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Hard-coded positions that are coloured as 'Previous discovery'.
ge <- data.frame(CHR = c(5, 3, 1, 23, 3, 23), pos_ge = c(157895049, 127881613, 22470407, 115164770, 123068359, 131300571))

df <- arrange(d, pvalue)


dg <- fread(snakemake@input[[2]])
dg$GENE <- dg$nearestGene

ptd <- fread(snakemake@input[[4]])
ptd$GENE <- ptd$nearestGene

don <- df %>%
    group_by(CHR)      %>%
    summarise(chr_len = max(POS)) %>%
    mutate(tot = cumsum(as.numeric(chr_len)) - chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(df, ., by = 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate(BPcum = POS + tot) %>%
    ungroup()

# Tick positions: the midpoint of every chromosome on the cumulative axis.
axisdf <- don %>% group_by(CHR) %>% summarize(center = (max(BPcum) + min(BPcum)) / 2)
names(axisdf) <- c('CHR', 'center')
HC <- -log10(5*10**-8)
dg <- dg %>% ungroup() %>% select(ID, GENE, CHR, POS, BETA)
ptd <- ptd %>% ungroup %>% select(ID, GENE, CHR, POS, BETA)

don$disc <- ifelse(don$pvalue > 5*10**-8, 0, 2)

# Each phenotype is annotated with its own top-hit table; GENE is filled only
# for rows whose ID is present in that table.
don1 <- filter(don, pheno == 'GAraw') %>% left_join(., select(dg, ID, GENE), by = 'ID')
don2 <- filter(don, pheno != 'GAraw') %>% left_join(., select(ptd, ID, GENE), by = 'ID')
names(dg) <- c('ID', 'GENE', 'CHR', 'POS_new', 'BETA')
names(ptd) <- c('ID', 'GENE', 'CHR', 'POS_new', 'BETA')

lims <- 250000

dg <- data.frame(dg)
ptd <- data.frame(ptd)

# Set `disc` to `code` for every SNP on chromosome `chr` lying within
# `lims` bp of `pos`; all other rows keep their current value.
mark_window <- function(snps, chr, pos, code) {
  inside <- snps$CHR == as.integer(chr) &
    snps$POS >= as.integer(pos) - lims &
    snps$POS <= as.integer(pos) + lims
  snps$disc <- ifelse(inside, code, snps$disc)
  snps
}

for (i in seq_len(nrow(dg))) {
  don1 <- mark_window(don1, dg[i, 'CHR'], dg[i, 'POS_new'], 2)
}

for (i in seq_len(nrow(ptd))) {
  don2 <- mark_window(don2, ptd[i, 'CHR'], ptd[i, 'POS_new'], 2)
}

don <- rbind(don1, don2)
rm(don1) ; rm(don2)

# Applied last and to both phenotypes, so the hard-coded regions override the
# marking above.
for (i in seq_len(nrow(ge))) {
  don <- mark_window(don, ge[i, 'CHR'], ge[i, 'pos_ge'], 1)
}

# Draw order: non-significant points first, new discoveries on top.
don <- don[order(don$disc, decreasing = FALSE, na.last = TRUE), ]
don$disc <- factor(don$disc, levels = c(0, 1, 2), labels = c('Not significant', 'Previous discovery', 'New discovery'))

cols <- c('Not significant' = 'grey', 'Previous discovery' = colorBlindBlack8[3], 'New discovery' = colorBlindBlack8[8])

# For matched rows the label is taken from the summary-statistics
# `nearestGene` column; unmatched rows stay NA.
don$GENE <- ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


don$GENE <- with(don, ifelse(GENE == 'CDC42', 'CDC42/ WNT4', ifelse(GENE == 'HIVEP3', 'HIVEP3/ EDN2', ifelse(GENE == 'TET3', 'TET3/ DGUOK-AS1', ifelse(GENE == 'TCEA2', 'TCEA2/ OPRL1', GENE)))))

# 'allPTD' keeps the sign of log10(p) (negative), which mirrors it below zero.
don$logpval <- with(don, ifelse(pheno == 'allPTD', log10(pvalue), -log10(pvalue)))

# Rows that receive a gene label (filter() also drops NA labels).
label_rows <- filter(don, GENE != '')

p1 <- ggplot(data = don, aes(x = BPcum, y = logpval, colour = disc)) +
  geom_point(size = 0.07) +   # Show all points
  theme_cowplot(font_size = 9) +
  scale_colour_manual(values = cols, guide = 'none') +
  scale_x_continuous(labels = c(1:19, '', 21,'', 'X'), breaks = axisdf$center, expand = c(0.03, 0.03)) +
  # Axis labels show absolute values so both halves read as -log10(p).
  scale_y_continuous(expand = c(0, 0), limits = c(min(don$logpval) - 2, max(don$logpval) + 2), breaks = seq(-30, 45, 10), labels = c(abs(seq(-30, 45, 10)))) +
  ylab('-log10(pvalue)') +
  xlab('Chromosome') +
  geom_hline(yintercept = 0, size = 0.25, colour = 'black') +
  geom_hline(yintercept = c(HC, -HC), size = 0.2, linetype = 2, colour = '#878787') +
  coord_cartesian(clip = "off") +
  geom_text_repel(data = label_rows, aes(x = BPcum, y = logpval, label = GENE),
                  size = 6/ .pt,
                  force_pull = 0, # do not pull toward data points
                  force = 0.1,
                  nudge_y = ifelse(label_rows$logpval > 0, 1, -1),
                  direction = "both",
                  hjust = 0,
                  vjust = 0.5,
                  box.padding = 0.1,
                  angle = 0,
                  segment.size = 0.1,
                  segment.square = TRUE,
                  segment.inflect = FALSE,
                  segment.colour = colorBlindBlack8[8],
                  colour = ifelse(label_rows$disc == 'New discovery', colorBlindBlack8[8], colorBlindBlack8[3]),
                  segment.linetype = 4,
                  ylim = c(-Inf, 50),
                  xlim = c(-Inf, Inf)) +
  theme(legend.position = 'none',
        plot.margin = unit(c(t = 0, r = 0, b = 0, l = 0), 'cm'),
        text = element_text(family = "arial", size = 9),
        axis.line = element_line(size = 0.1))

save_plot(snakemake@output[[1]], plot = p1, base_height = 90, base_width = 180, units = 'mm', dpi = 300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# Density plots of the relative change between two effect-size columns
# (Beta1, Beta2) for two variant sets, tagged 'fetal_effect' (input [[1]])
# and 'maternal_effect' (input [[2]]).
#
# Outputs: [[1]] mirrored densities (maternal above, fetal below the axis),
#          [[2]] the table with the computed `beta_dif`,
#          [[3]] overlaid densities.
options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Annotation font size, in points.
as1 <- 8

d <- fread(snakemake@input[[1]])
d$effect <- 'fetal_effect'
x <- fread(snakemake@input[[2]])
x$effect <- 'maternal_effect'

d <- rbind(d, x)

d <- filter(d, !(rsID %in% c('rs7819593', 'rs41311445')))

# Orient every variant so that Beta1 is non-negative. Beta2 must be flipped
# first, while Beta1 still carries its original sign.
d$Beta2 <- ifelse(d$Beta1 < 0, -1 * d$Beta2, d$Beta2)
d$Beta1 <- ifelse(d$Beta1 < 0, -1 * d$Beta1, d$Beta1)

# Relative difference of Beta2 with respect to Beta1.
d$beta_dif <- with(d, (Beta2 - Beta1) / Beta1)

mor <- filter(d, effect == 'maternal_effect') %>% pull(beta_dif)
barn <- filter(d, effect == 'fetal_effect') %>% pull(beta_dif)

# Mirrored version: the fetal density is negated so it is drawn downwards.
p1 <- ggplot() +
  geom_density(mapping = aes(x = mor, y = ..density..), fill = colorBlindBlack8[3], colour = colorBlindBlack8[3]) +
  annotate('text', x = 0.35, y = 0.6, label = "Maternal", color = colorBlindBlack8[3], size = as1/ .pt, fontface = 'bold') +
  annotate('text', x = 0.35, y = -1, label = "Fetal", color = colorBlindBlack8[8], size = as1/ .pt, fontface = 'bold') +
  geom_density(mapping = aes(x = barn, y = -..density..), fill = colorBlindBlack8[8], colour = colorBlindBlack8[8]) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab("Relative difference in effect size on \nbirth weight with or without adjusting for gestational duration") +
  ylab('Density') +
  geom_hline(yintercept = 0, colour = 'grey') +
  theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks = element_line(size = 0.3))


ggsave(snakemake@output[[1]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)


# Overlaid version with semi-transparent fills.
p1 <- ggplot(d, aes(beta_dif, group = effect, fill = effect)) +
  geom_hline(yintercept = 0, colour = 'black') +
  geom_density(color = NA) +
  annotate('text', x = -1.5, y = 0.8, label = "Maternal", color = colorBlindBlack8[3], size = as1/ .pt, fontface = 'bold') +
  annotate('text', x = 1, y = 0.8, label = "Fetal", color = colorBlindBlack8[8], size = as1/ .pt, fontface = 'bold') +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8,3)], 0.5), guide = 'none') +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.05)) +
  xlab("Relative difference in effect size on birth weight\nwith or without adjusting for gestational duration") +
  ylab('Density') +
  theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks = element_line(size = 0.3)) +
  # `linetype` was misspelled before, so the dashed style was never applied.
  geom_vline(xintercept = 0, linetype = 'dashed', colour = 'grey')

ggsave(snakemake@output[[3]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[2]], sep = '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Eight-colour palette; the density plots below use entries 3 and 8.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Text sizes in points; `as1` is divided by `.pt` where annotations are drawn.
as <- 8
as1 <- 8

d <- fread(snakemake@input[[1]])

# Orient each h2 pair so that beta_h2 is non-negative. The sign test is taken
# before beta_h2 is overwritten, so beta_h2_GA is flipped for the same rows.
flip_h2 <- d$beta_h2 < 0
d$beta_h2_GA <- ifelse(flip_h2, -1 * d$beta_h2_GA, d$beta_h2_GA)
d$beta_h2 <- ifelse(flip_h2, -1 * d$beta_h2, d$beta_h2)

# Same orientation for the h3 pair.
flip_h3 <- d$beta_h3 < 0
d$beta_h3_GA <- ifelse(flip_h3, -1 * d$beta_h3_GA, d$beta_h3_GA)
d$beta_h3 <- ifelse(flip_h3, -1 * d$beta_h3, d$beta_h3)

# Relative change from the plain beta to its *_GA counterpart.
d$beta_dif_h2 <- (d$beta_h2_GA - d$beta_h2) / d$beta_h2
d$beta_dif_h3 <- (d$beta_h3_GA - d$beta_h3) / d$beta_h3

# Maternal rows contribute their h2 difference, fetal rows their h3 difference.
mor <- d %>% filter(effect == "maternal_effect") %>% pull(beta_dif_h2)
barn <- d %>% filter(effect == "fetal_effect") %>% pull(beta_dif_h3)

# Mirrored density plot: the maternal distribution is drawn above the axis and
# the fetal distribution (negated density) below it.
maternal_col <- colorBlindBlack8[3]
fetal_col <- colorBlindBlack8[8]
thin_line <- element_line(size = 0.3)

p1 <- ggplot() +
  geom_density(
    mapping = aes(x = mor, y = ..density..),
    fill = maternal_col, colour = maternal_col
  ) +
  annotate(
    "text", x = 0.35, y = 0.6, label = "Maternal",
    color = maternal_col, size = as1 / .pt, fontface = "bold"
  ) +
  annotate(
    "text", x = 0.35, y = -1, label = "Fetal",
    color = fetal_col, size = as1 / .pt, fontface = "bold"
  ) +
  geom_density(
    mapping = aes(x = barn, y = -..density..),
    fill = fetal_col, colour = fetal_col
  ) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab("Relative difference in effect size on \nbirth weight with or without adjusting for gestational duration") +
  ylab("Density") +
  geom_hline(yintercept = 0, colour = "grey") +
  theme(
    axis.line.x = thin_line,
    axis.line.y = thin_line,
    axis.ticks = thin_line
  )

# 70 x 70 mm figure written to the first declared output.
ggsave(
  snakemake@output[[1]],
  plot = p1, width = 70, height = 70, units = "mm", dpi = 300
)



# Build one long table with a single `beta_dif` column: maternal rows take the
# h2 difference, fetal rows the h3 difference. A rename inside select() yields
# the same two columns (beta_dif, effect) that the former gather() + select()
# produced.
moms <- filter(d, effect == "maternal_effect") %>%
  select(beta_dif = beta_dif_h2, effect)
fets <- filter(d, effect == "fetal_effect") %>%
  select(beta_dif = beta_dif_h3, effect)

d <- rbind(moms, fets)

# Overlaid densities, one per effect origin. Fill colours are assigned in the
# alphabetical order of `effect`: fetal_effect -> palette entry 8,
# maternal_effect -> palette entry 3, both at 50% opacity.
thin_line <- element_line(size = 0.3)

p1 <- ggplot(d, aes(beta_dif, group = effect, fill = effect)) +
  geom_hline(yintercept = 0, colour = "black") +
  geom_density(color = NA) +
  annotate(
    "text", x = -2, y = 0.4, label = "Maternal",
    color = colorBlindBlack8[3], size = as1 / .pt, fontface = "bold"
  ) +
  annotate(
    "text", x = 1, y = 0.8, label = "Fetal",
    color = colorBlindBlack8[8], size = as1 / .pt, fontface = "bold"
  ) +
  theme_cowplot(font_size = 8) +
  # scale_colour_manual(values= alpha(colorBlindBlack8[c(8,3)], 0.5), guide= 'none') +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5), guide = "none") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.05)) +
  xlab("Relative difference in effect size on birth weight\nwith or without adjusting for gestational duration") +
  ylab("Density") +
  theme(
    axis.line.x = thin_line,
    axis.line.y = thin_line,
    axis.ticks = thin_line
  ) +
  # The argument was previously misspelled ("linetpye"), which ggplot2 ignores,
  # so the zero reference line was drawn solid instead of dashed.
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey")

# Figure goes to output 3; the long table behind it goes to output 2.
ggsave(
  snakemake@output[[3]],
  plot = p1, width = 70, height = 70, units = "mm", dpi = 300
)

fwrite(d, snakemake@output[[2]], sep = "\t")
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
library(MendelianRandomization)
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


# Eight-colour palette; the scatter plots below use entries 4, 2 and 1.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


d <- fread(snakemake@input[[1]])

# Two chrX entries carry a positional id instead of an rsID; replace them so
# the join on RSID below can match them.
is_x_116013571 <- d$rsid == "chrX:116013571"
is_x_132178061 <- d$rsid == "chrX:132178061"
d$rsid <- ifelse(
  is_x_116013571, "rs5991030",
  ifelse(is_x_132178061, "rs5930554", d$rsid)
)

# Collapse `class_name` into three origin groups; anything that is neither an
# "MF" nor a "Fetal" class is labelled "Maternal".
origin_both <- d$class_name == "MF OD" | d$class_name == "MF SD"
origin_fetal <- d$class_name == "Fetal MatT" | d$class_name == "Fetal"
d$effect_origin <- ifelse(
  origin_both, "Maternal and fetal",
  ifelse(origin_fetal, "Fetal", "Maternal")
)

#d= filter(d, MarkerName!= '6:32595083:G:T')

#top= fread(snakemake@input[[2]])
#ids= pull(top, ID)
#ids= c('3:156697097:A:G', '5:158058432:G:T', ids)

# rsID -> variant ID lookup (input 2).
x <- fread(snakemake@input[[2]], select = c("ID", "RSID"))

#x= filter(x, ID %in% ids)

d <- inner_join(d, x, by = c("rsid" = "RSID"))

# ID is split on ":" into CHR, POS, REF and EFF. Whenever REF sorts after EFF
# the three haplotype betas change sign and the ID is rebuilt with the two
# alleles swapped, i.e. alleles end up in alphabetical order in the ID.
# NOTE(review): presumably this matches the allele ordering used by the IDs in
# input 3 -- confirm against the upstream rule.
d <- separate(d, ID, into = c("CHR", "POS", "REF", "EFF"), sep = ":")
swap <- d$REF > d$EFF
d$beta_MT <- ifelse(swap, -1 * d$beta_MT, d$beta_MT)
d$beta_MNT <- ifelse(swap, -1 * d$beta_MNT, d$beta_MNT)
d$beta_PT <- ifelse(swap, -1 * d$beta_PT, d$beta_PT)

d$ID <- ifelse(
  swap,
  paste(d$CHR, d$POS, d$EFF, d$REF, sep = ":"),
  paste(d$CHR, d$POS, d$REF, d$EFF, sep = ":")
)

# The y-axis label prefix is derived from the path of input 3.
outcome <- ifelse(grepl("fetal", snakemake@input[[3]]), "Fetal", "Maternal")

# Outcome summary statistics, joined on the harmonised ID.
x <- fread(snakemake@input[[3]], select = c("ID", "BETA", "SE", "pvalue"))

d <- inner_join(d, x, by = "ID")

# Runs mr_allmethods() on one exposure/outcome pair and returns its `Values`
# table with the columns renamed to method/estimate/se/lo95/up95/pvalue.
run_all_mr <- function(bx, bxse, by, byse) {
  fit <- mr_allmethods(mr_input(bx = bx, bxse = bxse, by = by, byse = byse))
  res <- fit$Values
  names(res) <- c("method", "estimate", "se", "lo95", "up95", "pvalue")
  res
}

# For each haplotype the rows are oriented so the exposure beta is
# non-negative; the outcome BETA is flipped for the same rows. The sign test
# is taken before the exposure column is overwritten.

# Maternal transmitted alleles.
df_MT <- select(d, beta_MT, se_MT, BETA, SE, effect_origin)
neg_MT <- df_MT$beta_MT < 0
df_MT$BETA <- ifelse(neg_MT, df_MT$BETA * -1, df_MT$BETA)
df_MT$beta_MT <- ifelse(neg_MT, df_MT$beta_MT * -1, df_MT$beta_MT)
MT <- run_all_mr(df_MT$beta_MT, df_MT$se_MT, df_MT$BETA, df_MT$SE)

# Maternal non-transmitted alleles.
df_MNT <- select(d, beta_MNT, se_MNT, BETA, SE, effect_origin)
neg_MNT <- df_MNT$beta_MNT < 0
df_MNT$BETA <- ifelse(neg_MNT, df_MNT$BETA * -1, df_MNT$BETA)
df_MNT$beta_MNT <- ifelse(neg_MNT, df_MNT$beta_MNT * -1, df_MNT$beta_MNT)
MNT <- run_all_mr(df_MNT$beta_MNT, df_MNT$se_MNT, df_MNT$BETA, df_MNT$SE)

# Paternal transmitted alleles; the row count is printed to the log.
df_PT <- select(d, beta_PT, se_PT, BETA, SE, effect_origin)
print(nrow(df_PT))
neg_PT <- df_PT$beta_PT < 0
df_PT$BETA <- ifelse(neg_PT, df_PT$BETA * -1, df_PT$BETA)
df_PT$beta_PT <- ifelse(neg_PT, df_PT$beta_PT * -1, df_PT$beta_PT)
PT <- run_all_mr(df_PT$beta_PT, df_PT$se_PT, df_PT$BETA, df_PT$SE)

# Scatter of per-variant effects (maternal transmitted alleles) with +/- 1 SE
# bars, the IVW line through the origin (solid) and the MR-Egger line (dashed).
# "(intercept)" occurs several times in the results table; the first one is used.
ivw_slope <- filter(MT, method == "IVW") %>% pull(estimate)
egger_slope <- filter(MT, method == "MR-Egger") %>% pull(estimate)
egger_intercept <- (filter(MT, method == "(intercept)") %>% pull(estimate))[1]
origin_cols <- colorBlindBlack8[c(4, 2, 1)]

p1 <- ggplot(df_MT, aes(beta_MT, BETA, colour = effect_origin, fill = effect_origin)) +
  geom_errorbarh(
    aes(xmin = beta_MT - se_MT, xmax = beta_MT + se_MT, colour = effect_origin, fill = effect_origin),
    size = 0.1, alpha = 0.7
  ) +
  geom_errorbar(
    aes(ymin = BETA - SE, ymax = BETA + SE, colour = effect_origin, fill = effect_origin),
    size = 0.1, alpha = 0.7
  ) +
  geom_point(size = 2, shape = 21, stroke = 0.1, alpha = 0.7) +
  scale_colour_manual(values = origin_cols, guide = "none") +
  scale_fill_manual(values = origin_cols, guide = "none") +
  xlab("Effect of maternal transmitted\nalleles on gestational duration, days") +
  ylab(paste(outcome, "only effect\non birth weight, z-score")) +
  theme_cowplot(font_size = 8) +
  geom_abline(intercept = 0, slope = ivw_slope, colour = "#d9d9d9") +
  geom_abline(
    intercept = egger_intercept, slope = egger_slope,
    colour = "#d9d9d9", linetype = "dashed"
  ) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = "grey", size = 0.05)
  )

# Scatter of per-variant effects (maternal non-transmitted alleles) with
# +/- 1 SE bars, the IVW line through the origin (solid) and the MR-Egger line
# (dashed). Transparency is applied through the colour scales in this panel.
ivw_slope <- filter(MNT, method == "IVW") %>% pull(estimate)
egger_slope <- filter(MNT, method == "MR-Egger") %>% pull(estimate)
egger_intercept <- (filter(MNT, method == "(intercept)") %>% pull(estimate))[1]
origin_cols <- alpha(colorBlindBlack8[c(4, 2, 1)], 0.7)

p2 <- ggplot(df_MNT, aes(beta_MNT, BETA, colour = effect_origin, fill = effect_origin)) +
  geom_errorbarh(
    aes(xmin = beta_MNT - se_MNT, xmax = beta_MNT + se_MNT, colour = effect_origin, fill = effect_origin),
    size = 0.1
  ) +
  geom_errorbar(
    aes(ymin = BETA - SE, ymax = BETA + SE, colour = effect_origin, fill = effect_origin),
    size = 0.1
  ) +
  geom_point(size = 2, shape = 21, stroke = 0.1) +
  scale_colour_manual(values = origin_cols, guide = "none") +
  scale_fill_manual(values = origin_cols, guide = "none") +
  xlab("Effect of maternal non-transmitted alleles\non gestational duration, days") +
  ylab(paste(outcome, "only effect\non birth weight, z-score")) +
  theme_cowplot(font_size = 8) +
  geom_abline(intercept = 0, slope = ivw_slope, colour = "#d9d9d9") +
  geom_abline(
    intercept = egger_intercept, slope = egger_slope,
    colour = "#d9d9d9", linetype = "dashed"
  ) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = "grey", size = 0.05)
  )

# Scatter of per-variant effects (paternal transmitted alleles) with +/- 1 SE
# bars, the IVW line through the origin (solid) and the MR-Egger line (dashed).
# Transparency comes from the colour scales; the vertical bars additionally
# carry alpha = 0.5.
ivw_slope <- filter(PT, method == "IVW") %>% pull(estimate)
egger_slope <- filter(PT, method == "MR-Egger") %>% pull(estimate)
egger_intercept <- (filter(PT, method == "(intercept)") %>% pull(estimate))[1]
origin_cols <- alpha(colorBlindBlack8[c(4, 2, 1)], 0.7)

p3 <- ggplot(df_PT, aes(beta_PT, BETA, colour = effect_origin, fill = effect_origin)) +
  geom_errorbarh(
    aes(xmin = beta_PT - se_PT, xmax = beta_PT + se_PT, colour = effect_origin, fill = effect_origin),
    size = 0.1
  ) +
  geom_errorbar(
    aes(ymin = BETA - SE, ymax = BETA + SE, colour = effect_origin, fill = effect_origin),
    alpha = 0.5, size = 0.1
  ) +
  geom_point(size = 2, shape = 21, stroke = 0.1) +
  scale_colour_manual(values = origin_cols, guide = "none") +
  scale_fill_manual(values = origin_cols, guide = "none") +
  xlab("Effect of paternal transmitted alleles\non gestational duration, days") +
  ylab(paste(outcome, "only effect\non birth weight, z-score")) +
  theme_cowplot(font_size = 8) +
  geom_abline(intercept = 0, slope = ivw_slope, colour = "#d9d9d9") +
  geom_abline(
    intercept = egger_intercept, slope = egger_slope,
    colour = "#d9d9d9", linetype = "dashed"
  ) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = "grey", size = 0.05)
  )

# Outputs 1-3 receive the three scatter plots, each 70 x 70 mm at 300 dpi.
panels <- list(p1, p2, p3)
for (i in seq_along(panels)) {
  ggsave(
    snakemake@output[[i]],
    plot = panels[[i]], width = 70, height = 70, units = "mm", dpi = 300
  )
}

# Tag each results table with its haplotype and stack them.
MT$haplotype <- "MT"
MNT$haplotype <- "MNT"
PT$haplotype <- "PT"

df <- bind_rows(MT, MNT, PT)

# Output 4: harmonised per-variant table; output 5: stacked MR estimates.
fwrite(d, snakemake@output[[4]], sep = "\t")
fwrite(df, snakemake@output[[5]], sep = "\t")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library(MendelianRandomization)
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


# Eight-colour palette; the scatter plots below use entry 2.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# Input 1: per-variant table with lower-case beta/se and a `trait` column.
# Input 2: table providing upper-case BETA/SE per ID.
# Input 3: MR estimates per trait and method.
d <- fread(snakemake@input[[1]])
x <- fread(snakemake@input[[2]], select = c("ID", "BETA", "SE"))

mr <- fread(snakemake@input[[3]])

# Join on ID and keep only the first row for each ID.
d <- inner_join(d, x, by = "ID")
d <- filter(d, !duplicated(ID))

# Orient rows so `beta` is non-negative; BETA is flipped for the same rows.
# The sign test is taken before `beta` is overwritten.
neg_beta <- d$beta < 0
d$BETA <- ifelse(neg_beta, -1 * d$BETA, d$BETA)
d$beta <- ifelse(neg_beta, -1 * d$beta, d$beta)


shbg <- filter(d, trait == "SHBG_fem_cluster")
testo <- filter(d, trait == "Testosterone_fem_cluster")

# Builds one exposure-vs-outcome scatter with +/- 1 SE bars, the IVW line
# through the origin (solid) and the MR-Egger line (dashed).
#   dat          per-variant table with beta/se and BETA/SE columns
#   wanted_trait value of `mr$trait` whose estimates supply the fit lines
#   x_label      x-axis title
# Reads `mr` and `colorBlindBlack8` from the enclosing script.
mr_scatter <- function(dat, wanted_trait, x_label) {
  fits <- filter(mr, trait == wanted_trait)
  ivw_slope <- filter(fits, method == "IVW") %>% pull(estimate)
  egger_slope <- filter(fits, method == "MR-Egger") %>% pull(estimate)
  # "(intercept)" can occur more than once; the first occurrence is used.
  egger_intercept <- (filter(fits, method == "(intercept)") %>% pull(estimate))[1]
  pt_col <- colorBlindBlack8[2]

  ggplot(dat, aes(beta, BETA), color = pt_col) +
    geom_errorbarh(
      aes(xmin = beta - se, xmax = beta + se),
      size = 0.1, alpha = 0.7, color = pt_col
    ) +
    geom_errorbar(
      aes(ymin = BETA - SE, ymax = BETA + SE),
      size = 0.1, alpha = 0.7, color = pt_col
    ) +
    geom_point(size = 2, shape = 21, stroke = 0.1, alpha = 0.7, fill = pt_col) +
    xlab(x_label) +
    ylab("Effect on gestational duration, days") +
    theme_cowplot(font_size = 8) +
    geom_abline(intercept = 0, slope = ivw_slope, colour = "#d9d9d9") +
    geom_abline(
      intercept = egger_intercept, slope = egger_slope,
      colour = "#d9d9d9", linetype = "dashed"
    ) +
    geom_hline(yintercept = 0, size = 0.1) +
    geom_vline(xintercept = 0, size = 0.1) +
    theme(
      axis.line.x = element_blank(),
      axis.line.y = element_blank(),
      axis.ticks = element_blank(),
      panel.grid.major = element_line(colour = "grey", size = 0.05)
    )
}

p1 <- mr_scatter(shbg, "SHBG_fem_cluster", "Effect on SHBG (women), nmol/L")
p2 <- mr_scatter(testo, "Testosterone_fem_cluster", "Effect on testosterone (women), nmol/L")

# One 70 x 70 mm figure per trait.
ggsave(snakemake@output[[1]], plot = p1, width = 70, height = 70, units = "mm", dpi = 300)
ggsave(snakemake@output[[2]], plot = p2, width = 70, height = 70, units = "mm", dpi = 300)
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Eight-colour palette; the bar charts below use entries 2 and 4.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# Row count before filtering; the significance threshold is 0.05 / (nr - 1).
# NOTE(review): the "- 1" presumably discounts one non-tested row -- confirm.
nr <- nrow(d)
p_cutoff <- 0.05 / (nr - 1)

d <- filter(d, Enrichment_p < p_cutoff)

# Readable labels for the categories that have one; any other category gets NA.
category_labels <- c(
  H3K27ac_HniszL2_0 = "H3K27ac",
  SuperEnhancer_HniszL2_0 = "SuperEnhancer",
  Backgrd_Selection_StatL2_0 = "Background selection",
  CpG_Content_50kbL2_0 = "CpG content",
  BLUEPRINT_DNA_methylation_MaxCPPL2_0 = "DNA Methylation"
)
d$description <- unname(category_labels[d$Category])

# Sort by descending p-value and freeze that order as the factor levels, which
# fixes the bar order in the plots.
d <- arrange(d, desc(Enrichment_p))
d$description <- factor(d$description, levels = unique(d$description))

# Left panel: -log10 enrichment p-value per category, with the same
# 0.05 / (nr - 1) threshold used for filtering shown as a dashed line.
p1 <- ggplot(data = d, aes(x = description, y = -log10(Enrichment_p))) +
  geom_col(fill = colorBlindBlack8[2], alpha = 0.6) +
  theme_cowplot(font_size = 10) +
  ylab("Enrichment -log10(pvalue)") +
  theme(axis.title.y = element_blank()) +
  geom_hline(
    yintercept = -log10(0.05 / (nr - 1)),
    linetype = "dashed", colour = "grey"
  ) +
  coord_flip()

# Right panel: enrichment estimate per category; the category axis text and
# ticks are hidden, and a dashed line marks enrichment = 1.
p2 <- ggplot(data = d, aes(x = description, y = Enrichment)) +
  geom_col(fill = colorBlindBlack8[4], alpha = 0.6) +
  theme_cowplot(font_size = 10) +
  ylab("Enrichment (h2 / proportion of SNPs)") +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "grey") +
  coord_flip()

# Place both panels side by side.
x <- plot_grid(p1, p2)


# Output 1: combined figure (140 x 50 mm); output 2: the filtered table.
ggsave(
  snakemake@output[[1]],
  plot = x, height = 50, width = 140, units = "mm", dpi = 300
)

fwrite(d, snakemake@output[[2]], sep = "\t")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)


# QQ plot of the p-values in input 1.

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Read only the three columns that are used. The file was previously also read
# in full once before this call and that result was overwritten unused; the
# extra read is dropped. `header` is spelled out instead of relying on partial
# matching of `h`, and TRUE replaces the reassignable `T`.
d <- fread(
  snakemake@input[[1]],
  header = TRUE, select = c("ID", "pvalue", "EAF")
)
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

# Keep one row per ID: after sorting by p-value, the first occurrence is the
# one with the smallest p-value.
d <- arrange(d, pvalue)
d <- d[!duplicated(d$ID), ]


# MAF tertile per variant (only referenced by the commented-out labelling below).
d <- mutate(d, maf_tertiles = ntile(MAF, 3))
#m1= round(max(d[d$maf_tertiles== 1, 'MAF']), 3)
#m2= round(max(d[d$maf_tertiles== 2, 'MAF']), 3)


#d$maf_tertiles= factor(d$maf_tertiles, levels=c("1", "2", "3"), labels=c(paste('MAF<', m1), paste(m1,'< MAF >', m2), paste('MAF>', m2)))

# Expected -log10(p) from the rank of each sorted p-value. seq_along() keeps
# this valid for an empty table, where 1:length() would yield c(1, 0).
df <- arrange(d, pvalue) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue)))

# Only points with p < 0.05 are drawn; the identity line marks expected = observed.
p1 <- ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue))) +
  geom_point(size = 0.4, color = colorBlindBlack8[2]) +
  #scale_color_manual(values= colorBlindBlack8[c(2,4,8)])+
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour = "") +
  theme_cowplot(font_size = 12) +
  xlab("Expected (-log10(p-value))") +
  ylab("Observed (-log10(p-value))") +
  theme(legend.position = "bottom")
#guides(colour = guide_legend(override.aes = list(size=3)))

ggsave(
  snakemake@output[[1]],
  plot = p1, width = 120, height = 120, units = "mm", dpi = 300
)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


# Eight-colour palette shared by all plots in this script.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = "arial_bold.ttf")
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])


# Drop every trait whose code contains "BW", "GA_fetal" or "male".
d <- filter(d, !grepl("BW", trait), !grepl("GA_fetal", trait), !grepl("male", trait))

# Trait code -> display label. Any non-missing code that is not listed here is
# labelled "Endometriosis"; a missing code stays NA.
trait_labels <- c(
  miscarriage = "Miscarriage",
  GA_fetal = "GA fetal effect",
  BW_maternal = "Maternal",
  AFB = "Age at first birth",
  AMenarche = "Age at menarche",
  AMenopause = "Age at menopause",
  NLB = "Number of live births",
  Testosterone_fem = "Testosterone (women)",
  SHBG_fem = "SHBG (women)",
  SHBG_male = "SHBG (men)",
  CBAT_fem = "CBAT (women)",
  CBAT_male = "CBAT (men)",
  Oestradiol_fem = "Oestradiol (women)",
  POP = "Pelvic Organ Prolapse",
  Testosterone_male = "Testosterone (men)",
  leiomyoma_uterus = "Leiomyoma uterus",
  BW_fetal = "Fetal",
  BW_fetal_effect = "Fetal only",
  Preeclampsia = "Pre-eclampsia",
  BW_maternal_effect = "Maternal only",
  PCOS = "Polycystic ovary syndrome"
)
label_idx <- match(d$trait, names(trait_labels))
d$trait <- ifelse(
  is.na(d$trait), NA,
  ifelse(is.na(label_idx), "Endometriosis", unname(trait_labels)[label_idx])
)

# Trait groups; anything not listed in the first three is treated as
# sex-hormone related.
pregnancy <- c("Miscarriage", "Pre-eclampsia")
uterus <- c("Leiomyoma uterus", "Pelvic Organ Prolapse", "Endometriosis", "Polycystic ovary syndrome")
fitness <- c("Age at first birth", "Number of live births")
hormonal <- c("Age at menarche", "Age at menopause", "Testosterone (women)", "SHBG (women)", "CBAT (women)", "Oestradiol (women)")

# Assign from lowest to highest priority so that pregnancy wins over uterus,
# which wins over fitness.
cluster <- rep("Sex-hormone related", nrow(d))
cluster[d$trait %in% fitness] <- "Fitness"
cluster[d$trait %in% uterus] <- "Reproductive organs"
cluster[d$trait %in% pregnancy] <- "Pregnancy"
d$cluster <- cluster

# One palette colour per cluster.
cluster_cols <- c(
  "Pregnancy" = colorBlindBlack8[3],
  "Reproductive organs" = colorBlindBlack8[5],
  "Fitness" = colorBlindBlack8[7],
  "Sex-hormone related" = colorBlindBlack8[8]
)
d$colour <- unname(cluster_cols[d$cluster])

# The gene name is the second "_"-separated field of `locus`.
d$GENE <- vapply(strsplit(d$locus, "_"), function(parts) parts[2], character(1))

# Four loci are displayed with a two-gene label.
gene_alias <- c(
  CDC42 = "CDC42/ WNT4",
  HIVEP3 = "HIVEP3/ EDN2",
  TET3 = "TET3/ DGUOK-AS1",
  TCEA2 = "TCEA2/ OPRL1"
)
alias_idx <- match(d$GENE, names(gene_alias))
d$GENE <- ifelse(is.na(alias_idx), d$GENE, unname(gene_alias)[alias_idx])

d$sig <- ifelse(d$PP.H4.abf > 0.5, "*", "")

# Sorting by cluster and freezing the resulting trait order as factor levels
# keeps traits of one cluster adjacent on the x axis.
d <- arrange(d, cluster)

d$trait <- factor(d$trait, levels = unique(d$trait))
traits <- unique(d$trait)
colors <- filter(d, !duplicated(trait)) %>% arrange(trait) %>% pull(colour)

# PP: signed score that is PP.H4 when H4 exceeds H3, otherwise minus the sum of
# PP.H3 and PP.H4. PP2: the larger of the two posteriors.
h4_wins <- d$PP.H4.abf > d$PP.H3.abf
d$PP <- ifelse(h4_wins, d$PP.H4.abf, -d$PP.H3.abf - d$PP.H4.abf)
d$PP2 <- ifelse(h4_wins, d$PP.H4.abf, d$PP.H3.abf)
# Trait x gene grid of square markers. Fill and outline colour follow the
# signed PP score (diverging scale centred on white) and marker size follows
# PP2. Legends are suppressed with guide = "none", the spelling used by the
# other scripts in this file; guide = F is deprecated in current ggplot2.
p1 <- ggplot(d, aes(trait, GENE, value = PP, fill = PP, colour = PP, size = PP2, stroke = 1 - PP)) +
  theme_cowplot(font_size = 9) +
  geom_point(shape = 15) +
  scale_fill_gradient2(
    low = colorBlindBlack8[4], mid = "white", high = colorBlindBlack8[2],
    guide = "none"
  ) +
  scale_colour_gradient2(
    low = colorBlindBlack8[4], mid = "white", high = colorBlindBlack8[2],
    guide = "none"
  ) +
  scale_size_continuous(range = c(1, 2.5), guide = "none") +
  scale_x_discrete(position = "top") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text.x = element_blank()
  ) +
  # Thin grey separators between neighbouring traits and neighbouring genes.
  geom_vline(xintercept = 1:(length(unique(d$trait)) - 1) + 0.5, size = 0.4, colour = "grey") +
  geom_hline(yintercept = 1:(length(unique(d$GENE)) - 1) + 0.5, size = 0.4, colour = "grey") +
  # Thicker separators after the fitness, pregnancy and uterus groups, placed
  # by the cumulative sizes of those three vectors.
  geom_vline(
    xintercept = cumsum(c(length(fitness), length(pregnancy), length(uterus))) + 0.5,
    size = 0.8
  ) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill = NA, size = 1),
    plot.margin = unit(c(0, 0.1, 0.1, 0), "cm"),
    axis.line = element_blank()
  )

# Per-trait counts.
#   Coloc:       loci with PP.H4 > 0.8
#   Locus-level: loci with PP.H3 + PP.H4 > 0.8 that are not already in Coloc
t_count_locus <- group_by(d, trait) %>%
  summarize(
    PP = sum(as.numeric(PP.H4.abf > 0.8)),
    PP_locus = sum(as.numeric(PP.H4.abf + PP.H3.abf > 0.8))
  )
t_count_locus$PP <- t_count_locus$PP_locus - t_count_locus$PP
t_count_locus$supp <- "Locus-level"

t_count <- group_by(d, trait) %>%
  summarize(PP = sum(as.numeric(PP.H4.abf > 0.8)))
t_count$supp <- "Coloc"

t_count <- bind_rows(t_count, t_count_locus)

# Same trait order as the heatmap; the level order of `supp` fixes which
# manual fill colour each bar segment gets.
t_count$trait <- factor(t_count$trait, levels = unique(d$trait))
t_count$supp <- factor(t_count$supp, levels = c("Locus-level", "Coloc"))

# Stacked bars pointing downwards: counts are negated and the y breaks are
# relabelled as positive numbers. The legend is suppressed with
# guide = "none" (guide = F is deprecated in current ggplot2).
p2 <- ggplot(t_count, aes(trait, -PP, fill = supp)) +
  theme_cowplot(font_size = 8) +
  geom_col(alpha = 0.7) +
  geom_hline(yintercept = 0) +
  scale_fill_manual(
    values = c(colorBlindBlack8[4], colorBlindBlack8[2]),
    guide = "none"
  ) +
  theme(
    axis.line = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill = NA, size = 1),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title = element_blank(),
    plot.margin = unit(c(0, 0, 0, 0.1), "cm")
  ) +
  scale_y_continuous(
    limits = c(-10, 0), expand = c(0, 0),
    labels = seq(0, 10, 2), breaks = seq(0, -10, -2)
  ) +
  # Same group separators as in the heatmap.
  geom_vline(
    xintercept = cumsum(c(length(fitness), length(pregnancy), length(uterus))) + 0.5,
    size = 0.8
  ) +
  geom_hline(yintercept = c(-4, -8), size = 0.3, linetype = "dashed", colour = "grey")

# Per-gene counts, defined as for the per-trait counts:
#   Coloc:       traits with PP.H4 > 0.8
#   Locus-level: traits with PP.H3 + PP.H4 > 0.8 that are not already in Coloc
l_count_locus <- group_by(d, GENE) %>%
  summarize(
    PP = sum(as.numeric(PP.H4.abf > 0.8)),
    PP_locus = sum(as.numeric(PP.H4.abf + PP.H3.abf > 0.8))
  )
l_count_locus$PP <- l_count_locus$PP_locus - l_count_locus$PP
l_count_locus$supp <- "Locus-level"

l_count <- group_by(d, GENE) %>%
  summarize(PP = sum(as.numeric(PP.H4.abf > 0.8)))
l_count$supp <- "Coloc"

l_count <- bind_rows(l_count, l_count_locus)

# NOTE(review): this stores a GENE factor in a column named `trait`, which the
# plot below never reads; GENE itself stays a character column and is therefore
# ordered alphabetically on the y axis.
l_count$trait <- factor(l_count$GENE, levels = unique(d$GENE))
l_count$supp <- factor(l_count$supp, levels = c("Locus-level", "Coloc"))

print('done')
# Horizontal stacked bars, one per gene. The legend is suppressed with
# guide = "none" (guide = F is deprecated in current ggplot2).
p3 <- ggplot(l_count, aes(PP, GENE, fill = supp)) +
  theme_cowplot(font_size = 8) +
  geom_col(alpha = 0.7) +
  geom_hline(yintercept = 0) +
  scale_fill_manual(
    values = c(colorBlindBlack8[4], colorBlindBlack8[2]),
    guide = "none"
  ) +
  theme(
    axis.line = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    panel.border = element_rect(colour = "black", fill = NA, size = 1),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title = element_blank(),
    plot.margin = unit(c(0, 0.1, 0, 0), "cm")
  ) +
  scale_x_continuous(
    limits = c(0, 10), expand = c(0, 0),
    labels = seq(0, 10, 2), breaks = seq(0, 10, 2)
  )

# Heatmap with the per-gene bars to its right (output 1) and with the
# per-trait bars underneath (output 2).
x1 <- plot_grid(
  p1, p3,
  nrow = 1, align = "h", rel_widths = c(2, 0.5)
)
x2 <- plot_grid(
  p1, p2,
  nrow = 2, align = "v", rel_heights = c(2, 0.3)
)

# Both figures share the same height in mm.
fig_height <- 127 - 25 - 1

ggsave(
  snakemake@output[[1]],
  plot = x1, width = 127 - 1, height = fig_height, units = "mm", dpi = 300
)
ggsave(
  snakemake@output[[2]],
  plot = x2, width = 103 - 1, height = fig_height, units = "mm", dpi = 300
)

################## Genetic correlations

# Genetic-correlation table (input 2). It is read once and filtered twice; the
# file was previously read from disk a second time for the fetal subset.
rg_all <- fread(snakemake@input[[2]])

# Rows whose p1 path contains "GAraw" are labelled maternal, rows containing
# "GA_fetal" fetal. Traits whose p2 path contains "BW" or "male" are dropped.
d <- filter(rg_all, grepl("GAraw", p1), !grepl("BW", p2), !grepl("male", p2))
#d$p1= 'Gestational duration (maternal)'
d$p1 <- "Maternal"

x <- filter(rg_all, grepl("GA_fetal", p1), !grepl("BW", p2), !grepl("male", p2))
#x$p1= 'Gestational duration (fetal)'
x$p1 <- "Fetal"
d <- rbind(d, x)

# Reduce the p2 path to the trait code: take what follows "LDSC/" and strip
# the sumstats suffix. Operates on the column vector directly instead of
# apply() over a one-column table.
p2_tail <- vapply(strsplit(d$p2, "LDSC/"), function(parts) parts[2], character(1))
d$p2 <- gsub(".txt.sumstats.gz", "", p2_tail)
d$trait <- d$p2

# Trait code -> display label. Any non-missing code that is not listed here is
# labelled "Endometriosis"; a missing code stays NA.
trait_labels <- c(
  miscarriage = "Miscarriage",
  GA_fetal = "GA fetal effect",
  BW_maternal = "Maternal",
  AFB = "Age at first birth",
  AMenarche = "Age at menarche",
  AMenopause = "Age at menopause",
  NLB = "Number of live births",
  Testosterone_fem = "Testosterone (women)",
  SHBG_fem = "SHBG (women)",
  SHBG_male = "SHBG (men)",
  CBAT_fem = "CBAT (women)",
  CBAT_male = "CBAT (men)",
  Oestradiol_fem = "Oestradiol (women)",
  POP = "Pelvic Organ Prolapse",
  Testosterone_male = "Testosterone (men)",
  leiomyoma_uterus = "Leiomyoma uterus",
  BW_fetal = "Fetal",
  BW_fetal_effect = "Fetal only",
  Preeclampsia = "Pre-eclampsia",
  BW_maternal_effect = "Maternal only",
  PCOS = "Polycystic ovary syndrome"
)
label_idx <- match(d$trait, names(trait_labels))
d$trait <- ifelse(
  is.na(d$trait), NA,
  ifelse(is.na(label_idx), "Endometriosis", unname(trait_labels)[label_idx])
)


d <- filter(d, trait != "GA fetal effect")

# Cluster membership uses the `pregnancy`, `uterus` and `fitness` vectors
# defined in the coloc section of this script.
d$cluster <- with(d, ifelse(trait %in% pregnancy, "Pregnancy",
  ifelse(trait %in% uterus, "Reproductive organs",
    ifelse(trait %in% fitness, "Fitness", "Sex-hormone related"))))

# Axis-label colour per cluster.
d$colour <- with(d, ifelse(cluster == "Pregnancy", colorBlindBlack8[3],
  ifelse(cluster == "Reproductive organs", colorBlindBlack8[1],
    ifelse(cluster == "Fitness", colorBlindBlack8[7], colorBlindBlack8[8]))))

d <- arrange(d, cluster)

# Reuse the trait order of the coloc heatmap so both panels line up.
d$trait <- factor(d$trait, levels = traits)

colors <- filter(d, !duplicated(trait)) %>% arrange(trait) %>% pull(colour)


# "**": below 0.05 divided by half the number of rows (maternal and fetal rows
# are both still present at this point); "*": nominal p < 0.05.
d$sig <- ifelse(d$p < 0.05 / (nrow(d) / 2), "**", ifelse(d$p < 0.05, "*", ""))

# Only the maternal rows are plotted.
d <- filter(d, p1 == "Maternal")
d$p1 <- "Gestational duration"

# One-row tile strip of genetic correlations (rg) per trait, with significance
# stars and cluster-coloured trait labels on the top axis. The fill legend is
# suppressed with guide = "none" (guide = F is deprecated in current ggplot2).
rg_plot <- ggplot(d, aes(trait, p1, fill = rg)) +
  geom_tile(colour = "white", size = 1) +
  theme_cowplot(font_size = 9) +
  scale_fill_gradient2(
    low = colorBlindBlack8[2], high = colorBlindBlack8[4], mid = "white",
    guide = "none"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 0),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  scale_x_discrete(position = "top") +
  geom_text(data = d, aes(trait, p1, label = sig), size = 6 / .pt) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_rect(colour = "black", fill = NA, size = 1),
    plot.margin = unit(c(0, 1, 0, 0), "cm"),
    axis.line = element_blank(),
    # One colour per trait level, taken from `colors`.
    axis.text.x = element_text(angle = 45, hjust = 0, colour = colors)
  )


# Stack the rg strip on top of the coloc heatmap `p1` and write output 3.
x2 <- plot_grid(rg_plot, p1, nrow = 2, align = "v", rel_heights = c(0.85, 2))

ggsave(
  snakemake@output[[3]],
  plot = x2, width = 113 - 2.5, height = 127 - 25 - 1, units = "mm", dpi = 300
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

x= fread(snakemake@input[[1]])

x$p1= gsub('.txt.sumstats.gz', '', apply(x[, 'p1'], 1, function(x) unlist(strsplit(x, 'LDscore/'))[2]))
x$p2= gsub('.txt.sumstats.gz', '', apply(x[, 'p2'], 1, function(x) unlist(strsplit(x, 'LDSC/'))[2]))

x1= fread(snakemake@input[[2]])

x1$p1= gsub('.txt.sumstats.gz', '', apply(x1[, 'p1'], 1, function(x) unlist(strsplit(x, 'LDscore/'))[2]))
x1$p2= gsub('.txt.sumstats.gz', '', apply(x1[, 'p2'], 1, function(x) unlist(strsplit(x, 'LDSC/'))[2]))
x1$rg= -1 * x1$rg
d= rbind(x, x1)

#traits= filter(d, p< 0.05/ 14, !grepl('BW', p2), !grepl('GA', p2)) %>% pull(p2)

# Display names for the phenotype codes held in p2.
trait_names= c(
	GAraw= 'Maternal gestational duration',
	miscarriage= 'Miscarriage',
	GA_fetal= 'GA fetal effect',
	BW_maternal= 'Maternal BW',
	AFB= 'Age at first birth',
	AMenarche= 'Age at menarche',
	AMenopause= 'Age at menopause',
	NLB= 'Number of live births',
	Testosterone_fem= 'Testosterone (women)',
	SHBG_fem= 'SHBG (women)',
	SHBG_male= 'SHBG (men)',
	CBAT_fem= 'CBAT (women)',
	CBAT_male= 'CBAT (men)',
	Oestradiol_fem= 'Estradiol (women)',
	POP= 'Pelvic organ prolapse',
	Testosterone_male= 'Testosterone (men)',
	leiomyoma_uterus= 'Leiomyoma uterus',
	BW_fetal= 'Fetal',
	BW_fetal_effect= 'Fetal only',
	Preeclampsia= 'Pre-eclampsia',
	BW_maternal_effect= 'Maternal only',
	PCOS= 'Polycystic ovary syndrome')

shown= unname(trait_names[d$p2])
# Every code that is not in the table is labelled 'Endometriosis'; a missing
# code stays missing.
shown[is.na(shown) & !is.na(d$p2)]= 'Endometriosis'
d$trait= shown

# Drop birth-weight, gestational-age and male-specific phenotypes.
d= d %>% filter(!grepl('BW|GA|_male', p2))

# Order the y axis by p-value: the smallest p becomes the last factor level.
traits= d %>% arrange(p) %>% pull(trait) %>% unique()
d$trait= factor(d$trait, levels= rev(traits))

# Layers shared by both forest plots: point estimate with a 95% interval
# (rg +/- 1.96 * se) per trait, coloured by the p1 column, x axis fixed to
# [-1, 1] with a solid line at 0 and dashed guides every 0.25.
rg_forest_base= ggplot(d, aes(rg, trait, colour= p1)) +
	geom_pointrange(aes(xmax= rg + 1.96 * se, xmin= rg - 1.96 * se), position = position_dodge(width = 0.3), fatten= 1) +
	theme_cowplot(font_size= 9) +
	scale_x_continuous(limits= c(-1, 1), breaks= seq(-1, 1, 0.5)) +
	xlab('Genetic correlation') +
	geom_vline(xintercept= 0, size= 0.3) +
	geom_vline(xintercept= c(seq(-1, 1, 0.25)), colour= 'grey', linetype= 'dashed', alpha= 0.5, size= 0.2) +
	theme(axis.line.x = element_line(size = 0.3),
		axis.line.y = element_line(size = 0.3),
		axis.ticks= element_line(size= 0.3),
		axis.title.y= element_blank())

# Version without a colour legend. guide= 'none' replaces the deprecated
# guide= FALSE.
p1= rg_forest_base +
	scale_colour_manual(values= colorBlindBlack8[c(8,3)], guide= 'none')

ggsave(snakemake@output[[1]], plot= p1, width= 88, height= 120, units= 'mm', dpi= 300)

# Table behind the figure.
fwrite(d, snakemake@output[[2]], sep= '\t')

# Same figure with the colour legend shown.
p1= rg_forest_base +
	scale_colour_manual(values= colorBlindBlack8[c(8,3)], name= 'Trait')

ggsave(snakemake@output[[3]], plot= p1, width= 88, height= 120, units= 'mm', dpi= 300)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
library(scales)
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)
library(fmsb)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# Turn LDSC sumstats paths into bare phenotype codes: keep the part of each
# path that follows `dir_tag` and drop the '.txt.sumstats.gz' suffix. Both
# patterns are matched literally (fixed= TRUE), so the dots in the suffix are
# not treated as regex wildcards. Paths that lack `dir_tag` yield NA.
ldsc_label= function(paths, dir_tag) {
	after_dir= vapply(strsplit(as.character(paths), dir_tag, fixed= TRUE), function(parts) parts[2], character(1))
	gsub('.txt.sumstats.gz', '', after_dir, fixed= TRUE)
}

# Genetic-correlation tables from snakemake inputs 1 and 2, stacked.
x= fread(snakemake@input[[1]])
x$p1= ldsc_label(x$p1, 'LDscore/')
x$p2= ldsc_label(x$p2, 'LDSC/')

x1= fread(snakemake@input[[2]])
x1$p1= ldsc_label(x1$p1, 'LDscore/')
x1$p2= ldsc_label(x1$p2, 'LDSC/')

x= rbind(x, x1)

# Phenotypes with p below 0.05 / 14, excluding the birth-weight and
# gestational-age codes.
traits= unique(filter(x, p< 0.05/ 14, !grepl('BW', p2), !grepl('GA', p2)) %>% pull(p2))

# Significance threshold for the gcp estimates: 0.05 split over the selected traits.
gcp_alpha= 0.05/length(traits)

# LCV results from input 3; an unmodified copy goes to the supplementary
# table, tagged 'Gestational duration'.
d= fread(snakemake@input[[3]])
table_supp= d
table_supp$pheno= 'Gestational duration'

# Estimates that do not pass the threshold are set to 0, then only the
# selected traits are kept, largest gcp first.
d$gcp.pm= with(d, ifelse(pval.gcpzero.2tailed< gcp_alpha, gcp.pm, 0))
d= d %>% filter(repr_pheno %in% traits) %>% arrange(desc(gcp.pm))

# LCV results from input 4, handled the same way and tagged 'Preterm delivery'.
df= fread(snakemake@input[[4]])
table_supp2= df
table_supp2$pheno= 'Preterm delivery' 
table_supp= rbind(table_supp, table_supp2)

df$gcp.pm= with(df, ifelse(pval.gcpzero.2tailed< gcp_alpha, gcp.pm, 0))
df= df %>% filter(repr_pheno %in% traits)

# One row per trait; columns from d get suffix .x, those from df suffix .y.
d= inner_join(d, df, by= 'repr_pheno')
# Display names for the phenotype codes held in repr_pheno.
trait_names= c(
	GAraw= 'Maternal gestational duration',
	miscarriage= 'Miscarriage',
	GA_fetal= 'GA fetal effect',
	BW_maternal= 'Maternal BW',
	AFB= 'Age at first birth',
	AMenarche= 'Age at menarche',
	AMenopause= 'Age at menopause',
	NLB= 'Number of live births',
	Testosterone_fem= 'Testosterone (women)',
	SHBG_fem= 'SHBG (women)',
	SHBG_male= 'SHBG (men)',
	CBAT_fem= 'CBAT (women)',
	CBAT_male= 'CBAT (men)',
	Oestradiol_fem= 'Oestradiol (women)',
	POP= 'Pelvic Organ Prolapse',
	Testosterone_male= 'Testosterone (men)',
	leiomyoma_uterus= 'Leiomyoma uterus',
	BW_fetal= 'Fetal',
	BW_fetal_effect= 'Fetal only',
	Preeclampsia= 'Pre-eclampsia',
	BW_maternal_effect= 'Maternal only',
	PCOS= 'Polycystic ovary syndrome')

shown= unname(trait_names[d$repr_pheno])
# Every code that is not in the table is labelled 'Endometriosis'; a missing
# code stays missing.
shown[is.na(shown) & !is.na(d$repr_pheno)]= 'Endometriosis'
d$trait= shown

d$repr_pheno= d$trait

# One row per outcome, one column per trait. gcp.pm.x comes from the left
# table of the join (the one tagged 'Gestational duration' when it was read),
# gcp.pm.y from the right one ('Preterm delivery').
x= as.data.frame(matrix(d$gcp.pm.x, ncol= nrow(d)))
x=rbind(x, as.data.frame(matrix(d$gcp.pm.y, ncol= nrow(d))))

names(x)= d$repr_pheno
# Row names follow the row order above (they were previously swapped and one
# carried a trailing space).
rownames(x)= c('Gestational duration', 'Preterm delivery')
# fmsb::radarchart reads the first two rows as the axis maximum and minimum.
x= rbind(rep(1,nrow(d)) , rep(0,nrow(d)) , x)

inches= 25.4

# Two-line versions of the axis labels. The labels are looked up from the
# column names of x, so each axis is labelled with the trait it actually
# shows whatever order the columns arrive in; a trait without a two-line
# version keeps its plain name.
wrapped_labels= c('Testosterone (women)'= 'Testosterone\n(women)',
	'Age at first birth'= 'Age at\nfirst birth',
	'Age at menopause'= 'Age at\nmenopause',
	'Number of live births'= 'Number of\nlive births',
	'SHBG (women)'= 'SHBG\n(women)',
	'CBAT (women)'= 'CBAT\n(women)')
axis_labels= unname(ifelse(names(x) %in% names(wrapped_labels), wrapped_labels[names(x)], names(x)))

# 88 mm square PDF (device size is given in inches).
pdf(snakemake@output[[1]], width= 88 / inches, height= 88 / inches)
par(mar=c(0,0,0,0))


radarchart(abs(x), axistype= 0,

    #custom polygon
    pcol= c(colorBlindBlack8[3], colorBlindBlack8[8]) , pfcol= c(alpha(colorBlindBlack8[3], 0.4), alpha(colorBlindBlack8[8], 0.4)) , plwd=1, pty= 16, plty= 1, vlcex= 0.8, vlabels= axis_labels,
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="#525252", caxislabels= seq(0, 1, 0.25), cglwd=0.8, calcex= 0.4

    #custom labels
    )

dev.off()

# Display names for the phenotype codes in the supplementary table.
supp_trait_names= c(
	GAraw= 'Maternal gestational duration',
	miscarriage= 'Miscarriage',
	GA_fetal= 'GA fetal effect',
	BW_maternal= 'Maternal BW',
	AFB= 'Age at first birth',
	AMenarche= 'Age at menarche',
	AMenopause= 'Age at menopause',
	NLB= 'Number of live births',
	Testosterone_fem= 'Testosterone (women)',
	SHBG_fem= 'SHBG (women)',
	SHBG_male= 'SHBG (men)',
	CBAT_fem= 'CBAT (women)',
	CBAT_male= 'CBAT (men)',
	Oestradiol_fem= 'Oestradiol (women)',
	POP= 'Pelvic Organ Prolapse',
	Testosterone_male= 'Testosterone (men)',
	leiomyoma_uterus= 'Leiomyoma uterus',
	BW_fetal= 'Fetal',
	BW_fetal_effect= 'Fetal only',
	Preeclampsia= 'Pre-eclampsia',
	BW_maternal_effect= 'Maternal only',
	PCOS= 'Polycystic ovary syndrome')

supp_shown= unname(supp_trait_names[table_supp$repr_pheno])
# Every code that is not in the table is labelled 'Endometriosis'; a missing
# code stays missing.
supp_shown[is.na(supp_shown) & !is.na(table_supp$repr_pheno)]= 'Endometriosis'
table_supp$trait= supp_shown

# Full LCV results for both outcomes, tab-separated.
fwrite(table_supp, snakemake@output[[2]], sep= '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)


# Enrichment table from snakemake input 1; the first line is the header.
# The argument is spelled out: `h` relied on partial matching and `T` can be
# reassigned.
d= fread(snakemake@input[[1]], header= TRUE)


colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Tissue names that make up each organ class.
female_repr= c('breast', "cervix, uterine", 'endometrium', 'ovary', 'placenta', 'vagina', 'fallopian tube')
male_repr= c('ductus deferens', 'testis', 'seminal vesicle', 'prostate', 'epididymis')
muscle= c('smooth muscle', 'heart muscle', 'skeletal muscle')

# Start from 'Others' and overwrite from lowest to highest priority, so a
# tissue listed in several sets ends up in the same class as before
# (female reproductive > male reproductive > muscle).
organ_class= rep('Others', nrow(d))
organ_class[d$tissue %in% muscle]= 'Muscle'
organ_class[d$tissue %in% male_repr]= 'Male reproductive'
organ_class[d$tissue %in% female_repr]= 'Female reproductive'
d$organ= organ_class

# Colour per organ class. The values are named so a class keeps its colour
# even when another class is absent from the data.
organ_colours= c('Female reproductive'= colorBlindBlack8[3],
	'Male reproductive'= colorBlindBlack8[2],
	'Muscle'= colorBlindBlack8[8],
	'Others'= 'grey')

# Enrichment (i_listmedian / base_list_median) against -log10 p for every
# tissue in d. Tissues with p < 0.05 are labelled; dashed lines mark 0.05 and
# 0.05 / nrow(d).
#   font_size   base font size handed to theme_cowplot
#   show_legend whether the colour legend is drawn
enrichment_scatter= function(font_size, show_legend) {
	ggplot(d, aes(-log10(MannW_pvalue), I(i_listmedian/ base_list_median), colour= organ)) +
		geom_point(size= 1.5) +
		theme_cowplot(font_size= font_size) +
		scale_colour_manual('Legend', values= organ_colours, guide= if (show_legend) 'legend' else 'none') +
		# show.legend is the supported spelling; show_guide was ignored with a
		# warning, which left text glyphs in the legend.
		geom_text_repel(data= filter(d, MannW_pvalue< 0.05), aes(label= tissue), fontface = 'bold', show.legend = FALSE) +
		geom_vline(xintercept= -log10(0.05), colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
		geom_vline(xintercept= -log10(0.05/nrow(d)), colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
		ylab('Enrichment') +
		xlab('-log10(pvalue)')
}

# Figure without legend, smaller font.
p1= enrichment_scatter(font_size= 8, show_legend= FALSE)

ggsave(snakemake@output[[1]], plot= p1, width= 120, height= 90, units= 'mm', dpi= 300)

# Figure with legend, larger font.
p1= enrichment_scatter(font_size= 10, show_legend= TRUE)

ggsave(snakemake@output[[2]], plot= p1, width= 120, height= 90, units= 'mm', dpi= 300)
11
12
script:
	'manhattan_plot.R'
25
26
script:
	'lm_effect_origin.R'
37
38
script:
	'effect_origin_dendrogram.R'
48
49
script:
	'effect_origin_ternary.R'
64
65
script:
	'gene_based_vs_coloc_iPSC.R'
74
75
script:
	'BW_coloc_spider.R'
87
88
script:
        'KCNAB1_pheWAS.R'
101
102
script:
	'ADCY5_pheWAS.R'
118
119
script:
	'ADCY5_FST_AFR_EUR.R'
129
130
script:
	'BW_genetic_correlations.R'
143
144
script:
        'repr_pheno_correlations.R'
153
154
script:
	'partitioned_h2.R'
164
165
script:
	'MacArthurlab_enrichment.R'
180
181
script:
	'ADCY5_effect_direction.R'
193
194
script:
	'BW_conditioning.R'
207
208
script:
        'BW_conditioning_top.R'
218
219
script:
        'mediation_BW_GA_individual_level_data.R'
230
231
script:
        'mediation_BW_GA_individual_level_data_decode.R'
245
246
script:
	'MR_GA_BW_haplotype.R'
257
258
script:
	'repr_pheno_coloc.R'
270
271
script:
	'repr_pheno_LCV.R'
282
283
script:
	'repr_pheno_correlations.R'
292
293
script:
	'RNA_enrichment.R'
305
306
script:
	'QQ_plot.R'
316
317
script:
	'h2_allphenos.R'
326
327
script:
	'h2_cohorts.R'
335
336
script:
        'genet_correlations_meta.R'
346
347
script:
	'manhattan_plot_postTerm.R'
357
358
script:
        'manhattan_plot_postTerm.R'
368
369
script:
	'forest_plot_EEFSEC.R'
380
381
script:
	'MR_sex_hormones_GA.R'
391
392
script:
	'cell_type_enrichment.R'
402
403
script:
	'labor_deg.R'
413
414
script:
	'coloc_sex_hormones.R'
423
424
script:
	'evo.R'
432
433
script:
	'GA_BW_PGS_correlations.R'
445
446
script:
	'GAraw_vs_allPTD.R'
11
12
13
14
15
run:
	# Headerless gene table -> four-column table with header, without the rows
	# whose gene symbol contains a space.
	columns= ['CHR', 'start', 'end', 'geneSymbol', 'Ensembl_gene']
	genes= pd.read_csv(input[0], sep= '\t', header= None, names= columns)
	has_space= genes.geneSymbol.str.contains(' ')
	genes= genes.loc[~has_space, ['CHR', 'start', 'end', 'geneSymbol']]
	genes.to_csv(output[0], sep= '\t', header= True, index= False)
24
25
26
27
28
29
run:
	# Write the 'Individual ID' of every sample from the listed populations,
	# one per line, without header.
	samples= pd.read_csv(input[0], sep='\t', header= 0)
	in_pops= samples.Population.isin(['CEU', 'TSI', 'FIN', 'GBR', 'IBS'])
	ids= samples.loc[in_pops, ['Individual ID']]
	ids.to_csv(output[0], sep= '\t', header= False, index= False)
37
38
39
run:
	# Inputs whose path contains '1000g'.
	# NOTE(review): `vcfs` is never used below - the command concatenates every
	# {input}; confirm whether {vcfs} was intended.
	vcfs= [x for x in input if '1000g' in x]
	# Concatenate the input files with bcftools; -Oz requests compressed VCF output.
	shell('/home/pol/software/bcftools-1.9/bcftools concat {input} -o {output[0]} -Oz')
52
53
shell:
	'/home/pol/software/plink2 --vcf {input[0]} --max-alleles 2 --keep {input[1]} --make-bed --out {params[0]}'
65
66
67
68
69
70
71
72
73
74
75
run:
	# Rewrite the variant IDs of a six-column, headerless variant table as
	# CHR:POS:allele:allele with the two allele codes in sorted order.
	d= pd.read_csv(input[0], sep= '\t', header= None, names= ['CHR', 'RSID', 'cm', 'POS', 'A1', 'A2'])
	len_a1= d.A1.str.len()
	len_a2= d.A2.str.len()
	# The longer allele of a pair is coded 'I', its partner 'D'; the four
	# assignments depend on each other and must stay in this order.
	d['REF']= np.where(len_a1 > len_a2, 'I', d.A1)
	d['EFF']= np.where(len_a2 > len_a1, 'I', d.A2)
	d['REF']= np.where(d.EFF== 'I', 'D', d.REF)
	d['EFF']= np.where(d.REF== 'I', 'D', d.EFF)
	chr_pos= d.CHR.apply(str) + ':' + d.POS.apply(str) + ':'
	d['RSID']= np.where(d.REF > d.EFF, chr_pos + d.EFF + ':' + d.REF, chr_pos + d.REF + ':' + d.EFF)
	# The original allele columns are written back unchanged; only RSID differs.
	out_columns= ['CHR', 'RSID', 'cm', 'POS', 'A1', 'A2']
	d[out_columns].to_csv(output[0], sep= '\t', header= False, index= False)
	# The two companion files are moved as they are.
	shell('mv {input[1]} {output[1]}')
	shell('mv {input[2]} {output[2]}')
83
84
85
86
87
88
89
90
run:
	# Two-column SNP / p table taken from the gzipped summary statistics.
	d= pd.read_csv(input[0], header= 0, sep= '\t', compression= 'gzip', usecols= ['ID', 'pvalue'])
	# Rows without an ID, or with the placeholder '-', are dropped.
	d.dropna(subset= ['ID'], inplace= True)
	d= d.loc[d.ID != '-', ['ID', 'pvalue']].copy()
	d.columns= ['SNP', 'p']
	# '^23:' is a regular expression anchored at the start of the ID.
	# regex= True is stated explicitly because pandas >= 2.0 treats the
	# pattern as a literal string by default, which would leave the IDs unchanged.
	d['SNP']= d.SNP.str.replace('^23:', 'X:', regex= True)
	d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['SNP', 'p'])
105
106
shell:
	'/home/pol/software/gcta_1.93.2beta/gcta64 --bfile {params[0]} --maf 0.01 --fastBAT {input[1]} --fastBAT-gene-list {input[2]} --out {params[1]} --thread-num {threads}'
12
13
14
15
run:
	# Four-column table (SNP, CHR, POS, P) taken from the summary statistics.
	sumstats= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'RSID', 'pvalue'])
	sumstats= sumstats[['RSID', 'CHR', 'POS', 'pvalue']].rename(columns= {'RSID': 'SNP', 'pvalue': 'P'})
	sumstats.to_csv(output[0], header= True, index= False, sep= '\t')
23
24
25
26
run:
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS'])
        d.sort_values(['CHR', 'POS'], inplace= True)
        d['pos2']= d.POS
38
39
40
41
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d= d.loc[d.Relationship== 'unrel', :]
	pop= ['CEU', 'TSI', 'FIN', 'GBR', 'IBS']
54
55
56
run:
	# Inputs whose path contains 'vcf'; input[0] is handed to bcftools through -R.
	vcfs= list(filter(lambda path: 'vcf' in path, input))
	shell('/home/pol/software/bcftools-1.9/bcftools concat -a -O v -R {input[0]} {vcfs} -o {output[0]}')
68
69
shell:
	'/home/pol/software/plink --vcf {input[0]} --keep {input[1]} --make-bed -out {params[0]}'
77
78
79
80
81
run:
	# List the variants whose ID occurs more than once in the six-column,
	# headerless variant table.
	d= pd.read_csv(input[0], sep= '\t', header= None, names= ['chr', 'snp', 'x1', 'pos', 'a1', 'a2'])
	d= d[d.duplicated(['snp'], keep=False)]
	# Keep one row per duplicated ID. The result has to be assigned:
	# drop_duplicates returns a new frame and previously its output was discarded.
	d= d.drop_duplicates(subset= ['snp'], keep= 'first')
	d.to_csv(output[0], sep= '\t', header= False, index= False)
94
95
shell:
	'~/software/plink --bfile {params[0]} --clump {input[0]} --exclude {input[1]} --clump-r2 0.05 --clump-kb 1000 --clump-p1 5e-8 --clump-p2 1e-5 --out {params[1]}'
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# GWAS summary statistics (input 1), restricted to the columns used below.
gwas_cols= c('ID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF')
df= fread(snakemake@input[[1]], select= gwas_cols)
# Minor allele frequency: the smaller of EAF and 1 - EAF.
df$MAF= pmin(df$EAF, 1 - df$EAF)

# Second table (input 2); every row gets the fixed sample size 716.
qtl= fread(snakemake@input[[2]])
qtl$n= 716

# Keep the variants present in both tables.
df= inner_join(df, qtl, by= 'ID')

rm(qtl)

pph_outfile= snakemake@output[[1]]
results_outfile= snakemake@output[[2]]

# Header lines of the two result files. When the first output path contains
# 'sQTL' both headers carry an extra trailing 'gene' column.
if (grepl('sQTL', pph_outfile)) {
	pph_header= 'nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\tgene\n'
	results_header= 'snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\tgene\n'
} else {
	pph_header= 'nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n'
	results_header= 'snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n'
}

# Both files are created here holding only the header; rows are appended later.
cat(pph_header, file = pph_outfile)
cat(results_header, file= results_outfile)

# Prior probabilities passed to coloc.abf as p1, p2 and p12.
prior1= 1 * 10^-4
prior2= 1 * 10^-4
prior12= 5 * 10^-6


df= data.frame(df)

# Colocalization of the GWAS signal with the second trait for one gene.
#
# temp_df holds the joined rows of a single gene (upper-case BETA/SE columns
# from the GWAS, lower-case beta/se/n from the second table). The summary
# posterior probabilities are appended to pph_outfile and the per-SNP results
# to results_outfile, each tagged with the gene in a `protein` column. When
# there are no rows, or coloc.abf fails, an all-zero placeholder row is
# appended to both files instead.
colocalization_eqtl= function(temp_df){
	protein= unique(temp_df$gene)

	# Append the all-zero placeholder rows for this gene.
	write_placeholder= function() {
		PPH= data.frame(nsnps= 0, PP.H0.abf= 0,PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, protein= protein)
		fwrite(PPH, pph_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
		res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, protein= protein)
		fwrite(res, results_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
		print('next')
	}

	if (nrow(temp_df)== 0) {
		return(write_placeholder())
	}

	# Only variants with a positive standard error in both data sets.
	temp_df = filter(temp_df, SE>0, se> 0)

	# The GWAS is analysed as case-control, with a fixed case fraction, when
	# the input path names allPTD or postTerm, and as quantitative otherwise.
	gwas_path= snakemake@input[[1]]
	case_fraction= NULL
	if (grepl('allPTD', gwas_path)) {
		case_fraction= 0.067
	} else if (grepl('postTerm', gwas_path)) {
		case_fraction= 0.122
	}

	data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N=temp_df$TOTALSAMPLESIZE, type= if (is.null(case_fraction)) 'quant' else 'cc', snp= temp_df$ID)
	if (!is.null(case_fraction)) {
		data1$s= case_fraction
	}
	data2= list(beta= temp_df$beta, varbeta= temp_df$se**2, N=temp_df$n, type= 'quant', snp= temp_df$ID)

	# Any error inside coloc.abf is turned into the scalar 0, recognised below
	# by its length.
	myres= tryCatch(
		suppressWarnings(coloc.abf(data1, data2, MAF=temp_df$MAF, p1= prior1, p2= prior2, p12= prior12)),
		error= function(e) 0)

	if (length(myres)==1 ) {
		return(write_placeholder())
	}

	# First element: summary posteriors; second element: per-SNP table.
	PPH= data.frame(t(myres[[1]]))
	PPH$protein= protein
	fwrite(PPH, pph_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
	res= myres[[2]]
	res$protein= protein
	fwrite(res, results_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
}



# Run the colocalization once per gene, with mc.cores= 3.
per_gene= split(df, df$gene)
mclapply(per_gene, colocalization_eqtl, mc.cores= 3)
15
16
script:
	'coloc_iPSC.R'
SnakeMake From line 15 of iPSC/Snakefile
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
library(dplyr)
library(data.table)

# Two summary-statistics tables; rows without a Z value are dropped from each.
d= fread(snakemake@input[[1]]) %>% filter(!is.na(Z))
x= fread(snakemake@input[[2]]) %>% filter(!is.na(Z))

# Third table, joined on SNP; it supplies the L2 column used by the LCV call.
ld= fread(snakemake@input[[3]])

# Variants present in all three tables. Columns shared by the two
# summary-statistics tables get the suffixes .x (input 1) and .y (input 2).
d= d %>%
	inner_join(x, by= 'SNP') %>%
	inner_join(ld, by= 'SNP')


# Load the LCV implementation (params 1) and move to the directory given in params 2.
source(snakemake@params[[1]])
setwd(snakemake@params[[2]])

# The .y columns (input 2) are passed before the .x columns (input 1).
LCV= RunLCV(d$L2, d$Z.y, d$Z.x, ldsc.intercept= 1, n.1= (d$N.y), n.2= (d$N.x))

# One-row result table, tagged with the two wildcards.
z= data.frame(zscore= LCV$zscore, pval.gcpzero.2tailed= LCV$pval.gcpzero.2tailed, gcp.pm= LCV$gcp.pm, gcp.pse= LCV$gcp.pse, rho.est= LCV$rho.est, rho.err= LCV$rho.err, pval.fullycausal1= LCV$pval.fullycausal[1],pval.fullycausal2= LCV$pval.fullycausal[2], h2.zscore1= LCV$h2.zscore[1], h2.zscore2= LCV$h2.zscore[2], pheno= snakemake@wildcards[['pheno']], repr_pheno= snakemake@wildcards[['repr_pheno']])

# fwrite writes the header itself from the column names of z. The separate
# cat() of the same header line was dropped because fwrite (append= FALSE by
# default) overwrote it immediately.
fwrite(z, snakemake@output[[1]], sep= '\t')
13
14
script:
	'LCV.R'
SnakeMake From line 13 of LCV/Snakefile
22
23
24
25
26
	shell:
		'''
		head -1 {input[0]} > {output[0]}
                tail -n +2 -q {input} >> {output[0]}
		'''
SnakeMake From line 22 of LCV/Snakefile
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
import pandas as pd
import numpy as np
from scipy.special import chdtri
import gzip
import csv

def not_number(s):
	"""Return True when `s` cannot be converted to a float, False otherwise.

	None and objects that float() rejects outright (lists, dicts, ...) count
	as "not a number" instead of raising.
	"""
	if s is None:
		return True
	try:
		float(s)
	except (TypeError, ValueError):
		return True
	return False


def select_format(repr_pheno, row):
	"""Parse `row` with the formatting function that belongs to `repr_pheno`.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue] as produced by
	that function.

	Raises ValueError when no formatting function is assigned to `repr_pheno`
	(previously this ended in an UnboundLocalError at the return statement).
	"""
	if repr_pheno== 'Preeclampsia':
		fields= preeclampsia(row)
	elif repr_pheno== 'POP':
		fields= POP(row)
	elif repr_pheno== 'miscarriage':
		fields= miscarriage(row)
	elif repr_pheno== 'GA_fetal':
		fields= fet_GA(row)
	elif repr_pheno== 'BW_maternal':
		fields= BW_maternal(row)
	elif repr_pheno== 'BW_fetal':
		fields= BW_fetal(row)
	elif repr_pheno== 'BW_maternal_effect':
		fields= BW_maternal_adjusted_effect(row)
	elif repr_pheno== 'BW_fetal_effect':
		fields= BW_fetal_adjusted_effect(row)
	elif repr_pheno== 'leiomyoma_uterus':
		fields= leiomyoma_uterus(row)
	elif repr_pheno== 'AMenopause':
		fields= AMenopause(row)
	elif repr_pheno in ['Oestradiol_fem', 'NLB', 'AFB', 'AMenarche', 'endometriosis']:
		fields= UKBB_traits(row)
	elif repr_pheno in ['SHBG_fem', 'Testosterone_fem', 'Testosterone_male', 'SHBG_male', 'CBAT_fem', 'CBAT_male']:
		fields= pritchard(row)
	elif repr_pheno == 'PCOS':
		fields= PCOS(row)
	elif repr_pheno in ['Ruth_CBAT_female', 'Ruth_CBAT_male', 'Ruth_SHBG_female', 'Ruth_SHBG_male', 'Ruth_Testosterone_female', 'Ruth_Testosterone_male', 'Ruth_oestradiol']:
		fields= Ruth(row, repr_pheno)
	else:
		raise ValueError('No formatting function is defined for phenotype: ' + str(repr_pheno))
	# Unpacking checks that the formatter returned exactly ten fields.
	rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue= fields
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def AMenopause(row):
	"""Parse one row of the REPROGEN age-at-menopause summary statistics.

	Chromosome 'X' is coded 23, alleles are upper-cased and rsid is left empty.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	allele_freq= float(row['EAF'])
	chrom= int(23 if row['CHR']== 'X' else row['CHR'])
	position= int(row['POS'])
	other_allele= row['Other_Allele'].upper()
	effect_allele= row['Effect_Allele'].upper()
	effect= float(row['Effect'])
	p= float(row['Pval'])
	std_err= float(row['SE'])
	n_samples= int(row['N'])
	return ['', chrom, position, allele_freq, n_samples, other_allele, effect_allele, effect, std_err, p]

def Ruth(row, repr_pheno):
	"""Parse one row of the 'Ruth_*' sex-hormone summary statistics.

	The files carry no sample-size column, so N is looked up from `repr_pheno`;
	any phenotype that is not in the table gets 206927, as before.
	Chromosome 'X' is coded 23.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	# The original nested np.where chain tested the misspelled
	# 'Ruth_SHBG_make' and 'Ruth_SHBG_male' twice, so the three male traits
	# got the wrong N. The values are paired female/male in the order they
	# were listed.
	# NOTE(review): confirm these against the source publication.
	sample_sizes= {
		'Ruth_SHBG_female': 189473,
		'Ruth_SHBG_male': 180726,
		'Ruth_Testosterone_female': 230454,
		'Ruth_Testosterone_male': 194453,
		'Ruth_CBAT_female': 188507,
		'Ruth_CBAT_male': 178782,
		'Ruth_oestradiol': 206927,
	}
	EAF= float(row['effect_allele_frequency'])
	CHR= row['chromosome']
	if CHR== 'X': CHR= 23
	CHR= int(CHR)
	POS= int(row['base_pair_location'])
	REF= row['other_allele']
	EFF= row['effect_allele']
	BETA= float(row['beta'])
	pvalue= float(row['p_value'])
	SE= float(row['standard_error'])
	# Plain int instead of a zero-dimensional numpy array.
	N= sample_sizes.get(repr_pheno, 206927)
	rsid= row['variant_id']
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]

def pritchard(row):
	"""Parse one row of the sex-hormone summary statistics handled by this formatter.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or a list of
	ten zeros when the chromosome, BETA, SE or P is not numeric.
	Chromosome 'X' is coded 23.
	"""
	skipped= [0] * 10
	allele_freq= float(row['A1_FREQ'])
	chrom= 23 if row['#CHROM']== 'X' else row['#CHROM']
	if not_number(chrom):
		return skipped
	position= int(row['POS'])
	chrom= int(chrom)
	ref_allele= row['REF']
	alt_allele= row['ALT']
	n_obs= int(row['OBS_CT'])
	# Checked in the order BETA, SE, P; the first non-numeric field ends the row.
	if any(not_number(row[field]) for field in ('BETA', 'SE', 'P')):
		return skipped
	effect= float(row['BETA'])
	std_err= float(row['SE'])
	p= float(row['P'])
	return [row['ID'], chrom, position, allele_freq, n_obs, ref_allele, alt_allele, effect, std_err, p]

def leiomyoma_uterus(row):
	"""Parse one row of the uterine leiomyoma summary statistics.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or a list of
	ten zeros when the chromosome is not numeric. Chromosome 'X' is coded 23,
	rsid is left empty and N is passed through exactly as read.
	"""
	allele_freq= float(row['EAF'])
	chrom= 23 if row['CHR']== 'X' else row['CHR']
	if not_number(chrom):
		return [0] * 10
	position= int(row['POS'])
	chrom= int(chrom)
	ref_allele= row['REF']
	eff_allele= row['EFF']
	n_samples= row['TOTALSAMPLESIZE']
	effect= float(row['beta'])
	std_err= float(row['se'])
	p= float(row['pvalue'])
	return ['', chrom, position, allele_freq, n_samples, ref_allele, eff_allele, effect, std_err, p]

def preeclampsia(row):
	"""Parse one row of the pre-eclampsia summary statistics.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or a list of
	ten zeros when the chromosome is not numeric. Chromosome 'X' is coded 23,
	alleles are upper-cased and N is the constant 4630 + 373345.
	"""
	chrom= 23 if row['CHR']== 'X' else row['CHR']
	if not_number(chrom):
		return [0] * 10
	position= int(row['POS'])
	chrom= int(chrom)
	ref_allele= row['REF'].upper()
	eff_allele= row['EFF'].upper()
	n_samples= 4630 + 373345
	variant_id= row['rsid']
	effect= float(row['beta'])
	std_err= float(row['se'])
	allele_freq= float(row['EAF'])
	p= float(row['pvalue'])
	return [variant_id, chrom, position, allele_freq, n_samples, ref_allele, eff_allele, effect, std_err, p]

def BW_fetal_adjusted_effect(row):
	"""Parse one row of the birth-weight fetal-effect summary statistics.

	Alleles are upper-cased and an allele coded 'R' becomes 'D'; chromosome 'X'
	is coded 23; N comes from the n_ownBW column.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	allele_freq= float(row['eaf'])
	chrom= int(23 if row['chr']== 'X' else row['chr'])
	position= int(row['pos'])
	non_effect= row['nea'].upper()
	non_effect= 'D' if non_effect== 'R' else non_effect
	effect_allele= row['ea'].upper()
	effect_allele= 'D' if effect_allele== 'R' else effect_allele
	effect= float(row['beta'])
	p= float(row['p'])
	std_err= float(row['se'])
	n_samples= int(row['n_ownBW'])
	return [row['RSID'], chrom, position, allele_freq, n_samples, non_effect, effect_allele, effect, std_err, p]

def BW_maternal_adjusted_effect(row):
	"""Parse one row of the birth-weight maternal-effect summary statistics.

	Alleles are upper-cased and an allele coded 'R' becomes 'D'; chromosome 'X'
	is coded 23; N comes from the n_offBW column.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	allele_freq= float(row['eaf'])
	chrom= int(23 if row['chr']== 'X' else row['chr'])
	position= int(row['pos'])
	non_effect= row['nea'].upper()
	non_effect= 'D' if non_effect== 'R' else non_effect
	effect_allele= row['ea'].upper()
	effect_allele= 'D' if effect_allele== 'R' else effect_allele
	effect= float(row['beta'])
	p= float(row['p'])
	std_err= float(row['se'])
	n_samples= int(row['n_offBW'])
	return [row['RSID'], chrom, position, allele_freq, n_samples, non_effect, effect_allele, effect, std_err, p]


def BW_maternal(row):
	"""Parse one row of the maternal birth-weight summary statistics.

	Alleles keep their case; an allele that is exactly 'R' becomes 'D';
	chromosome 'X' is coded 23; the variant ID is read from the SNP column.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	allele_freq= float(row['eaf'])
	chrom= int(23 if row['chr']== 'X' else row['chr'])
	position= int(row['pos'])
	non_effect= 'D' if row['nea']== 'R' else row['nea']
	effect_allele= 'D' if row['ea']== 'R' else row['ea']
	effect= float(row['beta'])
	p= float(row['p'])
	std_err= float(row['se'])
	n_samples= int(row['n'])
	return [row['SNP'], chrom, position, allele_freq, n_samples, non_effect, effect_allele, effect, std_err, p]

def BW_fetal(row):
	"""Parse one row of the fetal birth-weight summary statistics.

	Alleles keep their case; an allele that is exactly 'R' becomes 'D';
	chromosome 'X' is coded 23; the variant ID is read from the rsid column.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	allele_freq= float(row['eaf'])
	chrom= int(23 if row['chr']== 'X' else row['chr'])
	position= int(row['pos'])
	non_effect= 'D' if row['nea']== 'R' else row['nea']
	effect_allele= 'D' if row['ea']== 'R' else row['ea']
	effect= float(row['beta'])
	p= float(row['p'])
	std_err= float(row['se'])
	n_samples= int(row['n'])
	return [row['rsid'], chrom, position, allele_freq, n_samples, non_effect, effect_allele, effect, std_err, p]


def PCOS(row):
	"""Parse one row of the PCOS summary statistics (excluding 23andme).

	Chromosome 'X' is coded 23, rsid is left empty and the sample size is
	rounded to the nearest integer.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	allele_freq= float(row['EAF'])
	chrom= int(23 if row['CHR']== 'X' else row['CHR'])
	position= int(row['POS'])
	ref_allele= row['REF']
	eff_allele= row['EFF']
	effect= float(row['beta'])
	p= float(row['pvalue'])
	std_err= float(row['se'])
	n_samples= int(round(float(row['TOTALSAMPLESIZE'])))
	return ['', chrom, position, allele_freq, n_samples, ref_allele, eff_allele, effect, std_err, p]

def UKBB_traits(row):
	"""Parse one row of the UKBB trait summary statistics.

	Chromosome, position and both alleles are taken from the colon-separated
	`variant` field. The row is replaced by a list of ten zeros when it is
	flagged low-confidence or when any numeric field cannot be parsed. EAF is
	minor_AF when the minor allele is the effect allele and 1 - minor_AF
	otherwise. rsid is left empty.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	skipped= [0] * 10
	if row['low_confidence_variant']== 'true':
		return skipped
	variant_parts= row['variant'].split(':')
	chrom= variant_parts[0]
	if chrom== 'X':
		chrom= 23
	position= variant_parts[1]
	numeric_fields= [row['minor_AF'], chrom, position, row['beta'], row['pval'], row['se'], row['n_complete_samples']]
	if any(not_number(value) for value in numeric_fields):
		return skipped
	chrom= int(chrom)
	position= int(position)
	ref_allele= variant_parts[2]
	eff_allele= variant_parts[3]
	effect= float(row['beta'])
	p= float(row['pval'])
	std_err= float(row['se'])
	n_samples= int(row['n_complete_samples'])
	minor_freq= float(row['minor_AF'])
	allele_freq= minor_freq if row['minor_allele']== eff_allele else 1- minor_freq
	return ['', chrom, position, allele_freq, n_samples, ref_allele, eff_allele, effect, std_err, p]

def AP_repr(row):
	"""Parse one row of BOLT-LMM summary statistics.

	A2 is taken as the reference allele and A1 as the effect allele; chromosome
	'X' is coded 23; N is passed through exactly as read.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	allele_freq= float(row['EAF'])
	chrom= int(23 if row['CHR']== 'X' else row['CHR'])
	position= int(row['POS'])
	ref_allele= row['A2']
	eff_allele= row['A1']
	effect= float(row['Beta'])
	p= float(row['P'])
	std_err= float(row['se'])
	n_samples= row['N']
	return [row['SNP'], chrom, position, allele_freq, n_samples, ref_allele, eff_allele, effect, std_err, p]


def POP(row):
	"""Parse one row of the pelvic organ prolapse summary statistics.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or a list of
	ten zeros when the chromosome is not a plain number or the minor allele
	frequency is below 0.005. rsid is left empty.
	"""
	# Only all-digit chromosome labels pass; 'X' is rejected here, so the
	# former "'X' -> 23" conversion further down could never run and was removed.
	# NOTE(review): confirm that excluding chromosome X is intended.
	if not row['CHR'].isdigit(): return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	EAF= float(row['EAF'])
	# Same value as the former np.where on a scalar, as a plain float.
	MAF= 1 - EAF if EAF > 0.5 else EAF
	if MAF < 0.005: return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	CHR= int(row['CHR'])
	POS= int(row['POS'])
	REF= row['REF']
	EFF= row['EFF']
	BETA= float(row['BETA'])
	pvalue= float(row['pvalue'])
	SE= float(row['SE'])
	N= float(row['N'])
	rsid= ''
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]

def fet_GA(row):
	"""Parse one row of the fetal gestational-duration summary statistics.

	The file has no allele-frequency column, so EAF is returned as an empty
	string. Chromosome 'X' is coded 23 and alleles are upper-cased. The input
	row is left untouched (the chromosome used to be rewritten inside `row`).
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	"""
	EAF= ''
	CHR= row['Chr']
	if CHR== 'X': CHR= 23
	CHR= int(CHR)
	POS= int(row['Pos'])
	REF= row['Non_effect_allele'].upper()
	EFF= row['Effect_allele'].upper()
	BETA= float(row['Effect'])
	pvalue= float(row['P'])
	SE= float(row['StdErr'])
	N= int(row['N'])
	rsid= row['Rsid']
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]

def miscarriage(row):
	'''Map one miscarriage summary-statistics row onto the common field order.

	row: mapping with the keys Freq1, MarkerName ('CHR:POS:...'), Allele1,
	Allele2, Effect, P-value and StdErr.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue] with an empty
	rsid. EAF is passed through as read (not converted), chromosome 'X' is
	recoded to 23, alleles are upper-cased and N is a fixed total.
	'''
	freq= row['Freq1']
	marker_fields= row['MarkerName'].split(':')
	chrom= marker_fields[0]
	chrom= 23 if chrom== 'X' else int(chrom)
	position= int(marker_fields[1])
	other_allele= row['Allele2'].upper()
	effect_allele= row['Allele1'].upper()
	effect= float(row['Effect'])
	p= float(row['P-value'])
	stderr= float(row['StdErr'])
	# Sample size is not in the file; the same constant sum is used for every row.
	sample_size= 49996 + 174109
	return ['', chrom, position, freq, sample_size, other_allele, effect_allele, effect, stderr, p]


def format_list(input, output):
	'''Convert one gzipped summary-statistics file to the harmonised tab-separated layout.

	input: path to a gzip-compressed, space- or tab-delimited text file with a header row.
	output: path of the tab-separated file to write (overwritten if present).

	Each row is translated by select_format() for the phenotype named by the
	repr_pheno wildcard. Rows whose CHR comes back as 0 are skipped. Alleles
	longer than one character are recoded to 'I' with the other allele set to
	'D'. Alleles are then put in alphabetical order; when that swaps them,
	BETA is negated and EAF becomes 1 - EAF.
	'''
	with gzip.open(input, 'rt', newline='') as f:
		print(input)
		# Detect the delimiter (space or tab) from the header line, then rewind.
		dialect = csv.Sniffer().sniff(f.readline(), delimiters= ' \t')
		f.seek(0)
		input_file= csv.DictReader(f, dialect= dialect)
		# One output handle for the whole run: rows are streamed straight to
		# disk instead of being buffered and the file reopened every 1000 rows.
		with open(output, 'w', newline= '') as csvfile:
			writer = csv.writer(csvfile, delimiter= '\t')
			writer.writerow(['ID', 'rsid', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])
			for row in input_file:
				rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue= select_format(snakemake.wildcards.repr_pheno, row)
				if CHR== 0: continue
				# Collapse multi-base alleles to an insertion/deletion code pair.
				if len(REF) >1: REF= 'I'
				if len(EFF) >1: EFF= 'I'
				if REF== 'I': EFF= 'D'
				if EFF== 'I': REF= 'D'
				if REF> EFF:
					ID= str(CHR) + ':' + str(POS) + ':' + EFF + ':' + REF
					BETA= -1 * float(BETA)
					ref= EFF
					eff= REF
					# Formats without an allele frequency report EAF as '';
					# keep it blank instead of failing on float('').
					if EAF!= '': EAF= 1 - float(EAF)
				else:
					ID= str(CHR) + ':' + str(POS) + ':' + REF + ':' + EFF
					BETA= float(BETA)
					eff= EFF
					ref= REF
				writer.writerow([ID, rsid, CHR, POS, EAF, N, ref, eff, BETA, SE, pvalue])


# Script entry point: convert the rule's first input file into its first output file.
format_list(snakemake.input[0], snakemake.output[0])
8
9
script:
	'format_sumstats.py'
21
22
23
24
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d.columns= ['ID', 'SNP', 'CHR', 'POS', 'EAF', 'N', 'A2', 'A1', 'BETA', 'SE', 'pvalue']
	d.dropna(subset= ['pvalue'], axis= 0, inplace= True)
53
54
55
56
57
58
59
60
61
62
63
64
65
        shell:
                """
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
		conda activate ldsc
                python2 /home/pol/software/ldsc/munge_sumstats.py \
                --out {params[0]} \
		--merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
                --sumstats {input[0]} \
                --chunksize 500000
                conda deactivate
                set -eu
                """
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
	run:
                allfiles= [infile for infile in input if wildcards.pheno not in infile]
                allfiles= ','.join(allfiles)
                outfile= params[0] + wildcards.pheno + '_rg'
                infile= input[0]
                shell("""
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --rg {infile},{allfiles} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {outfile}
                conda deactivate
                set -eu
                """)
101
102
103
104
105
106
run:
        # Copy the part of the log that follows the 'Summary of Genetic
        # Correlation Results' line, leaving out the last three lines.
        marker= 'Summary of Genetic Correlation Results\n'
        with open(input[0], 'r') as log_handle:
                log_lines= log_handle.readlines()
        first_kept= log_lines.index(marker) + 1
        with open(output[0], 'w') as out_handle:
                out_handle.writelines(log_lines[first_kept:-3])
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
run:
	allfiles= [infile for infile in input if wildcards.repr_pheno not in infile]
	allfiles= ','.join(allfiles)
	outfile= params[0] + wildcards.repr_pheno + '_rg'
	infile= input[0]
	shell("""
	set +eu
	source /home/pol/miniconda3/etc/profile.d/conda.sh
	conda activate ldsc
	python2 /home/pol/software/ldsc/ldsc.py \
	--rg {infile},{allfiles} \
	--ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
	--w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
	--out {outfile}
	conda deactivate
	set -eu
	""")
141
142
143
144
145
146
run:
        with open(input[0], 'r') as f:
                x= f.readlines()
        x= x[x.index('Summary of Genetic Correlation Results\n')+1:-3]
        with open(output[0], 'w') as f:
                f.write(''.join(x))
154
155
156
157
158
159
160
run:
	# Stack all whitespace-delimited input tables and write them as one
	# tab-separated file with a single header.
	df_list= list()
	for i in input:
		# sep= r'\s+' is the documented replacement for the deprecated
		# delim_whitespace= True and parses the files the same way.
		d= pd.read_csv(i, sep= r'\s+', header= 0)
		df_list.append(d)
	d= pd.concat(df_list)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
172
173
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
211
212
213
214
215
216
217
218
219
220
221
222
223
shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --out {params[0]} \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
run:
        allfiles= [infile for infile in input if 'BW_maternal_effect' not in infile]
        allfiles= ','.join(allfiles)
        outfile= params[0] + 'BW_maternal_effect_rg'
        infile= input[0]
        shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --rg {infile},{allfiles} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {outfile}
        conda deactivate
        set -eu
        """)
261
262
263
264
265
266
267
268
269
270
271
272
273
	shell:
		"""
		set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {params[0]}
                conda deactivate
                set -eu
		"""
281
282
283
284
285
286
run:
        with open(input[0], 'r') as f:
                x= f.readlines()
        x= x[x.index('Summary of Genetic Correlation Results\n')+1:-3]
        with open(output[0], 'w') as f:
                f.write(''.join(x))
297
298
299
300
301
302
303
304
305
306
307
308
309
	shell:
		"""
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {params[0]}
                conda deactivate
                set -eu
                """
317
318
319
320
321
322
323
324
325
326
327
328
	run:
                # Collect one (cohort, h2, se) row per input log and write them as a single table.
                df_list= list()
                for infile in input:
                        with open(infile, 'r') as f:
                                # Only lines starting with 'Total Observed' are used; the first one is parsed.
                                # NOTE(review): lines[0] raises IndexError if a log has no such line.
                                lines= [line.strip() for line in f if line.startswith('Total Observed')]
                                # Fifth space-separated token is read as the estimate,
                                # the text inside the parentheses as its standard error.
                                h2= float(lines[0].split(' ')[4])
                                se= float(lines[0].split('(')[1].replace(')', ''))
                                # NOTE(review): the cohort name is taken from a fixed path depth (11th
                                # '/'-separated component); presumably the file name - confirm if paths move.
                                cohort= infile.split('/')[10].replace('_h2.log', '')
                                d= pd.DataFrame({'cohort': cohort, 'h2': h2, 'se': se}, index= [0])
                                df_list.append(d)
                d= pd.concat(df_list)
                d.to_csv(output[0], sep= '\t', header= True, index= False)
17
18
19
20
run:
        # Read the nine columns needed downstream and relabel them by name.
        # usecols does not control column order (columns come back in file
        # order), so a positional d.columns= assignment could silently attach
        # the wrong labels; rename() is independent of the file layout.
        # Mapping: RSID -> SNP, TOTALSAMPLESIZE -> N, EFF -> A1, REF -> A2.
        d= pd.read_csv(input[0], sep= '\t', header= 0, compression= 'gzip', usecols= ['RSID', 'CHR', 'POS', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])
        d.rename(columns= {'RSID': 'SNP', 'TOTALSAMPLESIZE': 'N', 'EFF': 'A1', 'REF': 'A2'}, inplace= True)
        # Drop every row that has a missing value in any of the kept columns.
        d.dropna(axis= 0, inplace= True)
40
41
42
43
44
45
46
47
48
49
50
51
52
        shell:
                """
		set +eu
		source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/munge_sumstats.py \
		--merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
                --out {params[0]} \
                --sumstats {input[0]} \
		--chunksize 500000
                conda deactivate
		set -eu
                """
63
64
65
run:
	allfiles= [infile for infile in input if wildcards.pheno not in infile]
	allfiles= ','.join(allfiles)
88
89
90
91
92
93
run:
	with open(input[0], 'r') as f:
		x= f.readlines()
	x= x[x.index('Summary of Genetic Correlation Results\n')+1:-3]
	with open(output[0], 'w') as f:
		f.write(''.join(x))
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
	shell:
		'''
		set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
		python2 /home/pol/software/ldsc/ldsc.py \
		--h2 {input[0]}\
		--ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD. \
		--w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
		--overlap-annot\
		--frqfile-chr /home/pol/software/ldsc/baseline/1000G_Phase3_frq/1000G.EUR.QC.\
		--out {params[0]}
		conda deactivate
                set -eu
		'''
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
        shell:
                '''
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
		cd /home/pol/software/ldsc/cts/
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2-cts {input[0]}\
                --ref-ld-chr-cts {params[1]} \
                --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
		--ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD.\
                --out {params[0]}
                conda deactivate
                set -eu
                '''
156
157
	run:
                d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])[['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']]
185
186
187
188
189
190
191
192
193
194
195
196
197
shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --out {params[0]} \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """
209
210
211
212
213
214
215
216
217
218
219
220
221
	shell:
                """
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {params[0]}
                conda deactivate
                set -eu
                """
229
230
231
232
233
234
235
236
237
238
239
240
run:
        df_list= list()
        for infile in input:
                with open(infile, 'r') as f:
                        lines= [line.strip() for line in f if line.startswith('Total Observed')]
                        h2= float(lines[0].split(' ')[4])
                        se= float(lines[0].split('(')[1].replace(')', ''))
                        cohort= infile.split('/')[9].replace('_h2.log', '')
                        d= pd.DataFrame({'cohort': cohort, 'h2': h2, 'se': se}, index= [0])
                        df_list.append(d)
        d= pd.concat(df_list)
        d.to_csv(output[0], sep= '\t', header= True, index= False)
252
253
254
255
256
257
258
259
	run:
                d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])[['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']]
                d['SNP']= d.SNP.str.replace(':SNP', '')
                d['SNP']= d.SNP.str.replace(':INDEL', '')
                d['CHR']= d.CHR.apply(str)
                d.columns= ['ID', 'CHR', 'POS', 'N', 'A2', 'A1', 'BETA', 'SE', 'pvalue']
                d.dropna(axis= 0, inplace= True)
                d['CHR']= d.CHR.apply(str)
281
282
283
284
285
286
287
288
289
290
291
292
293
shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --out {params[0]} \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """
304
305
306
307
308
309
310
311
312
313
314
315
316
shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --h2 {input[0]} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {params[0]}
        conda deactivate
        set -eu
        """
324
325
326
327
328
329
330
331
332
333
334
335
run:
	df_list= list()
	for infile in input:
		with open(infile, 'r') as f:
			lines= [line.strip() for line in f if line.startswith('Total Observed')]
			h2= float(lines[0].split(' ')[4])
			se= float(lines[0].split('(')[1].replace(')', ''))
			cohort= infile.split('/')[10].replace('_allPTD.log', '')
			d= pd.DataFrame({'cohort': cohort, 'h2': h2, 'se': se}, index= [0])
			df_list.append(d)
	d= pd.concat(df_list)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
345
346
347
348
349
350
351
352
353
354
355
356
357
	shell:
                """
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {params[0]}
                conda deactivate
                set -eu
                """
365
366
run:
	df_list= list()
387
388
389
390
391
run:
	x= pd.read_csv(input[0], sep= '\t', header= 0)
	d= pd.read_csv(input[1], sep= '\t', header= 0)
	d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
	d[['CHR', 'POS', 'REF', 'EFF', 'INDELS']]= d['MarkerName'].str.split(':', expand= True)
416
417
418
419
420
421
422
423
424
425
426
427
428
shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --out {params[0]} \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
run:
        allfiles= [infile for infile in input if wildcards.PTD_metas not in infile]
        allfiles= ','.join(allfiles)
        outfile= params[0] + wildcards.PTD_metas + '_rg'
        infile= input[0]
        shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --rg {infile},{allfiles} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {outfile}
        conda deactivate
        set -eu
        """)
464
465
466
467
468
469
470
471
472
473
474
run:
	with open(input[0], 'r') as f:
		x= f.readlines()
	x= x[x.index('Summary of Genetic Correlation Results\n')+1:-3]
	with open(output[0], 'w') as f:
		f.write(''.join(x))
	with open(input[1], 'r') as f:
		x= f.readlines()
	x= x[x.index('Summary of Genetic Correlation Results\n')+2:-3]
	with open(output[0], 'a') as f:
		f.write(''.join(x))
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
run:
	# Correlate the first input against every input whose path does not
	# contain 'individual_cohorts/<big5 wildcard>'.
	allfiles= [infile for infile in input if 'individual_cohorts/' + wildcards.big5 not in infile]
	allfiles= ','.join(allfiles)
	print(allfiles)
	# The previously computed 'outfile' local was never used: the command
	# below writes to {params[0]}, so the dead assignment has been removed.
	infile= input[0]
	shell("""
	set +eu
	source /home/pol/miniconda3/etc/profile.d/conda.sh
	conda activate ldsc
	python2 /home/pol/software/ldsc/ldsc.py \
	--rg {infile},{allfiles} \
	--ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
	--w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
	--out {params[0]}
	conda deactivate
	set -eu
	""")
513
514
515
516
517
518
519
520
521
522
523
524
run:
	# Merge the result tables of all input logs into one file. The first log
	# starts one line after the marker and truncates the output; later logs
	# start two lines after it (presumably skipping a repeated header line)
	# and are appended. The last three lines of every log are left out.
	marker= 'Summary of Genetic Correlation Results\n'
	for idx, log_path in enumerate(input):
		with open(log_path, 'r') as log_handle:
			log_lines= log_handle.readlines()
		offset, mode= (1, 'w') if idx== 0 else (2, 'a')
		kept= log_lines[log_lines.index(marker) + offset:-3]
		with open(output[0], mode) as out_handle:
			out_handle.write(''.join(kept))
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d['Allele1']= d['Allele1'].str.upper()
	d['Allele2']= d['Allele2'].str.upper()
	d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
	d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
	d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
	d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
	d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d['pvalue']= d['pvalue'].astype(str).astype(float)
	d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
	d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
	d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
	rs= pd.read_csv(input[1], sep= '\t', header=0)
	rs.columns= ['ID', 'RSID']
564
565
566
567
568
569
570
571
572
573
574
575
576
run:
        # Restrict the summary statistics (input[1]) to positions present in the
        # reference table (input[0]) and write them with A1/A2/N column names.
        x= pd.read_csv(input[0], sep= '\t', header= 0)
        d= pd.read_csv(input[1], sep= '\t', header= 0)
        # Recode chromosome 'X' as 23 so CHR can be cast to integer.
        d['CHR']= np.where(d['CHR']== 'X', '23', d['CHR'])
        d['POS']= d['POS'].astype(str).astype(int)
        d['CHR']= d['CHR'].astype(str).astype(int)
        # NOTE(review): rows with missing values are dropped only after the integer
        # casts above, so a missing CHR/POS would fail the cast first - confirm inputs are complete.
        d.dropna(axis= 0, inplace= True)
        # Inner merge on chromosome and position (reference column BP).
        d= pd.merge(d, x[['CHR', 'SNP', 'BP']], left_on= ['CHR', 'POS'], right_on= ['CHR', 'BP'])
        # Exclude chromosome 6 positions strictly between 28477797 and 33448354
        # (presumably the MHC/HLA region - confirm).
        d= d.loc[~((d.CHR==6) & (d.POS >28477797) & (d.POS< 33448354)), :]
        # Column selection fixes the order, so the positional relabel below is safe:
        # RSID -> SNP, EFF -> A1, REF -> A2, TOTALSAMPLESIZE -> N.
        d= d[['CHR', 'POS', 'RSID', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']]
        d.columns= ['CHR', 'POS', 'SNP', 'A1', 'A2', 'N', 'EAF', 'BETA', 'SE', 'pvalue']
        d.drop_duplicates(['CHR', 'POS', 'A1', 'A2'], keep= 'first', inplace= True)
        d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['SNP', 'CHR', 'POS', 'N', 'A2', 'A1', 'BETA', 'SE', 'pvalue'])
588
589
590
591
592
593
594
595
596
597
598
599
600
shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --out {params[0]} \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """
611
612
613
614
615
616
617
618
619
620
621
622
623
run:
        shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --rg {input[0]},{input[1]} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {params[0]}
        conda deactivate
        set -eu
        """)
637
638
639
640
641
642
643
644
run:
	d= pd.read_table(input[0], sep= '\t', header= 0)
	d['kbid']= d.kbid.str.split('.', expand= True)[0]
	d['Cell_type']= d.Cell_type.str.replace(' ', '-')
	for k, g in d[d['Cell_type'].isin(set(d.Cell_type))].groupby('Cell_type'):
		g.to_csv(params[0] + k + '.txt', header= False, sep= '\t', columns= ['kbid'], index= False)
	d.drop_duplicates('kbid', inplace= True, keep= 'first')
	d.to_csv(output[-1], sep= '\t', header= False, index= False, columns= ['kbid'])
655
656
657
run:
	shell("""
	set +eu
681
682
683
run:
	shell("""
	set +eu
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
	shell:
                '''
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]}\
                --ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD.,{params[1]}. \
                --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
                --overlap-annot \
                --frqfile-chr /home/pol/software/ldsc/baseline/1000G_Phase3_frq/1000G.EUR.QC. \
                --out {params[0]} \
		--thin-annot
                conda deactivate
                set -eu
                '''
736
737
738
739
740
741
742
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d= d.loc[d.Category== 'L2_1', :]
	x= pd.read_csv(input[1], sep= '\t', header= None, names= ['Gene'])
	d['n_genes']= x.shape[0]
	d['Category']= wildcards.cell_types
	d.to_csv(output[0], sep= '\t', header= True, index= False)
750
751
752
753
754
shell:
	'''
	head -1 {input[0]} > {output[0]}
	tail -n +2 -q {input} >> {output[0]}
	'''
766
767
run:
	shell("""
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
	run:

                shell("""
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --l2 \
                --bfile {params[0]} \
                --ld-wind-cm 1 \
                --annot {input[0]} \
                --out {params[1]} \
                --print-snps {input[1]} \
                --thin-annot
                """)
823
824
825
826
827
828
run:
        if wildcards.cell_types!= 'overall':
                d= pd.DataFrame({'V1': [wildcards.cell_types], 'V2': ','.join(params)})
                d.to_csv(output[0], sep= '\t', header= False, index= False)
        else:
                open(output[0], 'a').close()
836
837
shell:
        'cat {input} > {output[0]}'
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
        shell:
                '''
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2-cts {input[0]}\
                --ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD. \
		--ref-ld-chr-cts {input[1]} \
                --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
                --overlap-annot \
                --frqfile-chr /home/pol/software/ldsc/baseline/1000G_Phase3_frq/1000G.EUR.QC. \
                --out {params[0]} \
                --thin-annot
                conda deactivate
                set -eu
                '''
10
11
shell:
	'grep -v {wildcards.allPTD_coh} {input[0]} | sed -e "s/to_replace/{params[0]}/g" > {output[0]}'
38
39
run:
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1', 'P-value'])
59
60
shell:
        'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
run:
	d= pd.read_csv(input[0], sep= '\t', header=0)
	d['Allele1']= d['Allele1'].str.upper()
	d['Allele2']= d['Allele2'].str.upper()
	d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
	d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal']]
	d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue', 'HetISq', 'HetPVal']
	d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
	d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d['pvalue']= d['pvalue'].astype(str).astype(float)
	d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
	d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
	d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
run:
        # Pick independent top variants: keep genome-wide significant hits
        # (p < 5e-8), then walk each chromosome from the smallest p-value and
        # keep a variant only if no already-kept variant lies within 1.5 Mb.
        d= pd.read_csv(input[0], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene', 'ID'])
        # .copy() gives an independent frame so the in-place operations below
        # do not act on a slice of d.
        df= d.loc[d.pvalue< 5*10**-8, :].copy()
        df.sort_values(by= 'pvalue', ascending= True, inplace= True)
        df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
        df_list= list()
        for chrom in set(df.CHR):
                d_temp= df.loc[df.CHR== chrom, :]
                positions= d_temp.POS.values
                for pos in positions:
                        if pos in d_temp.POS.values:
                                df_list.append(d_temp.loc[d_temp.POS== pos, :])
                                # Remove the kept variant and everything within 1.5 Mb of it.
                                d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
                        else:
                                continue
        # pd.concat raises on an empty list; with no significant variant, fall
        # back to the (empty) filtered frame so a header-only file is written.
        x= pd.concat(df_list) if df_list else df.copy()
        x['pos1']= x.POS - 1.5*10**6
        x['pos2']= x.POS + 1.5*10**6
        x['CHR']= x.CHR.astype(str)
        # Chromosome 23 is written back as 'X'.
        x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
        x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene', 'ID', 'pvalue'])
10
11
shell:
	'grep -v {wildcards.GAraw_coh} {input[0]} | sed -e "s/to_replace/{params[0]}/g" > {output[0]}'
38
39
run:
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1', 'P-value'])
59
60
shell:
        'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
run:
	d= pd.read_csv(input[0], sep= '\t', header=0)
	d['Allele1']= d['Allele1'].str.upper()
	d['Allele2']= d['Allele2'].str.upper()
	d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
	d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal']]
	d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue', 'HetISq', 'HetPVal']
	d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
	d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d['pvalue']= d['pvalue'].astype(str).astype(float)
	d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
	d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
	d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
run:
        # Pick independent top variants: keep genome-wide significant hits
        # (p < 5e-8), then walk each chromosome from the smallest p-value and
        # keep a variant only if no already-kept variant lies within 1.5 Mb.
        d= pd.read_csv(input[0], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene', 'ID'])
        # .copy() gives an independent frame so the in-place operations below
        # do not act on a slice of d.
        df= d.loc[d.pvalue< 5*10**-8, :].copy()
        df.sort_values(by= 'pvalue', ascending= True, inplace= True)
        df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
        df_list= list()
        for chrom in set(df.CHR):
                d_temp= df.loc[df.CHR== chrom, :]
                positions= d_temp.POS.values
                for pos in positions:
                        if pos in d_temp.POS.values:
                                df_list.append(d_temp.loc[d_temp.POS== pos, :])
                                # Remove the kept variant and everything within 1.5 Mb of it.
                                d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
                        else:
                                continue
        # pd.concat raises on an empty list; with no significant variant, fall
        # back to the (empty) filtered frame so a header-only file is written.
        x= pd.concat(df_list) if df_list else df.copy()
        x['pos1']= x.POS - 1.5*10**6
        x['pos2']= x.POS + 1.5*10**6
        x['CHR']= x.CHR.astype(str)
        # Chromosome 23 is written back as 'X'.
        x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
        x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene', 'ID', 'pvalue'])
11
12
13
14
15
16
17
run:
	# Write one 'chr<CHR>:<POS>' marker per position with its smallest p-value.
	d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS', 'pvalue', 'nearestGene'], compression= 'gzip')
	# Sorting by p-value first makes keep= 'first' retain the smallest one per position.
	d.sort_values(['pvalue'], ascending=True, inplace= True)
	d.drop_duplicates(['CHR', 'POS'], inplace= True, keep= 'first')
	d['ID']= 'chr' + d.CHR.apply(str) + ':' + d.POS.apply(str)
	# Rename by name rather than by position: usecols does not control the
	# column order, so a positional relabel could export the wrong column
	# under 'P-value' if the file layout differs from what was assumed.
	d.rename(columns= {'pvalue': 'P-value', 'ID': 'MarkerName'}, inplace= True)
	d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['MarkerName', 'P-value'])
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
run:
	# Pick independent top variants: keep genome-wide significant hits
	# (p < 5e-8), then walk each chromosome from the smallest p-value and
	# keep a variant only if no already-kept variant lies within 1.5 Mb.
	df= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS', 'pvalue', 'nearestGene'], compression= 'gzip')
	# .copy() gives an independent frame so the in-place operations below
	# do not act on a slice of the full table.
	df= df.loc[df.pvalue< 5*10**-8, :].copy()
	df.sort_values(by= 'pvalue', ascending= True, inplace= True)
	df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
	df_list= list()
	for chrom in set(df.CHR):
		d_temp= df.loc[df.CHR== chrom, :]
		positions= d_temp.POS.values
		for pos in positions:
			if pos in d_temp.POS.values:
				df_list.append(d_temp.loc[d_temp.POS== pos, :])
				# Remove the kept variant and everything within 1.5 Mb of it.
				d_temp= d_temp.loc[(d_temp.POS < pos - (1.5 * 10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
			else:
				continue
	# pd.concat raises on an empty list; with no significant variant, keep
	# the (empty) filtered frame so the following steps still run.
	df= pd.concat(df_list) if df_list else df
	df['CHR']= df.CHR.astype(str)
57
58
59
60
61
62
63
64
65
66
67
run:
	# Draw one LocusZoom plot per row of the locus table (input[0]) using the
	# association file input[1], then keep only page 1 of each PDF.
	if not os.path.exists(params[1]):
		os.makedirs(params[1])
	df= pd.read_csv(input[0], sep= '\t', header= 0)
	for index, row in df.iterrows():
		snp= row['snp']
		# The gene name is wrapped in double quotes so it is passed as a single title value.
		title= '"' + row['nearestGene'] + '"'
		shell('python2 /home/pol/software/locuszoom/bin/locuszoom --metal {input[1]} --refsnp {snp} --flank 250kb --plotonly --no-date --build hg19 --pop EUR --source 1000G_March2012 --prefix {params[0]} title={title} theme=publication')
		outfile= params[1] + 'chr' + str(row['chr']) + '_' + row['nearestGene'] + '.pdf'
		# Expected name of the PDF produced above: '<prefix>_<snp with ":" replaced by "_">.pdf'.
		# NOTE(review): presumably this matches LocusZoom's naming - confirm against its output.
		infile= params[0] + '_' + snp.replace(':', '_') + '.pdf'
		# Extract page 1 into the final location and delete the intermediate PDF.
		shell('qpdf --empty --pages {infile} 1 -- {outfile}; rm {infile}')
80
81
82
run:
	if len(input)== 1:
		shell('cp {input[0]} {output[0]}')
96
97
run:
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS', 'pvalue'], compression= 'gzip')
126
127
128
129
run:
	if not os.path.exists(params[1]):
		os.makedirs(params[1])
	shell('python2 /home/pol/software/locuszoom/bin/locuszoom --metal {input[0]} --refsnp rs9823520 --flank 250kb --plotonly --no-date --build hg19 --pop EUR --source 1000G_March2012 --prefix {params[0]} title="WNT4-GA" theme=publication')
20
21
shell:
	'/home/pol/software/generic-metal/metal {params[0]} >> {output[1]}'
SnakeMake From line 20 of meta/Snakefile
37
38
39
40
41
42
    shell:
        '''
        /home/pol/software/generic-metal/metal {input[0]} >> {output[3]}
        /home/pol/software/generic-metal/metal {input[1]} >> {output[3]}
	/home/pol/software/generic-metal/metal {input[2]} >> {output[3]}
        '''
SnakeMake From line 37 of meta/Snakefile
55
56
57
58
59
shell:
    '''
    /home/pol/software/generic-metal/metal {input[0]} >> {output[2]}
    /home/pol/software/generic-metal/metal {input[1]} >> {output[2]}
    '''
SnakeMake From line 55 of meta/Snakefile
70
71
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
SnakeMake From line 70 of meta/Snakefile
81
82
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
SnakeMake From line 81 of meta/Snakefile
92
93
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
SnakeMake From line 92 of meta/Snakefile
104
105
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'
SnakeMake From line 104 of meta/Snakefile
115
116
117
118
shell:
    '''
    /home/pol/software/generic-metal/metal {input[0]} >> {output[1]}
    '''
SnakeMake From line 115 of meta/Snakefile
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
library(MendelianRandomization)
library(data.table)
library(dplyr)

# Exposure instruments. The file is read the same way for every target; only
# outputs whose path lacks "cluster" get their column names overwritten.
exposure <- fread(snakemake@input[[1]])
if (!grepl('cluster', snakemake@output[[1]])) {
  names(exposure) <- c('ID', 'beta', 'se', 'pvalue', 'trait')
}

# Outcome summary statistics: keep the first row per variant ID, then keep only
# variants present in both tables.
outcome <- fread(snakemake@input[[2]]) %>% filter(!duplicated(ID))
harmonised <- inner_join(exposure, outcome, by = 'ID')


# Run Mendelian randomization for the instruments of a single trait.
#
# temp_df: rows of the joined table for one trait; uses beta/se as the exposure
#          effect and BETA/SE as the outcome effect.
# Returns a data.frame with columns method, estimate, se, lo95, up95, pvalue
# and trait. With more than three instruments every method of mr_allmethods()
# is reported; otherwise only the IVW estimate.
funk <- function(temp_df) {
  mr_data <- mr_input(
    bx = temp_df$beta, bxse = temp_df$se,
    by = temp_df$BETA, byse = temp_df$SE
  )

  if (nrow(temp_df) <= 3) {
    ivw <- mr_ivw(mr_data)
    return(data.frame(
      method = 'IVW', estimate = ivw$Estimate, se = ivw$StdError,
      lo95 = ivw$CILower, up95 = ivw$CIUpper, pvalue = ivw$Pvalue,
      trait = unique(temp_df$trait)
    ))
  }

  all_methods <- mr_allmethods(mr_data)$Values
  names(all_methods) <- c('method', 'estimate', 'se', 'lo95', 'up95', 'pvalue')
  all_methods$trait <- unique(temp_df$trait)
  all_methods
}


# One result table per trait, stacked and written as a tab-separated file.
per_trait <- lapply(split(harmonised, harmonised$trait), funk)
fwrite(do.call('rbind', per_trait), snakemake@output[[1]], sep = '\t')
16
17
18
19
20
21
22
23
24
25
run:
	d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS', 'pvalue', 'ID'])
	x= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['ID', 'EAF'])
	x= x.loc[((x.EAF>=0.01) & (x.EAF<= 0.99)), :]
	d= d.loc[d.pvalue< 5e-8, :]
	d= d.loc[d.ID.isin(x.ID.values), :]
	d.drop_duplicates('ID', inplace= True)
	if d.shape[0] == 0: 
		open(output[0], 'a').close()
	else:
SnakeMake From line 16 of MR/Snakefile
40
41
42
43
44
45
46
47
48
49
50
# Rewrite variant IDs as CHR:POS:<allele>:<allele> (alleles in sorted order),
# coding length-differing alleles as I/D, and list IDs that occur more than once.
run:
	# Headerless six-column table; column names are assigned here
	# (presumably a plink .bim file — TODO confirm).
	d= pd.read_csv(input[0], sep= '\t', header= None, names= ['CHR', 'SNP', 'x1', 'POS', 'A1', 'A2'])
	# The four recoding steps are order dependent: each one reads the columns as
	# left by the previous step (e.g. the A2 test already sees A1 == 'I').
	d['A1']= np.where(d.A1.str.len() > d.A2.str.len(), 'I', d.A1)
	d['A2']= np.where(d.A1.str.len() < d.A2.str.len(), 'I', d.A2)
	d['A1']= np.where(d.A2== 'I', 'D', d.A1)
	d['A2']= np.where(d.A1== 'I', 'D', d.A2)
	# Put the two alleles in string-sorted order so the ID does not depend on
	# which allele was listed first.
	d['SNP']= np.where(d.A1>d.A2, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.A2 + ':' + d.A1, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.A1 + ':' + d.A2)
	d.to_csv(output[0], sep= '\t', header= False, index= False)
	# output[1]: one line per ID that appears on more than one row.
	d= d[d.duplicated(['SNP'], keep= False)]
	d.drop_duplicates('SNP', inplace= True, keep= 'first')
	d.to_csv(output[1], sep='\t', columns= ['SNP'], index= False, header= False)
SnakeMake From line 40 of MR/Snakefile
64
65
run:
        shell('~/software/plink --bim {input[2]} --bed {input[3]} --fam {input[4]} --clump {input[0]} --exclude {input[1]} --clump-r2 0.001 --clump-kb 1000 --clump-p1 5e-8 --clump-p2 1e-5 --out {params[1]} || true')
75
76
77
78
79
80
81
82
83
84
85
run:
	if os.stat(input[1]).st_size == 0:
		open(output[0], "w").close
	else:
		d= pd.read_csv(input[0], sep='\t', header= 0, usecols= ['ID', 'BETA', 'SE', 'pvalue'])
		x= pd.read_csv(input[1], delim_whitespace= True, header= 0)
		d= d.loc[d.ID.isin(x.SNP.values), :]
		d= d.groupby('ID').head(1)
		d= d[['ID', 'BETA', 'SE', 'pvalue']]
		d['trait']= wildcards.repr_pheno
		d.to_csv(output[0], sep= '\t', header= False, index= False)
SnakeMake From line 75 of MR/Snakefile
93
94
# Concatenate all per-trait instrument files into output[0].
# NOTE(review): `cat` is given file operands and no `-`, so the echoed header on
# stdin is most likely never written — confirm. Readers of this file elsewhere in
# this workflow assign column names themselves (names(d)= ... / header= None), so
# adding the header would require changing them as well.
shell:
	'echo -e "ID\tbeta\tse\tpvalue_exp\ttrait" | cat {input} > {output[0]}'
SnakeMake From line 93 of MR/Snakefile
104
105
script:
        'MR_reproductive_traits.R'
SnakeMake From line 104 of MR/Snakefile
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        x= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['#CHROM', 'POS', 'ID', 'REF', 'ALT'])
        x.columns= ['CHR', 'POS', 'ID', 'REF', 'ALT']
        df= d.loc[(d.OKG_proxy != 'Signal in 1KG') & (d.OKG_proxy != 'No 1KG proxy'), :]
        okg= df.loc[(df.OKG_Other_allele.str.len() == 1) & (d.OKG_Trait_raising.str.len() == 1), :]
        hm= df.loc[(df.OKG_Other_allele.str.len() != 1) | (d.OKG_Trait_raising.str.len() != 1), :]
        hm['beta']= hm.HM_Weight
        hm['ref']= hm.HM_Other_allele
        hm['eff']= hm.HM_Trait_raising
        hm['RSID']= hm.HM_proxy
        hm['se']= hm.HM_SE_weight
        hm= hm[['RSID', 'beta', 'se', 'ref', 'eff', 'Cluster']]
        okg['beta']= okg.OKG_Weight
        okg['ref']= okg.OKG_Other_allele
        okg['eff']= okg.OKG_Trait_raising
        okg['RSID']= okg.OKG_proxy
        okg['se']= okg.OKG_SE_weight
        okg= okg[['RSID', 'beta', 'se', 'ref', 'eff', 'Cluster']]
        d= d.loc[(d.OKG_proxy == 'Signal in 1KG') | (d.OKG_proxy == 'No 1KG proxy'), :]
        d['beta']= d.Weight
        d['ref']= d.Other_allele
        d['eff']= d.Trait_raising
        d['RSID']= d.Signal
        d['se']= d.SE_weight
        d= d[['RSID', 'beta', 'se', 'ref', 'eff', 'Cluster']]
        d= pd.concat([d, hm, okg])
        d= pd.merge(d, x, left_on= ['RSID'], right_on= 'ID')
        d= d.loc[(d.ALT== d.ref) | (d.REF== d.ref), :]
        d= d.loc[(d.ALT== d.eff) | (d.REF== d.eff), :]
        d['beta']= np.where(d.ref > d.eff, -1 * d.beta, d.beta)
        d['ID']= np.where(d.ref > d.eff, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.eff + ':' + d.ref, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.ref + ':' + d.eff)
        d['trait']= np.where(d.Cluster== 'Female SHBG cluster', 'SHBG_fem_cluster', 'Testosterone_fem_cluster')
        d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['ID', 'beta', 'se', 'trait'])
SnakeMake From line 115 of MR/Snakefile
158
159
script:
        'MR_reproductive_traits.R'
SnakeMake From line 158 of MR/Snakefile
168
169
170
171
172
173
174
run:
	x= pd.read_csv(input[0], sep= '\t', header= None, names= ['ID', 'beta', 'se', 'pvalue', 'trait'])
	x= x.loc[((x.trait== 'SHBG_fem') | (x.trait== 'Testosterone_fem') | (x.trait== 'CBAT_fem')), :]
	x.drop_duplicates(subset= 'ID', inplace= True)
	x[['CHR', 'POS', 'REF', 'EFF']]= x.ID.str.split(':', expand= True)
	x['CHR']= np.where(x.CHR== 'X', '23', x.CHR)
	x.to_csv(output[0], sep= '\t', header= False, index= False, columns= ['CHR', 'POS', 'POS', 'ID'])
SnakeMake From line 168 of MR/Snakefile
186
187
shell:
	'~/software/plink2 --bfile {params[0]} --extract bed1 {input[0]} --memory 5000 --threads {threads} --make-bed --out {params[1]}'
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# Rewrite variant IDs as CHR:POS:<allele>:<allele> (alleles in sorted order, X
# coded as 23), list IDs that occur more than once, and move the two companion
# files (input[1], input[2]) to output[1], output[2].
run:
        # Headerless six-column table; column names are assigned here
        # (presumably a plink .bim file — TODO confirm).
        d= pd.read_csv(input[0], sep= '\t', header= None, names= ['CHR', 'SNP', 'x1', 'POS', 'A1', 'A2'])
        # Order-dependent I/D recoding: each step reads the columns as left by the
        # previous one.
        d['A1']= np.where(d.A1.str.len() > d.A2.str.len(), 'I', d.A1)
        d['A2']= np.where(d.A1.str.len() < d.A2.str.len(), 'I', d.A2)
        d['A1']= np.where(d.A2== 'I', 'D', d.A1)
        d['A2']= np.where(d.A1== 'I', 'D', d.A2)
        d['CHR']= d.CHR.apply(str)
        d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
        d['SNP']= np.where(d.A1>d.A2, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.A2 + ':' + d.A1, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.A1 + ':' + d.A2)
        d.to_csv(output[0], sep= '\t', header= False, index= False)
        d= d[d.duplicated(['SNP'], keep= False)]
        d.drop_duplicates('SNP', inplace= True, keep= 'first')
        # NOTE(review): unlike the otherwise identical block in this Snakefile, this
        # call passes neither index= False nor header= False, so the file also gets
        # the row index and a 'SNP' header line — confirm this is intended.
        d.to_csv(output[3], sep='\t', columns= ['SNP'])
        shell('mv {input[1]} {output[1]}')
        shell('mv {input[2]} {output[2]}')
SnakeMake From line 200 of MR/Snakefile
225
226
shell:
        '~/software/plink --bfile {params[0]} --r square --out {params[1]}'
240
241
script:
	'MVMR.R'
SnakeMake From line 240 of MR/Snakefile
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
library(data.table)
library(dplyr)


hrc= fread(snakemake@input[[1]])


# Harmonise and quality-filter one cohort summary-statistics file.
#
# infile: path to a cohort file with (at least) CHR, POS, EFF, REF, BETA, SE,
#         pvalue, EAF and N columns.
# Uses the global `hrc` reference table (joined on ID, provides `eaf`) and
# writes the filtered table to snakemake@output[[1]].
#
# Steps: keep 0 < pvalue < 1; drop rows whose reported p-value differs by more
# than 10% (-log10 scale) from the one implied by BETA/SE; build sorted-allele
# IDs; join the reference; orient BETA/EAF to the sorted allele order; apply
# MAF, minor-allele-count and reference-frequency filters; keep the most
# significant row per ID.
funk <- function(infile) {
  d <- fread(infile)

  print(paste('Filtering file: ', infile))

  d <- arrange(d, CHR, POS, EFF, REF)

  d <- filter(d, pvalue < 1, pvalue > 0)

  # Two-sided p-value implied by the Wald statistic.
  d$pval <- pnorm(-abs(d$BETA / d$SE)) * 2

  d <- filter(d, (abs(-log10(pvalue) - -log10(pval)) / -log10(pval)) * 100 <= 10)

  d$ID <- with(d, ifelse(REF > EFF, paste(CHR, POS, EFF, REF, sep = ':'), paste(CHR, POS, REF, EFF, sep = ':')))

  d$SNP <- with(d, ifelse(grepl('I', ID), paste(ID, 'INDEL', sep = ':'), paste(ID, 'SNP', sep = ':')))

  print(str(d))

  d <- inner_join(d, hrc, by = 'ID')
  # Fall back to the reference frequency when the cohort did not report one.
  d$EAF <- ifelse(is.na(d$EAF), d$eaf, d$EAF)

  # Orient effect and frequency to the sorted allele order, then swap the
  # allele columns themselves (both use the pre-swap REF/EFF).
  d$BETA <- ifelse(d$REF > d$EFF, -1 * d$BETA, d$BETA)
  d$EAF <- ifelse(d$REF > d$EFF, 1 - d$EAF, d$EAF)

  d[d$REF > d$EFF, c("REF", "EFF")] <- d[d$REF > d$EFF, c("EFF", "REF")]

  d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

  d <- filter(d, pvalue > 0, pvalue < 1, MAF >= 0.005, SE > 0)

  # Require an expected minor allele count above 6.
  d <- filter(d, (MAF * 2 * N) > 6)

  d$maf <- ifelse(d$eaf > 0.5, 1 - d$eaf, d$eaf)

  d <- filter(d, abs(maf - MAF) < 0.2)

  # For these cohort/phenotype files, rows whose EAF still disagrees with the
  # reference by more than 0.2 are treated as allele-swapped. The rows to flip
  # are decided ONCE, before EAF is modified: the previous code re-evaluated the
  # condition after flipping EAF, so BETA was not flipped for rows whose
  # frequency had just been corrected.
  if (grepl('GAraw/Viva|GAnrm/Viva|postTerm/HUNT', infile)) {
    flip <- abs(d$eaf - d$EAF) > 0.2
    d$EAF <- ifelse(flip, 1 - d$EAF, d$EAF)
    d$BETA <- ifelse(flip, -1 * d$BETA, d$BETA)
  }

  # Most significant row per variant ID.
  d <- arrange(d, pvalue)
  d <- filter(d, !duplicated(ID))

  d <- select(d, -c(MAF, ID, eaf, pval))

  d$STRAND <- '+'

  #outfile= paste0(snakemake@params[[1]], gsub('_temp.txt', '', unlist(strsplit(infile, '/'))[9]), '.txt')

  fwrite(d, snakemake@output[[1]], sep = '\t')
}


#input_files= snakemake@input[grepl('sumstats', snakemake@input)]

lapply(snakemake@input[[2]], funk)
96
97
run:
    format_list(input[0], output[0])
107
108
109
110
# Move every input file to params[0] + <basename up to the first '-'> + '_temp.txt'.
run:
	for infile in input:
		# Text before the first '-' of the path, then its last '/'-separated part.
		# `infile`/`outfile` are read by name in the shell() template.
		outfile= params[0] + infile.split('-')[0].split('/')[-1] + '_temp.txt'
		shell('mv {infile} {outfile}')
118
119
run:
    format_list(input[0], output[0])
129
130
131
132
# Move every input file to params[0] + <basename up to the first '-'> + '_temp.txt'.
run:
        for infile in input:
                # Text before the first '-' of the path, then its last '/'-separated part.
                # `infile`/`outfile` are read by name in the shell() template.
                outfile= params[0] + infile.split('-')[0].split('/')[-1] + '_temp.txt'
                shell('mv {infile} {outfile}')
140
141
run:
    format_list(input[0], output[0])
151
152
153
154
# Move every input file to params[0] + <basename up to the first '-'> + '_temp.txt'.
run:
        for infile in input:
                # Text before the first '-' of the path, then its last '/'-separated part.
                # `infile`/`outfile` are read by name in the shell() template.
                outfile= params[0] + infile.split('-')[0].split('/')[-1] + '_temp.txt'
                shell('mv {infile} {outfile}')
163
164
run:
    format_list(input[0], output[0])
174
175
176
177
# Move every input file to params[0] + <basename up to the first '-'> + '_temp.txt'.
run:
        for infile in input:
                # Text before the first '-' of the path, then its last '/'-separated part.
                # `infile`/`outfile` are read by name in the shell() template.
                outfile= params[0] + infile.split('-')[0].split('/')[-1] + '_temp.txt'
                shell('mv {infile} {outfile}')
186
187
188
189
190
191
192
193
194
195
196
197
198
199
run:
	d= pd.read_csv(input[0], header= 0, sep= '\t', usecols= ['#CHROM', 'POS', 'REF', 'ALT', 'AF_EXCLUDING_1000G'])
	d.columns= ['CHR', 'POS', 'REF', 'ALT', 'eaf']
	d['CHR']= np.where(d.CHR=='X', '23', d.CHR)
	KG= pd.read_csv(input[1], header= 0, sep='\t', compression= 'gzip', names= ['ID', 'ALT', 'REF', 'eaf'])
	KG['ID']= KG['ID'].str.replace(':ID', '')
	KG['ID']= KG['ID'].str.replace('X', '23')
	d['eaf']= np.where(d['REF']> d['ALT'], 1 - d.eaf, d.eaf)
	KG['eaf']= np.where(KG['REF']> KG['ALT'], 1- KG.eaf, KG.eaf)
	d['REF']= np.where(d.REF.str.len() > d.ALT.str.len(), 'I', d.REF)
	d['ALT']= np.where(d.REF.str.len()< d.ALT.str.len(), 'I', d.ALT)
	d['REF']= np.where(d.ALT== 'I', 'D', d.REF)
	d['ALT']= np.where(d.REF== 'I', 'D', d.ALT)
	KG['REF']= np.where(KG.REF.str.len() > KG.ALT.str.len(), 'I', KG.REF)
229
230
script:
        'filter_SNPs.R'
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
library(data.table)
library(dplyr)

# Quality-filter one cohort file against the reference panel frequencies and
# record how many variants survive.
d <- fread(snakemake@input[[1]])

# Row count before filtering (reported in the log line below).
x1 <- nrow(d)

d <- arrange(d, CHR, POS, EFF, REF)

hrc <- fread(snakemake@input[[2]], header = TRUE)

d <- inner_join(d, hrc, by = 'ID')
rm(hrc)
# Fall back to the reference frequency when the cohort did not report one.
d$EAF <- ifelse(is.na(d$EAF), d$eaf, d$EAF)

# Store the allele pair in string-sorted order.
d[d$REF > d$EFF, c("REF", "EFF")] <- d[d$REF > d$EFF, c("EFF", "REF")]

d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

d <- filter(d, MAF > 0.005)

# Require an expected minor allele count above 6.
d <- filter(d, (MAF * 2 * N) > 6)

d$maf <- ifelse(d$eaf > 0.5, 1 - d$eaf, d$eaf)
d$P <- as.numeric(d$P)

d <- filter(d, P < 1, P > 0)
# Drop variants whose MAF differs from the reference MAF by 0.2 or more.
d <- filter(d, abs(MAF - maf) < 0.2)

d <- select(d, -c(maf, MAF, eaf))

# Row count after filtering.
x2 <- nrow(d)

write.table(d, snakemake@output[[1]], col.names = TRUE, row.names = FALSE, sep = '\t', quote = FALSE)

# Cohort label: second '_'-separated token of the 10th path component of the
# input file (depends on the directory depth of the input path).
cohort <- unlist(strsplit(unlist(strsplit(snakemake@input[[1]], '/'))[[10]], '_'))[2]
# One tab-separated line: cohort, rows before, rows after. The fields are
# pasted explicitly because cat() on a vector separates elements with spaces,
# which left stray blanks around every tab.
cat(paste(cohort, x1, x2, sep = '\t'), '\n', sep = '', file = snakemake@output[[2]])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import numpy as np
import re

#d= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)

#d['Allele1']= d['Allele1'].str.upper()
#d['Allele2']= d['Allele2'].str.upper()
#d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
#d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
#d['CHR']= d['CHR'].astype(str).astype(int)
#d['POS']= d['POS'].astype(str).astype(int)
#d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
#d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
#d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
#d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)

#d['CHR']= d['CHR'].astype(str).astype(int)
#d['POS']= d['POS'].astype(str).astype(int)

#d['pvalue']= d['pvalue'].astype(str).astype(float)

#d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
#d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF

#d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]

# Fields pulled out of VEP's free-form "Extra" column (KEY=VALUE;KEY=VALUE...).
col_list= ['IMPACT', 'DISTANCE', 'SYMBOL', 'SYMBOL_SOURCE', 'BIOTYPE']
df_list= list()


def _parse_extra(extra):
	"""Turn one VEP 'Extra' string into a {key: value} dict.

	Pairs are split on ';' followed by a word character; tokens without '='
	are ignored. The pattern is a raw string so that '\\w' reaches the regex
	engine unchanged (a plain string literal triggers an invalid-escape warning).
	"""
	return dict(x.split('=', 1) for x in re.split(r';(?=\w)', extra) if x.find('=') > -1)


# The VEP output is read in chunks; each chunk is reduced to one annotation per
# variant ID (protein-coding first, pseudogenes last) before being collected.
for vep in pd.read_csv(snakemake.input[1], sep= '\t', header= None, names= ['Variation', 'Location', 'Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'Extra'], comment= '#', chunksize= 100000):
	# Parse each row once instead of once per requested field. A missing key
	# yields '' (the old substring test raised KeyError when e.g. only
	# SYMBOL_SOURCE was present and SYMBOL was requested).
	extra= vep['Extra'].apply(_parse_extra)
	for i in col_list:
		vep[i]= extra.apply(lambda fields, key= i: fields.get(key, ''))
	# Explicit copies after column selection: columns are added/changed below.
	vep= vep[['Variation', 'Location', 'Existing_variation', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']].copy()
	vep.columns= ['ID', 'Location', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']
	# Priority used when one variant has several annotations:
	# 0 = protein_coding, 2 = any biotype containing 'pseudo', 1 = everything else.
	vep['BIOTYPE1']= np.where(vep.BIOTYPE== 'protein_coding', 0, np.where(vep.BIOTYPE.str.contains('pseudo'), 2, 1))
	vep['DISTANCE']= np.where(vep.DISTANCE== '', 0, vep.DISTANCE)
	# The uploaded variation name has the form chr_pos_A/B.
	vep[['chr', 'pos', 'All']]= vep.ID.str.split('_', expand= True)
	vep[['EFF', 'REF']]= vep.All.str.split('/', expand= True)
	# Store the allele pair in string-sorted order.
	vep.loc[vep.REF > vep.EFF, ['REF', 'EFF']] = vep.loc[vep.REF > vep.EFF, ['EFF', 'REF']].values
	vep[['CHR', 'POS']]= vep['Location'].str.split(':', expand= True)
	vep['CHR']= np.where(vep['CHR']== 'X', '23', vep['CHR'])
	vep['ID']= vep.CHR.astype(int).astype(str) + ':' + vep.POS.astype(int).astype(str) + ':' + vep.REF + ':' + vep.EFF
	vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE', 'BIOTYPE1']].copy()
	vep.sort_values(by= ['BIOTYPE1'], ascending= True, inplace= True)
	vep.drop_duplicates(subset= ['ID'], keep= 'first', inplace= True)
	df_list.append(vep)

vep= pd.concat(df_list)

# A variant can appear in more than one chunk, so deduplicate again globally.
vep.sort_values(by= ['BIOTYPE1'], ascending= True, inplace= True)
vep.drop_duplicates(subset= ['ID'], keep= 'first', inplace= True)
vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE']]


# Reformat the meta-analysis table, restrict it to well-covered common
# variants, and attach the VEP annotation built above (left join on ID).
d= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)
d['Allele1']= d['Allele1'].str.upper()
d['Allele2']= d['Allele2'].str.upper()
# Keep variants with more than half of the maximum sample size ...
d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
# ... and above a fixed sample-size threshold (hard-coded; origin not shown here).
d= d.loc[d.TOTALSAMPLESIZE> 66106, :]
# MarkerName is split on ':' into exactly four fields here.
d[['CHR', 'POS', 'REF','EFF']]= d['MarkerName'].str.split(':', expand= True)
d['CHR']= d['CHR'].astype(str).astype(int)
d['POS']= d['POS'].astype(str).astype(int)
# From here on EFF/REF come from Allele1/Allele2, not from MarkerName.
d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'P-value']]
d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'pvalue']
# Frequency is flipped before the alleles are swapped below, so both steps use
# the same REF > EFF condition.
d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
d['CHR']= d['CHR'].astype(str).astype(int)
d['POS']= d['POS'].astype(str).astype(int)
d['pvalue']= d['pvalue'].astype(str).astype(float)
d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
d['MAF']= np.where(d.EAF>0.5, 1 - d.EAF, d.EAF)
# Only variants with MAF >= 0.1 are kept.
d= d.loc[d.MAF>= 0.1, :]
d= pd.merge(d, vep, on= ['ID'], how= 'left')
d.to_csv(snakemake.output[0], header=True, index= False, sep= '\t')
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	dcols= d.columns.values[1:]
	d.drop('INFO', 1, inplace= True)
	d.columns= dcols
	d= d.loc[d.INFO>= 0.4, :]
	d['MAF']=  np.where(d.EAF_CONTR> 0.5, 1- d.EAF_CONTR, d.EAF_CONTR)
	d= d.loc[d.MAF * 2 * d.N >6, :]
	d.drop('MAF', 1, inplace= True)
	d['REF_ALLELE']= np.where(d.REF_ALLELE.str.len()> d.EFF_ALLELE.str.len(), 'I', d.REF_ALLELE)
	d['EFF_ALLELE']= np.where(d.REF_ALLELE.str.len()< d.EFF_ALLELE.str.len(), 'I', d.EFF_ALLELE)
	d['REF_ALLELE']= np.where(d.EFF_ALLELE== 'I', 'D', d.REF_ALLELE)
	d['EFF_ALLELE']= np.where(d.REF_ALLELE== 'I', 'D', d.EFF_ALLELE)
	d['CHR']= d.CHR.apply(str)
	d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
	d['ID']= np.where(d.REF_ALLELE> d.EFF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF_ALLELE + ':' + d.REF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF_ALLELE + ':' + d.EFF_ALLELE)
	d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO']]
	df= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_REC', 'INFO']]
	d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM','INFO']]
	d= d.loc[(d.P_VAL_DOM!= '.' ), :]
	df= df.loc[(df.P_VAL_REC!= '.'), :]
	d[['BETA_ADD', 'P_VAL_DOM']]= d[['BETA_ADD', 'P_VAL_DOM']].apply(pd.to_numeric, errors= 'coerce')
	df[['BETA_ADD', 'P_VAL_REC']]= df[['BETA_ADD', 'P_VAL_REC']].apply(pd.to_numeric, errors= 'coerce')
	d.dropna(axis= 0, inplace= True)
	df.dropna(axis= 0, inplace= True)
	d.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO']
	d.to_csv(output[0], sep= '\t', header= True, index= False)
	df.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO']
	df.to_csv(output[1], sep= '\t', header= True, index= False)
56
57
58
59
60
run:
	for i in range(len(input)):
		d= pd.read_csv(input[i], sep= '\t', header= 0)
		d= d.loc[d.INFO>= 0.4, :]
		d['MAF']=  np.where(d.EAF> 0.5, 1- d.EAF, d.EAF)
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
run:
	for i in range(len(input)):
		print(input[i])
		d= pd.read_csv(input[i], header= 0, delim_whitespace= True)
		d[['BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO', 'EAF_CONTR']]= d[['BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO', 'EAF_CONTR']].apply(pd.to_numeric, errors= 'coerce')
		d= d.loc[d.INFO>= 0.4, :]
		d['MAF']=  np.where(d.EAF_CONTR> 0.5, 1- d.EAF_CONTR, d.EAF_CONTR)
		d= d.loc[d.MAF * 2 * d.N >6, :]
		d.drop('MAF', 1, inplace= True)
		d['REF_ALLELE']= np.where(len(d.REF_ALLELE)> len(d.EFF_ALLELE), 'I', d.REF_ALLELE)
		d['EFF_ALLELE']= np.where(len(d.REF_ALLELE)< len(d.EFF_ALLELE), 'I', d.EFF_ALLELE)
		d['REF_ALLELE']= np.where(d.EFF_ALLELE== 'I', 'D', d.REF_ALLELE)
		d['EFF_ALLELE']= np.where(d.REF_ALLELE== 'I', 'D', d.EFF_ALLELE)
		d['CHR']= d.CHR.apply(str)
		d['CHR']= np.where(d.CHR== '0X', 'X', d.CHR)
		d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
		d['ID']= np.where(d.REF_ALLELE> d.EFF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF_ALLELE + ':' + d.REF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF_ALLELE + ':' + d.EFF_ALLELE)
		d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO']]
		df= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_REC', 'INFO']]
		d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM','INFO']]
		d= d.loc[(d.P_VAL_DOM!= '.' ), :]
		df= df.loc[(df.P_VAL_REC!= '.'), :]
		d.dropna(axis= 0, inplace= True)
		df.dropna(axis= 0, inplace= True)
		d.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO']
		d.to_csv(output[i], sep= '\t', header= True, index= False)
		df.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO']
		out= output[i].replace('dom', 'rec')
		df.to_csv(out, sep= '\t', header= True, index= False)
145
146
script:
	'filter_SNPs.R'
154
155
shell:
	'cat {input} > {output[0]}'
165
166
167
168
shell:
    '''
    /home/pol/software/generic-metal/metal {input[0]} >> {output[1]}
    '''
176
177
178
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        d[['CHR', 'POS', 'REF', 'EFF']]= d['MarkerName'].str.split(':', expand= True)
194
195
shell:
        '/home/pol/software/ensembl-vep/vep -i {input[0]} --check_existing --symbol --biotype --cache -O {output[0]} --offline --force_overwrite'
205
206
script:
	'format_VEP.py'
214
215
216
217
218
219
220
221
222
223
run:
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1'])
        d['CHR']= d.MarkerName.str.split(':').str[0]
        d['end']= d.MarkerName.str.split(':').str[1]
        d['CHR']= d.CHR.astype('str').astype('int')
        d['end']= d.end.astype('str').astype('int')
        d['start']= d.end - 1
        d.sort_values(by= ['CHR', 'start'], inplace= True)
        d= d[['CHR', 'start', 'end', 'MarkerName']]
        d.to_csv(output[0], sep= '\t', header= False, index= False)
232
233
shell:
        'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'
244
245
246
247
248
249
250
251
252
253
254
255
run:
        d= pd.read_csv(input[0], sep= '\t', header=0)
        rs= pd.read_csv(input[1], sep= '\t', header=0)
        d= pd.merge(d, rs, on= 'ID', how= 'left')
        d['RSID']= np.where(pd.isnull(d.RSID), d.name, d.RSID)
        d['RSID']= np.where(d.RSID== '', d.name, d.RSID)
        d['RSID']= np.where(d.RSID== '-', d.name, d.RSID)
        d.drop('name', 1, inplace= True)
        ne= pd.read_csv(input[2], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene', 'Ensembl_gene'])
        ne= ne[['ID', 'nearestGene']]
        d= pd.merge(d, ne, on= 'ID', how= 'left')
        d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip')
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
run:
	for nfile in range(len(input)):
		d= pd.read_csv(input[nfile], sep= '\t', header= 0)
		d['Allele1']= d['Allele1'].str.upper()
		d['Allele2']= d['Allele2'].str.upper()
		d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
		d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
		d['CHR']= d['CHR'].astype(str).astype(int)
		d['POS']= d['POS'].astype(str).astype(int)
		d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
		d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
		d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
		d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
		d['CHR']= d['CHR'].astype(str).astype(int)
		d['POS']= d['POS'].astype(str).astype(int)
		d['pvalue']= d['pvalue'].astype(str).astype(float)
		d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
		d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
		d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
		d.to_csv(output[nfile], header=True, index= False, sep= '\t')
48
49
run:
	for fnumber in range(len(input)):
72
73
74
75
76
77
# Run `bedtools closest` for every input whose path contains 'other_meta',
# against input[0], writing to the output with the same position.
run:
	meta_files= [x for x in input if 'other_meta' in x]
	for nfile in range(len(meta_files)):
		# `meta` and `out` are read by name in the shell() template below.
		meta= meta_files[nfile]
		out= output[nfile]
		shell('bedtools closest -t all -a {meta} -b {input[0]} > {out}')
96
97
run:
	rs= pd.read_csv(input[0], sep= '\t', header=0)
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
run:
        for nfile in range(len(input)):
                d= pd.read_csv(input[nfile], sep= '\t', header= 0)
                d['Allele1']= d['Allele1'].str.upper()
                d['Allele2']= d['Allele2'].str.upper()
                d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
                d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
                d['CHR']= d['CHR'].astype(str).astype(int)
                d['POS']= d['POS'].astype(str).astype(int)
                d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
                d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
                d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
                d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
                d['CHR']= d['CHR'].astype(str).astype(int)
                d['POS']= d['POS'].astype(str).astype(int)
                d['pvalue']= d['pvalue'].astype(str).astype(float)
                d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
                d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
                d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
                d.to_csv(output[nfile], header=True, index= False, sep= '\t')
144
145
146
147
148
149
150
run:
        rs= pd.read_csv(input[0], sep= '\t', header=0)
        d= pd.read_csv(input[1], sep= '\t', header=0)
        d= pd.merge(d, rs, on= 'ID', how= 'left')
        d['RSID']= d.name
        d.drop('name', 1, inplace= True)
        d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# Outcome summary statistics: one row per variant ID.
df= fread(snakemake@input[[1]], select= c('CHR', 'POS', 'ID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF'))

df= filter(df, !duplicated(ID))

df$MAF= ifelse(df$EAF>0.5, 1 - df$EAF, df$EAF)

# Lead positions with their nearest gene; POS is renamed to pos2 so it does not
# clash with the variant position after the join.
x= fread(snakemake@input[[2]], select= c('CHR', 'POS', 'nearestGene'))
x= x[, c('CHR', 'POS', 'nearestGene')]
names(x)= c('CHR', 'pos2', 'nearestGene')

# Pair every variant with every lead position on the same chromosome, then keep
# pairs inside the window [pos2 - 1.5 Mb, pos2 + 1.5 Mb). A variant can
# therefore appear once per overlapping window.
df= inner_join(df, x, by= 'CHR')

df= filter(df, POS>= pos2 - 1.5*10**6, POS< pos2 + 1.5*10**6)

# Second trait (the file with chr/pos/Allele1/Allele2/Freq1/Effect/StdErr).
z= fread(snakemake@input[[3]], select= c('chr', 'pos', 'Allele1', 'Allele2', 'Freq1', 'Effect', 'StdErr', 'TotalSampleSize'))

z$Allele1= toupper(z$Allele1)
z$Allele2= toupper(z$Allele2)

# chr:pos:<allele>:<allele> with the alleles in string-sorted order.
# NOTE(review): Effect is not sign-flipped when the alleles are reordered;
# only squared standard errors are visibly derived from it for coloc below —
# confirm the sign is not needed.
z$ID= with(z, ifelse(Allele1 > Allele2, paste(chr, pos, Allele2, Allele1, sep= ':'), paste(chr, pos, Allele1, Allele2, sep= ':')))

z$maf= ifelse(z$Freq1> 0.5, 1 - z$Freq1, z$Freq1)

z= select(z, ID, maf, Effect, StdErr, TotalSampleSize)

# Keep only variants present in both traits.
df= inner_join(df, z, by= 'ID')

rm(z)

pph_outfile= snakemake@output[[1]]
results_outfile= snakemake@output[[2]]


cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = snakemake@output[[1]])

cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file= snakemake@output[[2]])

prior1= 1 * 10**-4
prior2= 1 * 10**-4
prior12= 5 * 10**-6


df= data.frame(df)


# Run coloc.abf for the variants of one nearestGene window and append the
# results to the two output tables.
#
# temp_df: rows of the joined table for one nearestGene value.
# Side effects: appends one row to `pph_outfile` (posterior probabilities) and
# the per-SNP table to `results_outfile`. When there are no rows, or coloc.abf
# raises an error, an all-zero placeholder is appended to both files instead.
colocalization_eqtl <- function(temp_df) {
  protein <- unique(temp_df$nearestGene)

  # Append the all-zero placeholder rows and print 'next'.
  write_placeholder <- function() {
    PPH <- data.frame(nsnps= 0, PP.H0.abf= 0, PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, protein= protein)
    fwrite(PPH, pph_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
    res <- data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, protein= protein)
    fwrite(res, results_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
    print('next')
  }

  if (nrow(temp_df) == 0) {
    return(write_placeholder())
  }

  temp_df <- filter(temp_df, SE > 0, StdErr > 0)

  # The first trait is case-control for the allPTD and postTerm inputs (with a
  # fixed case proportion each) and quantitative otherwise.
  sumstats_path <- snakemake@input[[1]]
  case_prop <- NULL
  if (grepl('allPTD', sumstats_path)) {
    case_prop <- 0.067
  } else if (grepl('postTerm', sumstats_path)) {
    case_prop <- 0.122
  }

  data1 <- list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N= temp_df$TOTALSAMPLESIZE, snp= temp_df$ID, MAF= temp_df$MAF)
  if (is.null(case_prop)) {
    data1$type <- 'quant'
  } else {
    data1$type <- 'cc'
    data1$s <- case_prop
  }

  data2 <- list(beta= temp_df$Effect, varbeta= temp_df$StdErr**2, N= temp_df$TotalSampleSize, type= 'quant', snp= temp_df$ID, MAF= temp_df$maf)

  # Any coloc.abf error is mapped to the scalar 0, recognised below by length 1.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1= prior1, p2= prior2, p12= prior12)),
    error= function(e) 0
  )

  if (length(myres) == 1) {
    return(write_placeholder())
  }

  PPH <- data.frame(t(myres[[1]]))
  PPH$protein <- protein
  fwrite(PPH, pph_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
  res <- myres[[2]]
  res$protein <- protein
  fwrite(res, results_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
}



mclapply(split(df, df$nearestGene), colocalization_eqtl, mc.cores= 3)
12
13
script:
	'coloc_pQTL.R'
18
19
20
21
22
23
24
body .main-container {
  max-width: 1280px !important;
  width: 1280px !important;
}
body {
  max-width: 1280px !important;
}
28
pheno= unlist(strsplit(snakemake@input[[2]], '/'))[8]
40
41
42
43
44
45
46
47
48
49
library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library("data.table")
library(moments)
options(warn=-1)
opts_chunk$set(fig.width = 12)
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
# Collect per-cohort QC summaries and random variant samples for the plots of
# this report. Results are gathered in lists keyed by cohort and stacked at the
# end into p1, p2, p3 (summaries) and d, d2 (sampled variants).
p1_list= list()
p2_list= list()
p3_list= list()
df_list= list()
df2_list= list()
fl= length(snakemake@input)

# Reference cohort: the input whose path contains 'DECODE'. Only SNP and BETA
# are kept (the MAF column computed here is dropped again by the select).
dec= fread(snakemake@input[[grep('DECODE', snakemake@input)]])
dec$MAF= ifelse(dec$EAF>0.5, 1 - dec$EAF, dec$EAF)
dec= select(dec, c(SNP, BETA))
names(dec)= c('SNP', 'BETA_dec')
# Inputs 2..fl are cohort files; the first input is skipped.
for (i in 2:fl){

df= fread(snakemake@input[[i]])
df= select(df, -c(STRAND))

# Cohort label: 9th '/'-separated component of the input path (depends on the
# directory depth of the input path).
cohort= unlist(strsplit(snakemake@input[[i]], '/'))[9]

df$cohort= cohort
df= filter(df, !is.na(EAF))

# p1: median sample size and mean standard error.
p1= summarize(df, n_m= median(N, na.rm=T), se_m= mean(SE, na.rm=T))
p1$cohort= cohort
# p2: sqrt of the maximum N, and median(1/sqrt(2*EAF*(1-EAF))) / median(SE).
p2= summarize(df, N_max= sqrt(max(N)), EAF_m= median(1/sqrt(2*EAF*(1-EAF)), na.rm=T) / median(SE))
p2$cohort= cohort

# p3: skewness and kurtosis of the Z-scores of variants with an above-median p-value.
d= filter(df, pvalue> median(pvalue, na.rm=T))

p3= summarize(d,SK= skewness(BETA/SE), KU= kurtosis(BETA/SE))
p3$cohort= cohort

p1_list[[cohort]]= p1
p2_list[[cohort]]= p2
p3_list[[cohort]]= p3

# 5000 variants per chromosome, sampled with replacement (no seed is set here).
df_list[[cohort]]= group_by(df, CHR) %>% sample_n(5000, replace=T)

# Effect difference against the reference cohort for shared SNPs.
df= inner_join(df, dec, by= 'SNP')
df$beta_diff= df$BETA - df$BETA_dec

df2_list[[cohort]]= group_by(df, CHR) %>% sample_n(5000, replace=T)

}


p1= do.call("rbind", p1_list)
p2= do.call("rbind", p2_list)
p3= do.call("rbind", p3_list)
d= do.call("rbind", df_list)
d2= do.call('rbind', df2_list)
113
114
115
116
117
118
ggplot(p1, aes(n_m, se_m)) +
geom_point() +
geom_text(aes(label=cohort), hjust=0, vjust=0) +
theme_cowplot() +
xlab('Median(N)') +
ylab('Mean(SE)')
127
128
129
130
131
132
ggplot(p2, aes(N_max, EAF_m)) +
geom_point() +
geom_text(aes(label=cohort), hjust=0, vjust=0) +
theme_cowplot() +
xlab('SQRT(Max(N))') +
ylab('median(1/sqrt(2*EAF*(1-EAF)), na.rm=T) / median(SE)')
145
146
147
148
149
150
ggplot(p3, aes(SK, KU)) +
geom_point() +
geom_text(aes(label=cohort), hjust=0, vjust=0) +
theme_cowplot() +
xlab('Skewness (Z-score)') +
ylab('Kurtosis (Z-score)')
160
161
162
163
164
165
166
167
d$MAF= ifelse(d$EAF>0.5, 1 - d$EAF, d$EAF)

ggplot(d, aes(MAF, BETA)) +
geom_point() +
facet_wrap(vars(cohort), scales= 'free_y', ncol= 3) +
theme_cowplot() +
xlab('MAF') +
ylab('BETA')
177
178
179
180
181
182
183
184
d2$MAF= ifelse(d2$EAF>0.5, 1 - d2$EAF, d2$EAF)

ggplot(d2, aes(MAF, beta_diff)) +
geom_point() +
facet_wrap(vars(cohort), scales= 'free_y', ncol= 3) +
theme_cowplot() +
xlab('MAF') +
ylab('BETA cohort - BETA DECODE')
10
11
12
13
14
15
16
17
18
19
20
library(tint)

# Chunk defaults: code is not re-tidied, the tint version is added to the cache
# key, and figures are written as 600 dpi PNGs with warnings/messages hidden.
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(dev = 'png', warning = FALSE, message = FALSE, dpi = 600)

# Display name of the phenotype given by the Snakemake wildcard. The wildcard
# is a single string, so switch() is used instead of a vectorised ifelse();
# any value not listed falls through to the normalized phenotype.
pheno <- switch(
  snakemake@wildcards[['pheno']],
  allPTD = 'Preterm Delivery',
  postTerm = 'Post Term',
  GAraw = 'Gestational duration',
  'Normalized Gestational Duration'
)
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Genetic-correlation table; p1 and p2 hold the paths of the two
# summary-statistics files of each pair.
d <- fread(snakemake@input[[1]])

# Reduce a vector of paths to bare trait names: keep the piece that follows
# `marker` and drop the '.txt.sumstats.gz' suffix. Paths without the marker
# give NA. Vectorised replacement for a row-wise apply() over the data frame.
strip_path <- function(path, marker) {
  pieces <- strsplit(as.character(path), marker)
  after_marker <- vapply(pieces, function(parts) parts[2], character(1))
  gsub('.txt.sumstats.gz', '', after_marker)
}

d$p1 <- strip_path(d$p1, 'LDscore/')
d$p2 <- strip_path(d$p2, 'LDSC/')

# Map a trait code to its display label. Codes that are not listed are
# labelled 'Endometriosis' (the catch-all of the original mapping); missing
# codes stay NA.
label_trait <- function(trait) {
  trait_labels <- c(
    miscarriage = 'Miscarriage',
    GA_fetal = 'GA fetal effect',
    BW_maternal = 'Birth weight maternal effect',
    AFB = 'Age at first birth',
    AMenarche = 'Age at menarche',
    AMenopause = 'Age at menopause',
    NLB = 'Number of live births',
    Testosterone_fem = 'Testosterone (women)',
    SHBG_fem = 'SHBG (women)',
    SHBG_male = 'SHBG (men)',
    CBAT_fem = 'CBAT (women)',
    CBAT_male = 'CBAT (men)',
    Oestradiol_fem = 'Oestradiol (women)',
    POP = 'Pelvic Organ Prolapse',
    Testosterone_male = 'Testosterone (men)',
    leiomyoma_uterus = 'Leiomyoma uterus',
    BW_fetal = 'Birth weight fetal effect',
    BW_fetal_effect = 'Birth weight fetal effect (adjusted MG)',
    Preeclampsia = 'Pre-eclampsia',
    BW_maternal_effect = 'Birth weight maternal effect (adjusted FG)',
    PCOS = 'Polycistic ovary syndrome'
  )
  trait <- as.character(trait)
  labels <- unname(trait_labels[trait])
  labels[is.na(labels) & !is.na(trait)] <- 'Endometriosis'
  labels
}

d$trait <- label_trait(d$p2)
78
# Emit the trait labels as a Markdown bullet list.
cat(paste0('\n- ', as.character(d$trait)), sep = "\n")
82
Testosterone in males was further included as a negative control, and only after a first round of genetic correlations.  
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
# Split the genetic correlations into the birth-weight traits (plotted here)
# and all remaining traits (kept in df).
bw <- filter(d, grepl('Birth weight ', trait))
df <- filter(d, !grepl('Birth weight ', trait))

# Bonferroni flag for the remaining traits.
# NOTE(review): the denominator is hard-coded as nrow(df) - 7 — confirm it
# still matches the number of tests quoted in the report text.
df$significant <- ifelse(df$p < 0.05 / (nrow(df) - 7), '1', '0')

# Short axis labels for the four birth-weight traits; anything else becomes ''.
bw_labels <- c(
  'Birth weight fetal effect' = 'Fetal effect',
  'Birth weight fetal effect (adjusted MG)' = 'Fetal effect\n (adjusted MG)',
  'Birth weight maternal effect (adjusted FG)' = 'Maternal effect \n(adjusted FG)',
  'Birth weight maternal effect' = 'Maternal effect'
)
bw$trait <- unname(bw_labels[as.character(bw$trait)])
bw$trait[is.na(bw$trait)] <- ''

# Point estimate and 95% CI per trait, with dashed guide lines every 0.2.
# - The CI bounds are plain expressions: values wrapped in I() bypass the
#   scale in ggplot2 >= 3.5.
# - There is a single y scale: an extra ylim() would simply be replaced by
#   scale_y_continuous(). The guide lines already span -1 to 1.
ggplot(bw, aes(trait, rg, colour = trait)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = rg - 1.96 * se, ymax = rg + 1.96 * se),
    width = .1, position = position_dodge(.9)
  ) +
  theme_cowplot() +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(1, 2, 4, 8)]) +
  xlab('Birth weight') +
  ylab('Genetic correlation [95% CI]') +
  geom_hline(yintercept = 0) +
  scale_y_continuous(breaks = seq(-1, 1, 0.2)) +
  geom_hline(yintercept = seq(-1, 1, 0.2), colour = 'grey', size = 0.3, linetype = 'dashed')
115
116
117
118
119
120
121
122
123
124
125
126
# Genetic correlation and 95% CI for each non-birth-weight trait, coloured by
# the Bonferroni flag, with dashed guide lines every 0.2.
# - The CI bounds are plain expressions: values wrapped in I() bypass the
#   scale in ggplot2 >= 3.5.
# - There is a single y scale: an extra ylim() would simply be replaced by
#   scale_y_continuous(). The guide lines already span -1 to 1.
ggplot(df, aes(trait, rg, colour = significant)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = rg - 1.96 * se, ymax = rg + 1.96 * se),
    width = .2, position = position_dodge(.9)
  ) +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  scale_colour_manual(guide = 'none', values = c('#737373', colorBlindBlack8[2])) +
  xlab('Reproductive traits') +
  ylab('Genetic correlation [95% CI]') +
  geom_hline(yintercept = 0) +
  scale_y_continuous(breaks = seq(-1, 1, 0.2)) +
  geom_hline(yintercept = seq(-1, 1, 0.2), colour = 'grey', size = 0.3, linetype = 'dashed')
132
We used a Bonferroni corrected threshold for significance (0.05/ 13). We exclude testosterone in males, as this test was performed a posteriori as a negative control for testosterone in women.   
157
While coloc naively assumes a single causal variant, it does not require an LD matrix that represents the summary statistics used. Such a matrix is almost impossible to obtain without an LD matrix from each of the studies used in the meta-analysis.  
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# Read every input whose path contains 'pph' and stack them into one table
# of locus-level posterior probabilities.
inputs <- snakemake@input[grep('pph', snakemake@input)]

df_list <- unname(lapply(inputs, fread))

d <- do.call('rbind', df_list)

# Map a trait code to its display label. Codes that are not listed are
# labelled 'Endometriosis' (the catch-all of the original mapping); missing
# codes stay NA.
label_trait <- function(trait) {
  trait_labels <- c(
    miscarriage = 'Miscarriage',
    GA_fetal = 'GA fetal effect',
    BW_maternal = 'Birth weight maternal effect',
    AFB = 'Age at first birth',
    AMenarche = 'Age at menarche',
    AMenopause = 'Age at menopause',
    NLB = 'Number of live births',
    Testosterone_fem = 'Testosterone (women)',
    SHBG_fem = 'SHBG (women)',
    SHBG_male = 'SHBG (men)',
    CBAT_fem = 'CBAT (women)',
    CBAT_male = 'CBAT (men)',
    Oestradiol_fem = 'Oestradiol (women)',
    POP = 'Pelvic Organ Prolapse',
    Preeclampsia = 'Pre-eclampsia',
    Testosterone_male = 'Testosterone (men)',
    leiomyoma_uterus = 'Leiomyoma uterus',
    BW_fetal = 'Birth weight fetal effect',
    BW_fetal_effect = 'Birth weight fetal effect (adjusted MG)',
    BW_maternal_effect = 'Birth weight maternal effect (adjusted FG)',
    PCOS = 'Polycistic ovary syndrome'
  )
  trait <- as.character(trait)
  labels <- unname(trait_labels[trait])
  labels[is.na(labels) & !is.na(trait)] <- 'Endometriosis'
  labels
}

d$trait <- label_trait(d$trait)

# Human-readable locus names: underscores become spaces, 'chr' is capitalised
# and chromosome 23 is shown as X.
# NOTE(review): the last substitution replaces every '23' in the string, not
# only the chromosome number — a gene name containing '23' would be altered.
# The per-SNP results are formatted the same way further down, and the two
# must stay identical for the id matching to work.
d$locus <- gsub('_', ' ', d$locus)
d$locus <- gsub('chr', 'Chr', d$locus)
d$locus <- gsub('23', 'X', d$locus)
203
We identify the different loci as the chromosome where the locus is located and the nearest protein coding gene to the top associated genetic variant (e.g., Chr5 EBF1).  
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# Long table with the H3 ('Shared locus') and H4 ('Shared genetic variant')
# posterior probabilities of every trait-locus pair.
x1 <- transmute(d, locus, coloc = PP.H3.abf, trait, PP = 'Shared locus')
x2 <- transmute(d, locus, coloc = PP.H4.abf, trait, PP = 'Shared genetic variant')

x <- bind_rows(x1, x2)

# Evidence category from PP.H4: below 0.5 none, 0.5 up to 0.75 suggestive,
# 0.75 and above strong. Missing probabilities stay NA.
d$coloc <- case_when(
  d$PP.H4.abf < 0.5 ~ 'No evidence',
  d$PP.H4.abf < 0.75 ~ 'Suggestive evidence',
  d$PP.H4.abf >= 0.75 ~ 'Strong evidence'
)

# Trait-by-locus grid: point size follows PP.H4, colour/fill/alpha the
# evidence category and shape the effect direction. The size legend is hidden
# with guide = 'none' (the logical form is deprecated).
ggplot(d, aes(trait, locus, size = PP.H4.abf, fill = coloc, color = coloc, shape = direction, alpha = coloc)) +
  geom_point() +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  scale_size_binned('Posterior probability of colocalization', guide = 'none') +
  scale_alpha_manual('Colocalization', values = c(1, 0.55, 0.55)) +
  scale_shape_manual('Effect direction', values = c(25, 21, 24)) +
  scale_fill_manual('Colocalization', values = c('#737373', colorBlindBlack8[2], colorBlindBlack8[4])) +
  scale_colour_manual('Colocalization', values = c('#737373', colorBlindBlack8[2], colorBlindBlack8[4])) +
  xlab('') +
  ylab('')
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# One table per coloc hypothesis (H0-H4), each with columns locus, coloc
# (the posterior probability), trait and PP (the hypothesis name).
x0 <- transmute(d, locus, coloc = PP.H0.abf, trait, PP = 'H0')
x1 <- transmute(d, locus, coloc = PP.H1.abf, trait, PP = 'H1')
x2 <- transmute(d, locus, coloc = PP.H2.abf, trait, PP = 'H2')
x3 <- transmute(d, locus, coloc = PP.H3.abf, trait, PP = 'H3')
x4 <- transmute(d, locus, coloc = PP.H4.abf, trait, PP = 'H4')

# Stack them, with H4 first and H0 last.
x <- bind_rows(x0, x1, x2, x3, x4)
x <- x[order(x$PP, decreasing = TRUE), ]

# Flag posterior probabilities of at least 0.75.
x$evidence <- ifelse(x$coloc >= 0.75, '1', '0')

# Stacked H3/H4 bars per locus, one panel per trait; bars with a flagged
# probability are drawn less transparent. The alpha legend is hidden with
# guide = 'none' (the logical form is deprecated).
ggplot(filter(x, PP %in% c('H3', 'H4')), aes(fill = factor(PP), y = coloc, x = locus, alpha = evidence)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual('Posterior probability', values = c(colorBlindBlack8[2], colorBlindBlack8[4])) +
  scale_alpha_manual('Posterior probability', values = c(0.55, 0.8), guide = 'none') +
  facet_wrap(vars(trait), ncol = 3) +
  theme_cowplot() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    strip.background = element_blank(),
    legend.position = 'bottom'
  ) +
  xlab('') +
  ylab('')
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# Trait-locus pairs with PP.H4 of at least 0.75; only these get a regional plot.
z <- filter(d, PP.H4.abf >= 0.75)

# Per-SNP coloc results: every input whose path contains 'results_'.
res_inputs <- snakemake@input[grep('results_', snakemake@input)]

df_list <- list()

for (infile in res_inputs) {
  x <- fread(infile, select = c('snp', 'z.df1', 'z.df2', 'SNP.PP.H4', 'locus', 'trait'))

  # Long format: one row per SNP and z-score column, with a two-sided p-value.
  x <- gather(x, pheno, zscore, c(z.df1, z.df2))
  x$pvalue <- 2 * pnorm(-abs(x$zscore))
  # snp is 'CHR:POS:REF:EFF'; the original column is kept for labelling.
  x <- separate(x, snp, into = c('CHR', 'POS', 'REF', 'EFF'), sep = ':', remove = FALSE)
  x$POS <- as.numeric(x$POS)
  # z.df1 rows belong to the report phenotype, z.df2 rows to the other trait.
  x$pheno <- ifelse(x$pheno == 'z.df1', pheno, x$trait)
  df_list[[infile]] <- x
}

df <- bind_rows(df_list)

# Map a trait code to its display label. Codes that are not listed are
# labelled 'Endometriosis' (the catch-all of the original mapping); missing
# codes stay NA.
label_trait <- function(trait) {
  trait_labels <- c(
    miscarriage = 'Miscarriage',
    GA_fetal = 'GA fetal effect',
    BW_maternal = 'Birth weight maternal effect',
    AFB = 'Age at first birth',
    AMenarche = 'Age at menarche',
    AMenopause = 'Age at menopause',
    NLB = 'Number of live births',
    Testosterone_fem = 'Testosterone (women)',
    SHBG_fem = 'SHBG (women)',
    SHBG_male = 'SHBG (men)',
    CBAT_fem = 'CBAT (women)',
    CBAT_male = 'CBAT (men)',
    Oestradiol_fem = 'Oestradiol (women)',
    POP = 'Pelvic Organ Prolapse',
    Preeclampsia = 'Pre-eclampsia',
    Testosterone_male = 'Testosterone (men)',
    BW_fetal = 'Birth weight fetal effect',
    BW_fetal_effect = 'Birth weight fetal effect (adjusted MG)',
    leiomyoma_uterus = 'Leiomyoma uterus',
    BW_maternal_effect = 'Birth weight maternal effect (adjusted FG)',
    PCOS = 'Polycistic ovary syndrome'
  )
  trait <- as.character(trait)
  labels <- unname(trait_labels[trait])
  labels[is.na(labels) & !is.na(trait)] <- 'Endometriosis'
  labels
}

df$trait <- label_trait(df$trait)

# Same locus formatting as applied to the locus-level table, so that the ids
# built below match.
df$locus <- gsub('_', ' ', df$locus)
df$locus <- gsub('chr', 'Chr', df$locus)
df$locus <- gsub('23', 'X', df$locus)

# Keep the SNP-level rows of the strongly colocalized trait-locus pairs only.
z$id <- paste(z$trait, z$locus, sep = ':')
df$id <- paste(df$trait, df$locus, sep = ':')
df <- filter(df, id %in% z$id)

# Rows of the other trait now carry its display label.
df$pheno <- ifelse(df$pheno == pheno, pheno, df$trait)

# One two-panel regional plot per trait-locus pair.
for (i in unique(df$id)) {
  PP <- filter(z, id == i)$PP.H4.abf

  temp_df <- filter(df, id == i)
  temp_df$POS <- temp_df$POS / 10**6
  # SNP(s) with the highest posterior probability of being the shared variant.
  high_df <- filter(temp_df, SNP.PP.H4 == max(SNP.PP.H4))

  print(
    ggplot() +
      geom_point(data = temp_df, aes(POS, -log10(pvalue), colour = pheno), size = 1, alpha = 0.5) +
      geom_point(data = high_df, aes(POS, -log10(pvalue)), colour = colorBlindBlack8[1], size = 2) +
      facet_wrap(vars(pheno), nrow = 2, scales = "free_y") +
      theme_cowplot(font_size = 14) +
      theme(strip.background = element_blank()) +
      scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(4, 2)]) +
      ylab('-log10(pvalue)') +
      xlab('Position (Mbp)') +
      geom_text_repel(data = high_df, aes(x = POS, y = -log10(pvalue), label = snp), hjust = 0.5, size = 3, vjust = 1) +
      ggtitle(paste('Locus: ', unique(temp_df$locus), '. Posterior probability for shared causal variant: ', round(PP, 3)))
  )

  cat('  \n')
}
18
19
# Cohort and phenotype are the 9th and 8th '/'-separated components of the
# first input path.
cohort <- strsplit(snakemake@input[[1]], '/')[[1]][9]
pheno <- strsplit(snakemake@input[[1]], '/')[[1]][8]
29
30
31
32
33
34
35
36
37
library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library("data.table")
options(warn=-1)
#opts_chunk$set(dpi=300, out.width="300px")
45
# Summary statistics of the cohort; the first line is the header.
d <- fread(snakemake@input[[1]], header = TRUE)
53
# Table with the distribution summary of the main columns.
d %>%
  select(BETA, SE, pvalue, EAF, N) %>%
  summary() %>%
  kable()
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#dec= fread(snakemake@input[[3]],h=T, select= c('CHR', 'POS', 'BETA', 'SE', 'EFF', 'REF', 'EAF', 'pvalue'))

#names(dec)= c('CHR', 'POS', 'BETA_dec', 'SE_dec', 'A1_dec', 'A2_dec', 'EAF_dec', 'pvalue_dec')

#dec$BETA_dec= ifelse(dec$A1_dec> dec$A2_dec, dec$BETA_dec, -1* dec$BETA_dec)
#dec$EAF_dec= ifelse(dec$A1_dec> dec$A2_dec, dec$EAF_dec, 1- dec$EAF_dec)

# Reference allele frequencies (second input); the plot below labels them HRC.
x <- fread(snakemake@input[[2]], header = TRUE)

df <- d
df$CHR <- as.numeric(df$CHR)
x$CHR <- as.numeric(x$CHR)
# Keep EAF when EFF sorts after REF, otherwise take its complement.
df$EAF <- ifelse(df$EFF > df$REF, df$EAF, 1 - df$EAF)

# Match variants on chromosome and position. The keys are passed through
# `by`: dplyr joins have no `on` argument, so `on=` was either ignored
# (falling back to a join on all shared columns) or rejected, depending on
# the dplyr version.
df <- inner_join(df, x, by = c('CHR', 'POS'))

# Keep variants whose allele pair matches the reference in either orientation.
df <- filter(df, (EFF == ea & REF == oa) | (REF == ea & EFF == oa))

if (nrow(df) > 0) {
  ggplot(df, aes(eaf, EAF)) +
    geom_point(alpha = 1 / 10) +
    theme_cowplot(12) +
    xlab('EAF HRC') +
    ylab('EAF Sample')
} else {
  print('No match')
}

rm(df)
rm(x)
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Keep BETA as is when REF sorts after EFF, otherwise negate it.
d$BETA <- with(d, ifelse(REF > EFF, BETA, -BETA))
#dec= inner_join(d, dec, on= c('CHR', 'POS'))

#dec= filter(dec, EFF== A1_dec, REF== A2_dec, pvalue_dec< 0.001)

#dec= group_by(dec, CHR) %>% slice_sample(n= 1000, replace= T)

#ggplot(dec, aes(BETA_dec/SE_dec, BETA/SE)) +
#geom_point(alpha= 1/10) +
#theme_cowplot(12) +
#xlab('DECODE z-score') +
#ylab('Observed z-score')

#rm(dec)
125
126
127
128
129
130
131
# P-value implied by BETA and SE: upper tail of a 1-df chi-square at (BETA/SE)^2.
d$exp_pvalue <- pchisq((d$BETA / d$SE)^2, df = 1, lower.tail = FALSE)

# Reported against recomputed p-value on a random sample of 10000 variants
# per chromosome (drawn with replacement).
ggplot(
  d %>% group_by(CHR) %>% sample_n(10000, replace = TRUE),
  aes(-log10(exp_pvalue), -log10(pvalue))
) +
  geom_point(alpha = 1 / 10) +
  theme_cowplot() +
  labs(x = 'Expected pvalue', y = 'Observed pvalue')
140
141
142
143
144
145
146
147
# QQ plot. Variants are ranked by p-value and the expected -log10(p) of rank i
# out of n is -log10(i / n). seq_along() keeps the ranking valid for an empty
# table.
df <- arrange(d, pvalue) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue)))

# Only p < 0.05 is drawn. The font size is passed once, by name: a second
# positional 12 would be taken as theme_cowplot()'s next argument, the font
# family.
ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue))) +
  geom_point(size = 0.4) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))')
10
11
12
13
14
15
16
17
18
19
20
library(tint)

# Chunk defaults: code is not re-tidied, the tint version is added to the cache
# key, and figures are written as 600 dpi PNGs with warnings/messages hidden.
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(dev = 'png', warning = FALSE, message = FALSE, dpi = 600)

# Display name of the phenotype given by the Snakemake wildcard. The wildcard
# is a single string, so switch() is used instead of a vectorised ifelse();
# any value not listed falls through to the normalized phenotype.
pheno <- switch(
  snakemake@wildcards[['pheno']],
  allPTD = 'Preterm Delivery',
  postTerm = 'Post Term',
  GAraw = 'Gestational duration',
  'Normalized Gestational Duration'
)
29
30
31
32
33
34
35
36
37
38
39
library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)

colorBlindBlack8  <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Cohort-level results (input 1) and locus definitions (input 3).
d <- fread(snakemake@input[[1]])
z <- fread(snakemake@input[[3]])

# Meta-analysis results (input 2), renamed to the column names used in d.
df <- fread(
  snakemake@input[[2]],
  select = c('MarkerName', 'Effect', 'StdErr', 'HetISq', 'HetPVal', 'TOTALSAMPLESIZE', 'P-value', 'Allele1', 'Allele2')
)
names(df) <- c('SNP', 'BETA', 'SE', 'HetISq', 'HetPval', 'N', 'pvalue', 'A1', 'A2')

# Keep the variants present in d, split the marker name into its fields, flip
# the effect when A2 sorts after A1, recode chromosome X as 23 and tag the
# rows as the meta-analysis.
df <- df %>%
  filter(SNP %in% d$SNP) %>%
  separate(SNP, into = c('CHR', 'POS', 'Ax1', 'Ax2', 'ID'), sep = ':', remove = FALSE) %>%
  mutate(
    BETA = ifelse(A2 > A1, -1 * BETA, BETA),
    CHR = as.integer(ifelse(CHR == 'X', '23', CHR)),
    POS = as.integer(POS)
  ) %>%
  select(-c(A1, A2, ID, Ax1, Ax2)) %>%
  mutate(cohort = 'Meta-analysis')

d <- bind_rows(d, df)

z$CHR <- as.integer(ifelse(z$CHR == 'X', '23', z$CHR))

# Assign each variant to the locus whose (pos1, pos2) window contains it.
d <- d %>%
  inner_join(z, by = 'CHR') %>%
  filter(POS > pos1, POS < pos2)

# Plot labels: locus as 'Chr <n>: <gene>', cohort with its sample size.
d$locus <- paste0('Chr ', d$CHR, ': ', d$nearestGene)
d$cohort <- paste0(d$cohort, ' (n= ', d$N, ')')
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# One forest plot per locus, preceded by a heading and the meta-analysis
# estimate and followed by the heterogeneity test and a page break.
for (i in unique(d$locus)) {
  temp_df <- d[d$locus == i, ]
  # Rows with a heterogeneity statistic (presumably only the meta-analysis
  # row — confirm against the cohort input).
  meta <- temp_df[!is.na(temp_df$HetISq), ]

  cat('\n')

  cat("\n# Forest plot for locus ", i, "\n")

  cat("\n")

  cat('\n')

  cat('Lead variant: \n', meta$SNP)

  cat('\n')

  cat(paste0(
    '\n Meta-analysis: Beta= ', round(meta$BETA, 3),
    ' (95% CI= ', round(meta$BETA - 1.96 * meta$SE, 3),
    ', ', round(meta$BETA + 1.96 * meta$SE, 3),
    '); pvalue= ', meta$pvalue
  ))

  cat('\n')

  cat('\n')

  # Largest sample first; the factor levels below freeze this order on the axis.
  temp_df <- temp_df[order(temp_df$N, decreasing = TRUE), ]
  meta <- temp_df[!is.na(temp_df$HetISq), ]

  # unique() keeps factor() from failing if two rows share a cohort label.
  # The colour/shape legends are hidden with guide = 'none' (the logical form
  # is deprecated).
  print(
    ggplot(
      temp_df,
      aes(
        x = factor(cohort, levels = unique(cohort)), y = BETA,
        ymin = BETA - 1.96 * SE, ymax = BETA + 1.96 * SE,
        colour = !is.na(HetISq), shape = !is.na(HetISq)
      )
    ) +
      geom_pointrange(size = 1, alpha = 0.7) +
      scale_shape_manual(values = c(15, 18), guide = 'none') +
      geom_hline(yintercept = 0, linetype = 2) +
      scale_y_continuous(sec.axis = dup_axis()) +
      coord_flip() +
      scale_colour_manual(values = c(colorBlindBlack8[3], colorBlindBlack8[4]), guide = 'none') +
      theme_cowplot() +
      xlab('') +
      ylab('Beta [95% CI]') +
      geom_vline(xintercept = 0, linetype = "dotted", colour = 'grey')
  )

  cat('\n')

  cat('Test for heterogeneity: I^2^= ', meta$HetISq, '%; Het pvalue= ', meta$HetPval, '\n')

  cat('\\pagebreak')
}
10
11
12
13
14
15
16
17
18
19
20
library(tint)

# Chunk defaults: code is not re-tidied, the tint version is added to the cache
# key, and figures are written as 600 dpi PNGs with warnings/messages hidden.
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(dev = 'png', warning = FALSE, message = FALSE, dpi = 600)

# Display name of the phenotype given by the Snakemake wildcard. The wildcard
# is a single string, so switch() is used instead of a vectorised ifelse();
# any value not listed falls through to the normalized phenotype.
pheno <- switch(
  snakemake@wildcards[['pheno']],
  allPTD = 'Preterm Delivery',
  postTerm = 'Post Term',
  GAraw = 'Gestational duration',
  'Normalized Gestational Duration'
)
32
33
34
35
36
37
38
39
40
library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)
45
46
47
48
49
50
51
52
53
54
55
# Meta-analysis summary statistics; MAF is the smaller of EAF and 1 - EAF.
d <- fread(snakemake@input[[1]], header = TRUE)
d$MAF <- pmin(d$EAF, 1 - d$EAF)

# Colour-blind friendly palette shared by the plots of this report.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Distribution of minor allele frequency.
ggplot(data = d, mapping = aes(x = MAF)) +
  geom_density(fill = colorBlindBlack8[4]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'MAF')

#d= select(d, -MAF)
61
62
63
64
# Distribution of the per-variant total sample size.
ggplot(data = d, mapping = aes(x = TOTALSAMPLESIZE)) +
  geom_density(fill = colorBlindBlack8[3]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'Sample size')
72
73
74
75
# Distribution of the effect estimates.
ggplot(data = d, mapping = aes(x = BETA)) +
  geom_density(fill = colorBlindBlack8[2]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'Beta')
91
92
**Effective sample size** for binary phenotypes was calculated as:  
$$\frac{2}{(\frac{1}{Ncases} + \frac{1}{Ncontrols})}$$  
110
111
112
# Positions used to mark loci as 'Previous discovery' in the Manhattan plot
# (chromosome 23 stands for X; the chromosome 1 position is listed twice).
ge <- data.frame(
  CHR = c(5, 3, 1, 23, 1),
  pos_ge = c(157895049, 127881613, 22470407, 115164770, 22470407)
)

#kable(summary(select(d, BETA, SE, pvalue, EAF, TOTALSAMPLESIZE)), digits = c(3, 3, 5, 4, 0), col.names = c('Beta', 'Standard error', 'P-value', 'Effect allele frequency', 'Sample size'), caption= 'Summary statistics after QC.')
124
The same number of loci is obtained when using a larger radius (1.5Mb).
128
We note that we used a naive approach to identify independent loci, so the results should be interpreted cautiously. We mapped each top genetic variant to the nearest protein-coding gene, measuring the distance to the gene body (TSS or TES).  
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# Manhattan plot of the meta-analysis. The chunk (1) keeps one row per
# position, (2) picks one lead variant per locus, (3) lays the chromosomes end
# to end on the x axis, (4) colours variants by discovery status and (5) draws
# the plot with the lead variants labelled by gene.

# Sort by p-value so that, for duplicated CHR/POS pairs, the row with the
# smallest p-value is the one kept.
df= arrange(d, pvalue)

#df= group_by(df, CHR, POS) %>% filter(row_number() == 1) %>% ungroup()
df= df[!duplicated(df[, c('CHR', 'POS')]), ]

# Lead variants: genome-wide significant variants on a chromosome are chained
# into a clump for as long as consecutive positions are at most 250000 apart
# (a larger gap starts a new clump); within each clump the variant with the
# smallest p-value is kept, ties broken at random.
dg= df %>% arrange(CHR, POS) %>% filter(pvalue< 5*10**-8) %>% group_by(CHR) %>%
		mutate(d=POS-lag(POS, default=-Inf), clumpid=cumsum(d>250000)) %>%
		group_by(CHR, clumpid) %>%
		filter(rank(pvalue, ties.method = "random")==1) %>%
		mutate(GENE= ifelse(SYMBOL=='', RSID, SYMBOL)) %>%
		ungroup()

# The significance filter is applied once more and one row per CHR/POS is kept.
dg= filter(dg, pvalue< 5*10**-8)
dg= group_by(dg, CHR, POS) %>% filter(row_number()== 1)
#dg$GENE= ifelse(grepl('rs|-', dg$GENE), dg$nearestGene, ifelse(dg$GENE=='', dg$nearestGene, dg$GENE))
# The SYMBOL/RSID based label set above is replaced by the nearest gene.
dg$GENE= dg$nearestGene

# Cumulative genomic coordinate: each chromosome is offset by the summed
# lengths (largest observed position) of the chromosomes before it.
  don <- df %>%
    group_by(CHR)      %>%
    summarise(chr_len= max(POS)) %>%
    mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(df, ., by= 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate( BPcum=POS+tot) %>%
	 ungroup()

# Axis tick of each chromosome: midpoint of its cumulative coordinates.
  axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
  names(axisdf)= c('CHR', 'center')
# Genome-wide significance threshold on the -log10 scale.
HC= -log10(5*10**-8)
# Attach the gene label to the lead-variant rows of `don` (matched by ID);
# every other row gets NA. POS is then renamed to POS_new in dg.
dg= dg %>% ungroup() %>% select(ID, GENE, CHR, POS, MAF, BETA)
don= left_join(don, select(dg, ID, GENE), by= 'ID')
names(dg)= c('ID', 'GENE', 'CHR', 'POS_new', 'MAF', 'BETA')

# Half-width of the window coloured around each locus.
lims= 250000

# Discovery status: 0 for p-values above 5e-8 and NA otherwise, then
# overwritten inside the windows below.
don$disc= ifelse(don$pvalue> 5*10**-8, 0, NA)

# Variants within +/- lims of a lead variant are set to 2 ...
for (i in rownames(dg)) {
don= mutate(don, disc= ifelse(CHR== as.integer(dg[i, 'CHR']) & POS>= as.integer(dg[i, 'POS_new']) - lims & POS<= as.integer(dg[i, 'POS_new']) + lims, 2, disc))
}

# ... and those within +/- lims of a position listed in `ge` are set to 1,
# which overrides the 2 assigned above.
for (i in rownames(ge)) {
don= mutate(don, disc= ifelse(CHR== as.integer(ge[i, 'CHR']) & POS>= as.integer(ge[i, 'pos_ge']) - lims & POS<= as.integer(ge[i, 'pos_ge']) + lims, 1, disc))
}

# Order rows by status (0, 1, 2, then NA); points are presumably drawn in this
# order so coloured loci end up on top — confirm. Status codes become labels.
don= don[order(don$disc, decreasing= F, na.last= T), ]
don$disc= factor(don$disc, levels=c(0, 1, 2), labels=c('Not significant', 'Previous discovery', 'New discovery'))

cols <- c('Not significant'= 'grey', 'Previous discovery'= colorBlindBlack8[4], 'New discovery'= colorBlindBlack8[2])

# Rows that carry a gene label (the lead variants) take the nearest gene as label.
don$GENE= ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)
ggplot(don) +
    geom_point(data= don, aes(x=BPcum, y= -log10(pvalue), colour= disc), size=0.3) +   # Show all points
theme_cowplot(font_size= 12) + #theme_minimal_hgrid(12, rel_small = -1) + 
#scale_alpha_manual(values= rep(c(1/10, 1/2), 23)) +
scale_colour_manual(values= cols) +
    scale_x_continuous(label = axisdf$CHR, breaks= axisdf$center, expand=c(0,0) ) + # custom X axis
scale_y_continuous(expand= c(0,0)) +
         xlab('Chromosome') +
    ylab('-log10(pvalue)') +
labs(colour= '') +
geom_hline(yintercept= 0, size= 0.5, colour= 'black') +
geom_hline(yintercept= HC, size= 0.5, linetype= 2, colour= '#878787') +
geom_text_repel(data= don, aes(x= BPcum, y= -log10(pvalue), label= GENE), size= 3, hjust= 1, force= 1, vjust= 1, colour= 'black') +
theme(legend.position= 'bottom') +
guides(colour = guide_legend(override.aes = list(size=3)))
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# Independent signals per locus (input 6). The columns used here (BP, P,
# TOTAL, NSIG, SP2) look like PLINK --clump output — confirm.
indep <- fread(snakemake@input[[6]])

# Number of variants listed in the comma-separated SP2 field. A field holding
# only the placeholder 'NONE' lists no variant and counts as zero, rather than
# as one.
indep$nd2P <- ifelse(indep$SP2 %in% 'NONE', 0L, lengths(strsplit(indep$SP2, ',')))

# Assign each signal to the lead variant(s) within 1.5 Mb on the same
# chromosome, then summarise per gene.
indep <- inner_join(indep, dg, by = 'CHR')
indep <- filter(indep, BP >= POS_new - 1.5 * 10**6, BP <= POS_new + 1.5 * 10**6)
indep_df <- group_by(indep, GENE) %>%
  summarize(total = sum(TOTAL), nsig = sum(NSIG), GWS = n(), sug_ev = sum(nd2P), mP = min(P))

# Order the genes by their smallest p-value (largest first; coord_flip() in
# the plots below puts the first level at the bottom) and drop HLA loci.
indep_df <- indep_df[order(indep_df$mP, decreasing = TRUE), ]
indep_df$GENE <- factor(indep_df$GENE, levels = indep_df$GENE)
indep_df <- filter(indep_df, !grepl('HLA', GENE))

# Panel 1: number of independent signals per locus.
p1 <- ggplot(data = indep_df, aes(x = GENE, y = GWS)) +
  geom_col(fill = colorBlindBlack8[2], alpha = 0.6) +
  theme_cowplot() +
  ylab('# Independent GW significant') +
  xlab('Locus') +
  coord_flip()

# Panel 2: total number of variants per locus (locus axis hidden).
p2 <- ggplot(data = indep_df, aes(x = GENE, y = total)) +
  geom_col(fill = colorBlindBlack8[4], alpha = 0.6) +
  theme_cowplot() +
  ylab('Total # of \n genetic variants in locus') +
  xlab('Locus') +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  coord_flip()

# Panel 3: share of the locus variants that are listed in SP2 (locus axis hidden).
p3 <- ggplot(data = indep_df, aes(x = GENE, y = sug_ev / total)) +
  geom_col(fill = colorBlindBlack8[8], alpha = 0.6) +
  theme_cowplot() +
  ylab('Proportion of variants with P<1e-5') +
  xlab('Locus') +
  ylim(0, 1) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  coord_flip()

plot_grid(p1, p2, p3, align = "h", nrow = 1)
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
# Split the variants into MAF tertiles; m1 and m2 are the upper MAF bounds of
# the first and second tertile. which() drops rows with a missing tertile.
d <- mutate(d, maf_tertiles = ntile(MAF, 3))
m1 <- round(max(d$MAF[which(d$maf_tertiles == 1)]), 3)
m2 <- round(max(d$MAF[which(d$maf_tertiles == 2)]), 3)

# Legend labels. The middle tertile lies between the two bounds, so its label
# reads 'm1 < MAF < m2'.
d$maf_tertiles <- factor(
  d$maf_tertiles,
  levels = c("1", "2", "3"),
  labels = c(paste('MAF<', m1), paste(m1, '< MAF <', m2), paste('MAF>', m2))
)

# Expected -log10(p) by rank within each tertile. seq_along() keeps the
# ranking valid for an empty group.
df <- arrange(d, pvalue) %>%
  group_by(maf_tertiles) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue)))

# QQ plot for p < 0.05, one colour per tertile.
ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue), color = maf_tertiles)) +
  geom_point(size = 0.4) +
  scale_color_manual(values = colorBlindBlack8[2:4]) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour = "") +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))') +
  theme(legend.position = 'bottom') +
  guides(colour = guide_legend(override.aes = list(size = 3)))
292
293
294
295
296
297
298
# Keep variants with p < 1e-4 in `don`, and copy the genome-wide significant
# ones to d1 sorted by p-value. P-values are then formatted to 3 significant
# digits in both tables.
don <- filter(don, pvalue < 1 * 10**-4)
d1 <- filter(don, pvalue < 5 * 10**-8)
d1 <- d1[order(d1$pvalue, decreasing = FALSE), ]
d1$pvalue <- format(d1$pvalue, digits = 3)
don$pvalue <- format(don$pvalue, digits = 3)

# Table of the significant variants that carry a gene label.
d1 %>%
  filter(GENE != '') %>%
  select(ID, RSID, GENE, TOTALSAMPLESIZE, EAF, BETA, SE, pvalue, Consequence) %>%
  kable(
    col.names = c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'Beta', 'SE', 'P-value', 'Consequence'),
    digits = 3
  ) #%>% kable_styling(latex_options = c("striped", "scale_down"))
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
# Absolute effect size against MAF for the lead variants, labelled by gene.
# Drawn only when there is more than one lead variant.
if (nrow(dg) > 1) {
  # - alpha is set on the point layer: ggplot() itself ignores it.
  # - The label layer uses show.legend = FALSE: geom_text_repel() has no
  #   `guide` argument.
  # - The size legend is hidden with guide = 'none' (the logical form is
  #   deprecated).
  print(
    ggplot(dg, aes(MAF, abs(BETA), size = abs(BETA))) +
      geom_point(colour = colorBlindBlack8[4], alpha = 0.7) +
      theme_cowplot(font_size = 12) +
      scale_size_continuous(name = 'Absolute Beta', guide = 'none') +
      geom_text_repel(data = dg, aes(label = GENE), show.legend = FALSE) +
      xlab('Minor allele frequency') +
      ylab('Absolute effect size')
  )
} else {
  print('Only one locus identified, check the table.')
  plot_comment <- ''
}
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# Heterogeneity statistics of the meta-analysis (input 5), restricted to the
# lead variants in dg. The ':SNP' / ':INDEL' suffix is removed from the marker
# names so that they match the IDs in dg.
z <- fread(snakemake@input[[5]], header = TRUE, sep = '\t', select = c('MarkerName', 'Effect', 'P-value', 'HetPVal'))

names(z) <- c('ID', 'beta', 'pvalue', 'het_pvalue')
z$ID <- gsub(':SNP', '', z$ID)
z$ID <- gsub(':INDEL', '', z$ID)

z <- inner_join(z, dg, by = 'ID')

if (nrow(z) > 1) {
  plot_comment <- 'No pattern between effect size and heterogeneity. Attention should be paid to the top hit.'

  z$Direction <- ifelse(z$beta > 0, 'Positive', 'Negative')

  # Association against heterogeneity p-value, point size by absolute effect.
  # alpha is set on the point layer: ggplot() itself ignores it.
  ggplot(z, aes(-log10(het_pvalue), -log10(pvalue), size = abs(beta))) +
    geom_point(colour = colorBlindBlack8[4], alpha = 0.7) +
    theme_cowplot(font_size = 12) +
    scale_size_continuous(name = 'Absolute effect size') +
    geom_text_repel(data = z, aes(label = GENE), hjust = 1, show.legend = FALSE) +
    xlab('-log10(Het pvalue)') +
    ylab('-log10(Association pvalue)') +
    theme(legend.position = "bottom")
} else {
  print(paste('Pvalue for heterogeneity: ', z$het_pvalue))
  #plot_comment= ''
}
376
377
378
379
# Table of the variants in `don` annotated with HIGH or MODERATE impact.
don %>%
  filter(IMPACT %in% c('HIGH', 'MODERATE')) %>%
  select(ID, RSID, SYMBOL, TOTALSAMPLESIZE, EAF, BETA, SE, pvalue, Consequence) %>%
  kable(
    col.names = c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'Beta', 'SE', 'P-value', 'Consequence'),
    digits = 3
  )

# From the log in input 2, keep the line two below the
# 'Heritability of phenotype 1' header.
x <- readLines(snakemake@input[[2]])
x <- x[match('Heritability of phenotype 1', x) + 2]
393
Ideally, calculate LDscores from our sample (MOBAGENETICS) or from a bigger cohort (UKBIOBANK).
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
# Genetic correlations between phenotypes (input 3). The phenotype names are
# the last component of each path, without the '.txt.sumstats.gz' suffix.
d <- fread(snakemake@input[[3]])

d$pheno1 <- sapply(strsplit(as.character(d$p1), '/'), tail, 1)
d$pheno1 <- gsub('.txt.sumstats.gz', '', d$pheno1)
d$pheno2 <- sapply(strsplit(as.character(d$p2), '/'), tail, 1)
d$pheno2 <- gsub('.txt.sumstats.gz', '', d$pheno2)

# Clamp the point estimates to [-1, 1].
d$rg <- pmin(pmax(d$rg, -1), 1)

# The y range covers at least [-1, 1] and is widened when a confidence limit
# falls outside it.
maxy <- max(1, max(d$rg + 1.96 * d$se))
miny <- min(-1, min(d$rg - 1.96 * d$se))

# Estimate and 95% CI per phenotype.
# - The CI bounds are plain expressions: values wrapped in I() bypass the
#   scale in ggplot2 >= 3.5.
# - The colour legend is hidden with guide = 'none' (the logical form is
#   deprecated).
ggplot(d, aes(pheno2, rg, colour = pheno2)) +
  geom_point(alpha = 0.5) +
  geom_errorbar(
    aes(ymin = rg - 1.96 * se, ymax = rg + 1.96 * se),
    width = .2, position = position_dodge(.9)
  ) +
  theme_cowplot(font_size = 9) +
  scale_fill_manual(values = colorBlindBlack8[2:4]) +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[2:4]) +
  xlab('Phenotype') +
  ylab(paste0('R coefficient [95% CI] \n', pheno)) +
  ylim(c(miny, maxy)) +
  theme(legend.position = 'none')

link <- 'https://drive.google.com/drive/folders/101ErlqwE4_iFwZFCTM0QZUtUVwOoOE1L?usp=sharing'
10
11
12
13
14
15
16
17
18
19
20
library(tint)

# Chunk defaults: code is not re-tidied, the tint version is added to the cache
# key, and figures are written as 600 dpi PNGs with warnings/messages hidden.
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(dev = 'png', warning = FALSE, message = FALSE, dpi = 600)

# Genetic model of the report: 'recessive' when the first input path contains
# 'rec', otherwise 'dominant'. The test is on a single string, so a plain
# if/else is used instead of a vectorised ifelse().
model <- if (grepl('rec', snakemake@input[[1]])) 'recessive' else 'dominant'
31
32
33
34
35
36
37
38
39
library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)
44
45
46
47
48
49
50
51
52
53
54
55
56
# Summary statistics of the non-additive model; MAF is the smaller of EAF and
# 1 - EAF.
d <- fread(snakemake@input[[1]], header = TRUE)
d$MAF <- pmin(d$EAF, 1 - d$EAF)

# Keep variants with a total sample size above 66106.
d <- filter(d, TOTALSAMPLESIZE > 66106)

# Colour-blind friendly palette shared by the plots of this report.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Distribution of minor allele frequency.
ggplot(data = d, mapping = aes(x = MAF)) +
  geom_density(fill = colorBlindBlack8[4]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'MAF')

#d= select(d, -MAF)
62
63
64
65
# Distribution of the per-variant total sample size.
ggplot(data = d, mapping = aes(x = TOTALSAMPLESIZE)) +
  geom_density(fill = colorBlindBlack8[3]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'Sample size')
95
96
97
98
# Loci found under the additive model (input 2): chromosome X is recoded as
# 23, and each locus is represented by the rounded midpoint of its
# (pos1, pos2) window.
add_model <- fread(snakemake@input[[2]])
add_model$CHR <- as.numeric(ifelse(add_model$CHR == 'X', '23', add_model$CHR))
add_model$pos <- round((add_model$pos1 + add_model$pos2) / 2)
110
The same number of loci is obtained when using a larger radius (1.5Mb).
114
We note that we used a naive approach to identify independent loci, so the results should be interpreted cautiously. We mapped each top genetic variant to the nearest protein-coding gene, measuring the distance to the gene body (TSS or TES).  
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# Manhattan plot of this analysis, colouring the neighbourhood of each top
# variant and of each additive-model locus.

# Order by p-value so that, for duplicated CHR/POS pairs, the row kept below
# is the one with the smallest p-value.
df= arrange(d, pvalue)

#df= group_by(df, CHR, POS) %>% filter(row_number() == 1) %>% ungroup()
df= df[!duplicated(df[, c('CHR', 'POS')]), ]

# Distance-based clumping of variants with p < 5e-8: within a chromosome a new
# clump starts whenever the gap to the previous significant variant exceeds
# 250000 bp; one variant with the lowest p-value is kept per clump (ties are
# broken at random).
dg= df %>% arrange(CHR, POS) %>% filter(pvalue< 5*10**-8) %>% group_by(CHR) %>%
		mutate(d=POS-lag(POS, default=-Inf), clumpid=cumsum(d>250000)) %>%
		group_by(CHR, clumpid) %>%
		filter(rank(pvalue, ties.method = "random")==1) %>%
		mutate(GENE= ifelse(SYMBOL=='', RSID, SYMBOL)) %>%
		ungroup()

# dg already contains only p < 5e-8 rows, so this filter removes nothing more.
dg= filter(dg, pvalue< 5*10**-8)
dg= group_by(dg, CHR, POS) %>% filter(row_number()== 1)
#dg$GENE= ifelse(grepl('rs|-', dg$GENE), dg$nearestGene, ifelse(dg$GENE=='', dg$nearestGene, dg$GENE))
# The label computed in the pipeline above is replaced by nearestGene.
dg$GENE= dg$nearestGene

# Cumulative genomic position (BPcum) of every variant: each chromosome is
# offset by the summed maximum positions of the chromosomes before it.
  don <- df %>%
    group_by(CHR)      %>%
    summarise(chr_len= max(POS)) %>%
    mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(df, ., by= 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate( BPcum=POS+tot) %>%
	 ungroup()

# Tick position of each chromosome: midpoint of its BPcum range.
  axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
  names(axisdf)= c('CHR', 'center')
# Genome-wide significance line on the -log10 scale.
HC= -log10(5*10**-8)
dg= dg %>% ungroup() %>% select(ID, GENE, CHR, POS, MAF)
# Only the top variants receive a GENE value; every other row gets NA.
don= left_join(don, select(dg, ID, GENE), by= 'ID')
names(dg)= c('ID', 'GENE', 'CHR', 'POS_new', 'MAF')

# Half-width (bp) of the window coloured around each locus.
lims= 250000

# disc codes: 0 = p above 5e-8; NA = significant, not yet assigned to a window.
don$disc= ifelse(don$pvalue> 5*10**-8, 0, NA)

don= data.frame(don)
dg= data.frame(dg)
add_model= data.frame(add_model)

# Code 2: any variant (significant or not) within `lims` of a top variant.
for (i in rownames(dg)) {
don= mutate(don, disc= ifelse(CHR== as.integer(dg[i, 'CHR']) & POS>= as.integer(dg[i, 'POS_new']) - lims & POS<= as.integer(dg[i, 'POS_new']) + lims, 2, disc))
}

# Code 1: any variant within `lims` of an additive-model locus midpoint. This
# loop runs second, so code 1 overrides code 2 where the windows overlap.
for (i in rownames(add_model)) {
don= mutate(don, disc= ifelse(CHR== as.integer(add_model[i, 'CHR']) & POS>= as.integer(add_model[i, 'pos']) - lims & POS<= as.integer(add_model[i, 'pos']) + lims, 1, disc))
}

# Sort so code 0 rows come first and NA rows last, then label the codes.
don= don[order(don$disc, decreasing= F, na.last= T), ]
don$disc= factor(don$disc, levels=c(0, 1, 2), labels=c('Not significant', 'Additive model discovery', 'New discovery'))

cols <- c('Not significant'= 'grey', 'Additive model discovery'= colorBlindBlack8[4], 'New discovery'= colorBlindBlack8[2])

# Rows that carry a GENE label take their own nearestGene value; the rest
# stay NA.
don$GENE= ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)
ggplot(don) +
    geom_point(data= don, aes(x=BPcum, y= -log10(pvalue), colour= disc), size=0.3) +   # Show all points
theme_cowplot(font_size= 12) + #theme_minimal_hgrid(12, rel_small = -1) + 
#scale_alpha_manual(values= rep(c(1/10, 1/2), 23)) +
scale_colour_manual(values= cols) +
    scale_x_continuous(label = axisdf$CHR, breaks= axisdf$center, expand=c(0,0) ) + # custom X axis
scale_y_continuous(expand= c(0,0)) +
         xlab('Chromosome') +
    ylab('-log10(pvalue)') +
labs(colour= '') +
geom_hline(yintercept= 0, size= 0.5, colour= 'black') +
geom_hline(yintercept= HC, size= 0.5, linetype= 2, colour= '#878787') +
geom_text_repel(data= don, aes(x= BPcum, y= -log10(pvalue), label= GENE), size= 3, hjust= 1, force= 1, vjust= 1, colour= 'black') +
theme(legend.position= 'bottom') +
guides(colour = guide_legend(override.aes = list(size=3)))
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# QQ plot of observed vs expected -log10(p), drawn separately per MAF tertile.

# Split variants into MAF tertiles and take the largest MAF of the two lower
# tertiles as the legend boundaries. which() drops rows with missing tertile.
d <- mutate(d, maf_tertiles = ntile(MAF, 3))
m1 <- round(max(d$MAF[which(d$maf_tertiles == 1)]), 3)
m2 <- round(max(d$MAF[which(d$maf_tertiles == 2)]), 3)

# The middle tertile lies between the two boundaries, so its label reads
# 'm1 < MAF < m2'.
d$maf_tertiles <- factor(
  d$maf_tertiles,
  levels = c("1", "2", "3"),
  labels = c(paste('MAF<', m1), paste(m1, '< MAF <', m2), paste('MAF>', m2))
)

# Expected -log10(p) under the null: rank / group size within each tertile,
# with variants ranked by ascending p-value. Grouping is dropped afterwards.
df <- arrange(d, pvalue) %>%
  group_by(maf_tertiles) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue))) %>%
  ungroup()

# Only variants with p < 0.05 are drawn.
ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue), color = maf_tertiles)) +
  geom_point(size = 0.4) +
  scale_color_manual(values = colorBlindBlack8[2:4]) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour = "") +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))') +
  theme(legend.position = 'bottom') +
  guides(colour = guide_legend(override.aes = list(size = 3)))
230
231
232
233
234
235
236
# Keep variants with p < 1e-4 for the tables; the genome-wide significant
# subset is ordered by ascending p-value.
don <- don %>% filter(pvalue < 1 * 10^-4)
d1 <- don %>%
  filter(pvalue < 5 * 10^-8) %>%
  arrange(pvalue)

# From here on the p-values are formatted strings with 3 significant digits.
d1$pvalue <- format(d1$pvalue, digits = 3)
don$pvalue <- format(don$pvalue, digits = 3)

# Table of significant variants that carry a non-empty gene label.
d1 %>%
  filter(GENE != '') %>%
  select(ID, RSID, GENE, TOTALSAMPLESIZE, EAF, pvalue, Consequence) %>%
  kable(
    col.names = c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'P-value', 'Consequence'),
    digits = 3
  )
249
# Table of the retained variants annotated with HIGH or MODERATE impact.
don %>%
  filter(IMPACT %in% c('HIGH', 'MODERATE')) %>%
  select(ID, RSID, SYMBOL, TOTALSAMPLESIZE, EAF, pvalue, Consequence) %>%
  kable(
    col.names = c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'P-value', 'Consequence'),
    digits = 3
  )
10
11
12
13
14
15
16
17
18
library(tint)

# Chunk defaults: no code tidying, cache keyed on the tint version, PNG output
# at 600 dpi, warnings and messages hidden.
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)
knitr::opts_chunk$set(dev = 'png', warning = FALSE, message = FALSE, dpi = 600)

# Report labels are derived from the name of the first input file.
cohort <- if (grepl('MOBA', snakemake@input[[1]])) 'MoBa' else '23andMe'
pheno <- if (grepl('GAraw', snakemake@input[[1]])) 'GA days' else 'GA normalized'
30
31
32
33
34
35
36
37
38
library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)
45
46
47
48
49
50
51
52
53
54
55
# Load the summary statistics and derive the minor allele frequency as the
# smaller of EAF and 1 - EAF.
d <- fread(snakemake@input[[1]], header = TRUE)
d$MAF <- pmin(d$EAF, 1 - d$EAF)

# Eight-colour palette reused by the plots in this report.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Density of minor allele frequencies.
ggplot(data = d, mapping = aes(x = MAF)) +
  geom_density(fill = colorBlindBlack8[4]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'MAF')

#d= select(d, -MAF)
61
62
63
64
# Density of the per-variant total sample size.
ggplot(data = d, mapping = aes(x = TOTALSAMPLESIZE)) +
  geom_density(fill = colorBlindBlack8[3]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'Sample size')
72
73
74
75
# Density of the estimated effect sizes.
ggplot(data = d, mapping = aes(x = BETA)) +
  geom_density(fill = colorBlindBlack8[2]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'Beta')
89
90
**Effective sample size** for binary phenotypes was calculated as:  
$$\frac{2}{(\frac{1}{Ncases} + \frac{1}{Ncontrols})}$$  
105
106
107
108
109
110
111
112
113
114
115
116
# Fixed chromosome / position pairs.
ge <- data.frame(
  CHR = c(5, 3, 1, 23, 1),
  pos_ge = c(157895049, 127881613, 22470407, 115164770, 22470407)
)

# Hard-coded variant IDs; the list used depends on whether the first input
# file name contains 'GAraw'.
topids <- if (grepl('GAraw', snakemake@input[[1]])) {
  c('1:22462111:A:G', '3:128038373:A:C', '5:157896786:C:T', '23:115184372:A:C', '1:228216997:A:C', '3:123112292:C:T', '3:141147414:C:T', '3:155859113:A:G', '23:131268226:C:T', '2:74207357:A:G', '4:174734471:A:G', '6:32589937:A:G', '6:49559793:G:T', '9:16408826:A:G', '20:62692060:A:C')
} else {
  c('1:22414785:G:T', '5:157895049:C:T', '23:115129904:C:T', '1:41955090:A:G', '1:50959262:A:C', '3:14293832:A:G', '3:139004333:A:G', '3:141147414:C:T', '3:155862524:A:G', '3:156697097:A:G', '2:74253326:A:G', '4:55895282:C:T', '4:174739258:A:G', '6:32604898:A:G', '8:75315146:C:G', '9:116935764:C:G')
}

# Rows of the third input matching those IDs; every kept column gets an `_f`
# suffix.
fullmeta <- fread(snakemake@input[[3]]) %>%
  filter(ID %in% topids) %>%
  select(
    ID_f = ID, POS_f = POS, CHR_f = CHR,
    BETA_f = BETA, SE_f = SE, pvalue_f = pvalue
  )
128
The same number of loci is obtained when using a larger radius (1.5Mb).
132
We note that we used a naive approach to identify independent loci. This should be interpreted cautiously.
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# Manhattan plot of this analysis, colouring the neighbourhood of the
# full-meta top variants (`fullmeta`) and of this analysis's own top variants.

# Order by p-value so that, for duplicated CHR/POS pairs, the row kept below
# is the one with the smallest p-value.
df= arrange(d, pvalue)

df= df[!duplicated(df[, c('CHR', 'POS')]), ]

# Distance-based clumping of variants with p < 5e-8: within a chromosome a new
# clump starts whenever the gap to the previous significant variant exceeds
# 250000 bp; one variant with the lowest p-value is kept per clump (ties are
# broken at random) and labelled with its nearestGene.
dg= df %>% arrange(CHR, POS) %>% filter(pvalue< 5*10**-8) %>% group_by(CHR) %>%
		mutate(d=POS-lag(POS, default=-Inf), clumpid=cumsum(d>250000)) %>%
		group_by(CHR, clumpid) %>%
		filter(rank(pvalue, ties.method = "random")==1) %>%
		mutate(GENE= nearestGene) %>%
		ungroup()

dg= group_by(dg, CHR, POS) %>% filter(row_number()== 1)


# Cumulative genomic position (BPcum) of every variant: each chromosome is
# offset by the summed maximum positions of the chromosomes before it.
  don <- df %>%
    group_by(CHR)      %>%
    summarise(chr_len= max(POS)) %>%
    mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(df, ., by= 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate( BPcum=POS+tot) %>%
	 ungroup()

# Tick position of each chromosome: midpoint of its BPcum range.
  axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
  names(axisdf)= c('CHR', 'center')
# Genome-wide significance line on the -log10 scale.
HC= -log10(5*10**-8)
dg= dg %>% ungroup() %>% select(ID, GENE, CHR, POS, MAF, BETA)
# Only the top variants receive a GENE value; every other row gets NA.
don= left_join(don, select(dg, ID, GENE), by= 'ID')
names(dg)= c('ID', 'GENE', 'CHR', 'POS_new', 'MAF', 'BETA')
# Joining on chromosome alone yields one row per (variant, full-meta top
# variant on that chromosome) pair; the duplicates are collapsed below.
don= left_join(don, fullmeta, by= c('CHR'= 'CHR_f'))

# Half-width (bp) of the window coloured around each top variant.
lims= 250000

# Interim code 2: within `lims` of a full-meta top variant (NA when the
# chromosome has no full-meta top variant).
don$disc= ifelse((don$POS> don$POS_f - lims) & (don$POS < don$POS_f + lims), 2, 0)
# Keep one row per variant, preferring the highest code.
don= don[order(don$disc, decreasing= T, na.last= T), ]
don= group_by(don, ID) %>% filter(row_number() == 1)
don$disc= ifelse(is.na(don$disc), 0, don$disc)

# Same fan-out, now against this analysis's own top variants.
don= left_join(don, select(dg, CHR, POS_new), by= 'CHR')

# Interim code 1: within `lims` of one of this analysis's top variants and
# not already coded 2.
don$disc= ifelse(don$disc== 2, 2, ifelse((don$POS> (don$POS_new - lims)) & (don$POS < (don$POS_new + lims)), 1, 0))
don$disc= ifelse(is.na(don$disc), 0, don$disc)
don= don[order(don$disc, decreasing= T, na.last= T), ]
don= group_by(don, ID) %>% filter(row_number() == 1)

# Swap codes 1 and 2 so they line up with the factor labels below
# (1 = full-meta window, 2 = window of this analysis only).
don$disc= ifelse(is.na(don$disc), 0, ifelse(don$disc== 1, 2, ifelse(don$disc== 2, 1, 0)))
don= don[order(don$disc, decreasing= F, na.last= T), ]

don$disc= factor(don$disc, levels=c(0, 1, 2), labels=c('Not significant', 'Full meta discovery', 'New discovery'))

cols <- c('Not significant'= 'grey', 'Full meta discovery'= colorBlindBlack8[4], 'New discovery'= colorBlindBlack8[2])



ggplot(don) +
    geom_point(data= don, aes(x=BPcum, y= -log10(pvalue), colour= disc), size=0.3) +   # Show all points
theme_cowplot(font_size= 12) + #theme_minimal_hgrid(12, rel_small = -1) + 
#scale_alpha_manual(values= rep(c(1/10, 1/2), 23)) +
scale_colour_manual(values= cols) +
    scale_x_continuous(label = axisdf$CHR, breaks= axisdf$center, expand=c(0,0) ) + # custom X axis
scale_y_continuous(expand= c(0,0)) +
         xlab('Chromosome') +
    ylab('-log10(pvalue)') +
labs(colour= '') +
geom_hline(yintercept= 0, size= 0.5, colour= 'black') +
geom_hline(yintercept= HC, size= 0.5, linetype= 2, colour= '#878787') +
geom_text_repel(data= don, aes(x= BPcum, y= -log10(pvalue), label= GENE), size= 3, hjust= 1, force= 1, vjust= 1, colour= 'black') +
theme(legend.position= 'bottom') +
guides(colour = guide_legend(override.aes = list(size=3)))
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# QQ plot of observed vs expected -log10(p), drawn separately per MAF tertile.

# Split variants into MAF tertiles and take the largest MAF of the two lower
# tertiles as the legend boundaries. which() drops rows with missing tertile.
d <- mutate(d, maf_tertiles = ntile(MAF, 3))
m1 <- round(max(d$MAF[which(d$maf_tertiles == 1)]), 3)
m2 <- round(max(d$MAF[which(d$maf_tertiles == 2)]), 3)

# The middle tertile lies between the two boundaries, so its label reads
# 'm1 < MAF < m2'.
d$maf_tertiles <- factor(
  d$maf_tertiles,
  levels = c("1", "2", "3"),
  labels = c(paste('MAF<', m1), paste(m1, '< MAF <', m2), paste('MAF>', m2))
)

# Expected -log10(p) under the null: rank / group size within each tertile,
# with variants ranked by ascending p-value. Grouping is dropped afterwards.
df <- arrange(d, pvalue) %>%
  group_by(maf_tertiles) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue))) %>%
  ungroup()

# Only variants with p < 0.05 are drawn.
ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue), color = maf_tertiles)) +
  geom_point(size = 0.4) +
  scale_color_manual(values = colorBlindBlack8[2:4]) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour = "") +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))') +
  theme(legend.position = 'bottom') +
  guides(colour = guide_legend(override.aes = list(size = 3)))
243
244
245
246
247
248
# Keep variants with p < 1e-4; d1 is the genome-wide significant subset.
don <- don %>% filter(pvalue < 1 * 10^-4)
d1 <- don %>% filter(pvalue < 5 * 10^-8)

# From here on the p-values are formatted strings with 3 significant digits.
d1$pvalue <- format(d1$pvalue, digits = 3)
don$pvalue <- format(don$pvalue, digits = 3)

# Table of significant variants that carry a non-empty gene label.
d1 %>%
  filter(GENE != '') %>%
  select(ID, RSID, GENE, TOTALSAMPLESIZE, EAF, BETA, SE, pvalue) %>%
  kable(
    col.names = c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'Beta', 'SE', 'P-value'),
    digits = 3
  )
caption <- 'As expected, beta increases with decreasing minor allele frequency.'
258
259
260
261
262
263
264
265
266
267
268
# Pair each full-meta top variant with its row in the current analysis.
x <- inner_join(fullmeta, d, by = c('ID_f' = 'ID'))

# Effect size in the full meta-analysis vs the analysis without `cohort`.
# Point transparency is set on geom_point(): ggplot() itself ignores `alpha`.
# The size legend is hidden with guide = 'none'; geom_text_repel() has no
# `guide` argument, so none is passed to it.
print(
  ggplot(x, aes(BETA_f, BETA, size = abs(BETA))) +
    geom_point(colour = colorBlindBlack8[4], alpha = 0.7) +
    theme_cowplot(font_size = 12) +
    scale_size_continuous(name = 'Absolute Beta', guide = 'none') +
    geom_text_repel(data = x, aes(label = RSID)) +
    xlab('Effect size full meta-analysis') +
    ylab(paste('Effect size without', cohort)) +
    geom_abline(intercept = 0, slope = 1, linetype = 'dashed', colour = 'grey')
)
279
280
281
282
283
284
285
# -log10(p) in the full meta-analysis vs the analysis without `cohort`.
# Point transparency is set on geom_point(): ggplot() itself ignores `alpha`.
# geom_text_repel() has no `guide` argument, so none is passed to it.
print(
  ggplot(x, aes(-log10(pvalue_f), -log10(pvalue))) +
    geom_point(colour = colorBlindBlack8[4], alpha = 0.7) +
    theme_cowplot(font_size = 12) +
    geom_text_repel(data = x, aes(label = RSID)) +
    xlab('-log10(pvalue) full meta-analysis') +
    ylab(paste('-log10(pvalue) without', cohort)) +
    geom_abline(intercept = 0, slope = 1, linetype = 'dashed', colour = 'grey')
)
11
12
13
14
15
16
17
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['#CHROM', 'POS', 'REF', 'ALT', 'AF_EXCLUDING_1000G'])
	d.columns= ['CHR', 'POS', 'oa', 'ea', 'eaf']
	d= d.loc[((d.eaf> 0.05) & (d.eaf<0.95)), :]
	d['eaf']= np.where(d.oa> d.ea, 1 - d.eaf, d.eaf)
	d= d.sample(n= 1000000)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
27
28
script:
	'file_level_qc.Rmd'
38
39
	script:
                'file_level_qc.Rmd'
49
50
	script:
                'file_level_qc.Rmd'
60
61
	script:
                'file_level_qc.Rmd'
70
71
script:
	'all_files_QC.Rmd'
80
81
script:
        'all_files_QC.Rmd'
91
92
script:
	'all_files_QC.Rmd'
102
103
script:
        'all_files_QC.Rmd'
117
118
script:
	'meta_qc.Rmd'
129
130
131
132
133
134
135
136
137
138
139
140
141
run:
	df= pd.read_csv(input[0], sep= '\t', header= 0)
	df.sort_values('SNP.PP.H4', ascending= False, inplace= True)
	d= df.groupby('locus').head(1).reset_index()
	df['trait']= input[1].split('pph_')[1].replace('.txt', '')
	d['direction']= np.where((d['z.df1'] > 0) & (d['z.df2'] > 0), 'Positive', np.where((d['z.df1'] < 0) & (d['z.df2'] < 0), 'Negative', 'Opposite'))
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	x['trait']= input[1].split('pph_')[1].replace('.txt', '')
	x= pd.merge(x, d[['snp', 'locus', 'SNP.PP.H4', 'direction']])
	x= x.loc[(x['PP.H0.abf'] != 0) & (x['PP.H1.abf'] != 0) & (x['PP.H2.abf'] != 0) & (x['PP.H0.abf'] != 0) & (x['PP.H4.abf'] != 0), :]
	x.dropna(axis= 0, inplace= True)
	x.to_csv(output[0], sep= '\t', header= True, index= False)
	df.to_csv(output[1], sep= '\t', header= True, index= False)
151
152
script:
	'coloc.Rmd'
161
162
163
164
165
166
167
168
169
170
run:
	# Writes one variant ID per locus gene, without a header line.
	d= pd.read_csv(input[0], sep= '\t', header= 0, compression= 'gzip')
	df= pd.read_csv(input[1], sep= '\t', header= 0)
	# Recode chromosome 'X' as 23 so CHR can be cast to integer for the merge.
	df['CHR']= np.where(df.CHR== 'X', '23', df.CHR)
	df['CHR']= df.CHR.astype(str).astype(int)
	# Pair every variant with every locus on its chromosome, then keep the
	# pairs where the variant lies strictly inside (pos1, pos2).
	d= pd.merge(d, df, on= 'CHR')
	d= d.loc[((d.POS> d.pos1) & (d.POS < d.pos2)), :]
	# Ascending p-value, so the first row of each group is the most significant.
	d.sort_values('pvalue', ascending= True, inplace= True)
	# 'nearestGene_y' is the merge-suffixed column from the locus table;
	# first() takes the first non-null value per column within each group.
	d= d.groupby('nearestGene_y').first()
	d.to_csv(output[0], sep= '\t', header= False, index= False, columns= ['ID']) 
179
180
181
182
183
	shell:
                '''
                grep -f {input[0]} {input[1]} > {output[0]} || true
                touch {output[0]}
                '''
192
193
194
195
196
shell:
        '''
        grep -f {input[0]} {input[1]} > {output[0]} || true
        touch {output[0]}
        '''
205
206
207
208
209
shell:
        '''
        grep -f {input[0]} {input[1]} > {output[0]} || true
        touch {output[0]}
        '''
217
218
219
220
221
        shell:
                '''
		grep -f {input[0]} {input[1]} > {output[0]} || true
		touch {output[0]}
		'''
230
231
232
233
234
235
236
237
run:
	df_list= list()
	for infile in input:
		d= pd.read_csv(infile, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf'])
		d['cohort']= infile.split('temp/')[1].replace('_topvariants.txt', '')
		df_list.append(d)
	d= pd.concat(df_list)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
245
246
247
248
249
250
251
252
run:
        df_list= list()
        for infile in input:
                d= pd.read_csv(infile, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf'])
                d['cohort']= infile.split('temp/')[1].replace('_topvariants.txt', '')
                df_list.append(d)
        d= pd.concat(df_list)
        d.to_csv(output[0], sep= '\t', header= True, index= False)
260
261
262
263
264
265
266
267
run:
        df_list= list()
        for infile in input:
                d= pd.read_csv(infile, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf'])
                d['cohort']= infile.split('temp/')[1].replace('_topvariants.txt', '')
                df_list.append(d)
        d= pd.concat(df_list)
        d.to_csv(output[0], sep= '\t', header= True, index= False)
275
276
277
278
279
run:
        df_list= list()
        for infile in input:
                d= pd.read_csv(infile, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf'])
                d['cohort']= infile.split('temp/')[1].replace('_topvariants.txt', '')
292
293
script:
	'forest_plots.Rmd'
303
304
script:
	'other_meta.Rmd'
314
315
script:
        'other_meta.Rmd'
326
327
script:
        'other_meta.Rmd'
337
338
script:
        'nonadditive_qc.Rmd'
348
349
script:
        'file_level_qc.Rmd'
359
360
script:
        'file_level_qc.Rmd'
370
371
script:
        'file_level_qc.Rmd'
381
382
script:
        'file_level_qc.Rmd'
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
library(data.table)
library(dplyr)
library(metafor)

# Fixed-effect meta-analysis for one outcome, run separately for each
# haplotype ('MT', 'MNT', 'PT'). Reads the global table `d` and returns one
# row per haplotype with the pooled estimate, its 95% interval and the
# heterogeneity p-value.
funk <- function(pheno) {
  d_temp <- d[d$outcome == pheno, ]

  meta_one_haplotype <- function(i) {
    df_temp <- d_temp[d_temp$haplotype == i, ]
    print(nrow(d_temp))
    res.FE <- rma(yi = beta, sei = se, data = df_temp, method = "FE")

    df <- data.frame(
      beta = res.FE$beta,
      se = res.FE$se,
      pvalue = res.FE$pval,
      lo95 = res.FE$ci.lb,
      up95 = res.FE$ci.ub,
      het_pvalue = res.FE$QEp,
      outcome = pheno,
      haplotype = i
    )
    print(df)
    df
  }

  do.call('rbind', lapply(c('MT', 'MNT', 'PT'), meta_one_haplotype))
}


# Per-cohort estimates, stacked into one table.
moba <- fread(snakemake@input[[1]])
decode <- fread(snakemake@input[[2]])
hunt <- fread(snakemake@input[[3]])

d <- rbind(rbind(moba, decode), hunt)

# One meta-analysis per outcome, combined into a single table.
x <- do.call('rbind', lapply(unique(d$outcome), funk))

# Summed `n` across cohorts for each haplotype / outcome pair.
df <- d %>%
  group_by(haplotype, outcome) %>%
  summarize(n = sum(n))

x <- inner_join(x, df, by = c('haplotype', 'outcome'))

fwrite(x, snakemake@output[[1]], sep = '\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
library(data.table)
library(dplyr)
library(metafor)

# Fixed-effect meta-analysis for one exposure, run separately for each
# haplotype ('MT', 'MNT', 'PT'). Reads the global table `d` and returns one
# row per haplotype with the pooled estimate, its 95% interval and the
# heterogeneity p-value.
funk <- function(pheno) {
  d_temp <- d[d$exposure == pheno, ]

  meta_one_haplotype <- function(i) {
    df_temp <- d_temp[d_temp$haplotype == i, ]
    print(nrow(d_temp))
    res.FE <- rma(yi = beta, sei = se, data = df_temp, method = "FE")

    df <- data.frame(
      beta = res.FE$beta,
      se = res.FE$se,
      pvalue = res.FE$pval,
      lo95 = res.FE$ci.lb,
      up95 = res.FE$ci.ub,
      het_pvalue = res.FE$QEp,
      exposure = pheno,
      haplotype = i
    )
    print(df)
    df
  }

  do.call('rbind', lapply(c('MT', 'MNT', 'PT'), meta_one_haplotype))
}


# Per-cohort estimates, stacked into one table.
moba <- fread(snakemake@input[[1]])
decode <- fread(snakemake@input[[2]])
hunt <- fread(snakemake@input[[3]])

d <- rbind(rbind(moba, decode), hunt)

# One meta-analysis per exposure, combined into a single table.
x <- do.call('rbind', lapply(unique(d$exposure), funk))

# Summed `n` across cohorts for each haplotype / exposure pair.
df <- d %>%
  group_by(haplotype, exposure) %>%
  summarize(n = sum(n))

x <- inner_join(x, df, by = c('haplotype', 'exposure'))

fwrite(x, snakemake@output[[1]], sep = '\t')
13
14
script:
	'PGS_repr_pheno_meta.R'
26
27
script:
        'PGS_fetal_growth_meta.R'
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
library(data.table)
library(dplyr)
library(DESeq2)
library(tidyverse)

# Differential expression between conditions with DESeq2, from per-sample
# quantification files found in the directory given as the first parameter.

df_list <- list()

flist <- list.files(snakemake@params[[1]], 'CL', full.names = TRUE)

# Fail with a clear message when nothing matches, instead of looping over
# c(1, 0) and reading a missing path.
if (length(flist) == 0) {
  stop("No files matching 'CL' found in ", snakemake@params[[1]], call. = FALSE)
}

# One two-column table (Name, <sample>) per file.
for (i in seq_along(flist)) {
  d <- fread(flist[i])
  # NOTE(review): the sample name is the 10th '/'-separated path component,
  # which ties this script to a fixed directory depth - confirm.
  cname <- unlist(strsplit(flist[i], '/'))[10]
  d <- select(d, Name, NumReads)

  names(d) <- c('Name', cname)
  df_list[[i]] <- d
}

# Wide table: one row per Name, one NumReads column per sample.
x <- df_list %>% reduce(left_join, by = "Name")

# NOTE(review): columns 2:7 assume exactly six samples - confirm.
cols <- data.frame(row.names = colnames(x)[2:7], condition = colnames(x)[2:7], subject = colnames(x)[2:7])

# Sample names are split on '-': the first part is the subject, the last part
# (with '.txt' removed) is the condition.
cols$condition <- gsub('.txt', '', sapply(strsplit(cols$condition, '-'), tail, 1))
cols$subject <- sapply(strsplit(cols$subject, '-'), head, 1)
cts <- as.matrix(x[, 2:7])
row.names(cts) <- x$Name

# Counts are rounded to integers; the design adjusts for subject.
dds <- DESeqDataSetFromMatrix(countData = round(cts),
                              colData = cols,
                              design = ~ subject + condition)

dds <- DESeq(dds)

res <- results(dds, name = "condition_unt_vs_dec")

res <- data.frame(res)
res$geneid <- row.names(res)

fwrite(res, snakemake@output[[1]], sep = '\t')
16
17
script:
	'rna_seq_dif.R'
28
29
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID'])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Reduce sumstats paths to bare trait names: keep what follows `marker` in
# each path and remove the '.txt.sumstats.gz' suffix.
trait_from_path <- function(paths, marker) {
  gsub('.txt.sumstats.gz', '', apply(paths, 1, function(p) unlist(strsplit(p, marker))[2]))
}

x <- fread(snakemake@input[[1]])
x$p1 <- trait_from_path(x[, 'p1'], 'LDscore/')
x$p2 <- trait_from_path(x[, 'p2'], 'LDSC/')

x1 <- fread(snakemake@input[[2]])
x1$p1 <- trait_from_path(x1[, 'p1'], 'LDscore/')
x1$p2 <- trait_from_path(x1[, 'p2'], 'LDSC/')

d <- rbind(x, x1)

# Display label for each trait code; any code not listed here is labelled
# 'Endometriosis', and a missing code stays missing.
trait_labels <- c(
  GAraw = 'Maternal gestational duration',
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal BW',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal only',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal only',
  PCOS = 'Polycistic ovary syndrome'
)
label_idx <- match(d$p2, names(trait_labels))
d$trait <- ifelse(
  is.na(d$p2),
  NA,
  ifelse(is.na(label_idx), 'Endometriosis', unname(trait_labels)[label_idx])
)

# Keep labels containing 'men' but not 'women'.
# NOTE(review): 'Age at menarche' and 'Age at menopause' also contain 'men' -
# confirm they are meant to pass this filter.
d <- filter(d, grepl('men', trait), !grepl('women', trait))

fwrite(d, snakemake@output[[1]], sep = '\t')
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
run:
	d= pd.read_csv(input[0], sep='\t', header= 0)
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	x['pheno']= 'Gestational duration'
	ptd= pd.read_csv(input[2], sep= '\t', header= 0)
	ptd['pheno']= 'Preterm delivery'
	postterm= pd.read_csv(input[3], sep= '\t', header= 0)
	postterm['pheno']= 'Post term delivery'
	gID= ['3:156697097:A:G', '5:158058432:G:T']
	d= d.loc[d.ID.isin(gID), :]
	d= pd.concat([x, d])
	d= pd.concat([d, ptd])
	d= pd.concat([d, postterm])
	d.sort_values('ID', inplace= True)
	d.to_csv(output[0], header= True, index= False, sep= '\t')
34
35
36
37
38
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	d= pd.merge(d[['ID', 'pheno']], x, on= 'ID', how= 'inner')
	d.to_csv(output[0], sep= '\t', header= True, index= False)
49
50
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
67
68
69
70
run:
        d= pd.read_csv(input[1], sep= '\t', header= 0)
        top= pd.read_csv(input[2], sep= '\t', header= 0, usecols= ['ID', 'nearestGene', 'RSID'])
        d= pd.merge(d, top, left_on= 'rsid', right_on= 'RSID')
82
83
84
85
shell:
	'''
	cp {input[0]} {output[0]}
	'''
 95
 96
 97
 98
 99
100
101
102
103
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	x['eqtl_data']= 'iPSC'
	d= pd.concat([d, x,])
	df= pd.read_csv(input[2], sep= '\t', header= None, names= ['chr', 'pos1', 'pos2', 'Gene_symbol', 'EID'], usecols= ['Gene_symbol', 'EID'])
	df['EID']= df['EID'].str.split('.').str[0]
	d= pd.merge(d, df, left_on= 'gene', right_on= 'EID')
	d.to_csv(output[0], sep= '\t', header= True, index= False)
113
114
script:
	'genetic_correlations_males.R'
SnakeMake From line 113 of tables/Snakefile
122
123
124
125
126
127
128
129
130
run:
	df_list= list()
	for i in input:
		d= pd.read_csv(i, sep= '\t', header= 0, usecols= ['CHR', 'N'])
		coh= i.split('filtered/')[1].replace('.txt', '')
		df_dict= pd.DataFrame({'cohort': coh, 'N': d.N.max()}, index= [0])
		df_list.append(df_dict)
	d= pd.concat(df_list)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
SnakeMake From line 122 of tables/Snakefile
138
139
140
141
142
run:
	d= pd.read_csv(input[0], sep= '\t', header= None, names= ['ID', 'beta', 'se', 'pvalue', 'trait'])
	d[['CHR', 'POS', 'REF', 'EFF']]= d.ID.str.split(':', expand= True)
	d['CHR']= np.where(d.CHR== '23', 'X', d.CHR)
	d.to_csv(output[0], sep= '\t', header= True, index= False)
SnakeMake From line 138 of tables/Snakefile
151
152
153
154
155
156
157
158
159
160
161
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	d= pd.concat([d, x])
	horm= ['CBAT_fem', 'SHBG_fem', 'Testosterone_fem', 'SHBG_fem_cluster', 'Testosterone_fem_cluster']
	df= d.loc[d.trait.isin(horm), :]
	ivw= df.loc[df.method== 'IVW', :]
	egger= df.loc[df.method== 'MR-Egger', :]
	egger_int= df.loc[np.array(df.index[df.method== 'MR-Egger' ] + 1), :]
	d= pd.concat([ivw, egger, egger_int])
	d.to_csv(output[0], sep= '\t', header= True, index= False)
SnakeMake From line 151 of tables/Snakefile
170
171
172
173
174
175
176
177
run:
	# Writes ID, HetISq and HetPVal for the variants listed in the second input.
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'HetISq', 'HetPVal'])
	# MarkerName is split on ':' into five parts.
	d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	# Order the two alleles alphabetically (swap where REF sorts after EFF).
	d['REF'], d['EFF']= np.where(d.REF> d.EFF, [d.EFF, d.REF], [d.REF, d.EFF])
	# Build the ID as CHR:POS:allele:allele with the alleles in alphabetical order.
	d['ID']= np.where(d.REF> d.EFF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF + ':' + d.REF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF + ':' + d.EFF)
	# Inner merge: only IDs present in the second input are kept.
	d= pd.merge(d, x, on= 'ID')
	d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['ID', 'HetISq', 'HetPVal'])
SnakeMake From line 170 of tables/Snakefile
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
run:
	# Defines 1.5 Mb windows around independent genome-wide significant variants.
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d['Allele1']= d['Allele1'].str.upper()
	d['Allele2']= d['Allele2'].str.upper()
	# Keep variants with more than half of the largest total sample size.
	d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
	# MarkerName is split on ':' into five parts.
	d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
	# Positional rename: Allele1 becomes EFF and Allele2 becomes REF.
	d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
	df= d.loc[d.pvalue< 5*10**-8, :]
	# Ascending p-value, so the first row kept per CHR/POS is the most significant.
	df.sort_values(by= 'pvalue', ascending= True, inplace= True)
	df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
	df_list= list()
	# Greedy clumping per chromosome: walk the variants from most to least
	# significant; a variant still present in d_temp becomes a lead variant,
	# and everything within 1.5 Mb of it (itself included) is removed.
	for chrom in set(df.CHR):
		d_temp= df.loc[df.CHR== chrom, :]
		positions= d_temp.POS.values
		for pos in positions:
			if pos in d_temp.POS.values:
				df_list.append(d_temp.loc[d_temp.POS== pos, :])
				d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
		# This else belongs to the inner for loop (for/else) and runs once it finishes.
		else:
			continue
	x= pd.concat(df_list)
	# Window bounds around each lead variant.
	x['pos1']= x.POS - 1.5*10**6
	x['pos2']= x.POS + 1.5*10**6
	x['CHR']= x.CHR.astype(str)
	# Chromosome 23 is written as 'X'.
	x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
	x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2'])
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
run:
        # Defines 1.5 Mb windows around independent genome-wide significant
        # variants and keeps each lead variant's nearestGene.
        d= pd.read_csv(input[0], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene'])
        df= d.loc[d.pvalue< 5*10**-8, :]
        # Ascending p-value, so the first row kept per CHR/POS is the most significant.
        df.sort_values(by= 'pvalue', ascending= True, inplace= True)
        df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
        df_list= list()
        # Greedy clumping per chromosome: walk the variants from most to least
        # significant; a variant still present in d_temp becomes a lead variant,
        # and everything within 1.5 Mb of it (itself included) is removed.
        for chrom in set(df.CHR):
                d_temp= df.loc[df.CHR== chrom, :]
                positions= d_temp.POS.values
                for pos in positions:
                        if pos in d_temp.POS.values:
                                df_list.append(d_temp.loc[d_temp.POS== pos, :])
                                d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
                        else:
                                continue
        x= pd.concat(df_list)
        # Window bounds around each lead variant.
        x['pos1']= x.POS - 1.5*10**6
        x['pos2']= x.POS + 1.5*10**6
        x['CHR']= x.CHR.astype(str)
        # Chromosome 23 is written as 'X'.
        x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
        x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene'])
72
73
74
75
76
77
78
79
80
81
82
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'ID', 'BETA', 'SE', 'pvalue'])
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	x['CHR']= np.where(x.CHR== 'X', '23', x.CHR)
	x['CHR']= x.CHR.apply(int)
	d= pd.merge(d, x, on= 'CHR')
	d= d.loc[((d.POS>= d.pos1) & (d.POS <= d.pos2)), ]
	d.sort_values('pvalue', ascending= True, inplace= True)
	d= d.groupby('nearestGene').head(1)
	d= d[['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'nearestGene', 'ID', 'BETA', 'SE', 'pvalue']]
	d.to_csv(output[0], sep= '\t', header= True, index= False)
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
run:
	# Same clumping applied to the first two inputs; result i goes to output[i].
	for i in range(2):
		d= pd.read_csv(input[i], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene'])
		df= d.loc[d.pvalue< 5*10**-8, :]
		# Ascending p-value, so the first row kept per CHR/POS is the most significant.
		df.sort_values(by= 'pvalue', ascending= True, inplace= True)
		df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
		df_list= list()
		# Greedy clumping per chromosome: walk the variants from most to least
		# significant; a variant still present in d_temp becomes a lead variant,
		# and everything within 1.5 Mb of it (itself included) is removed.
		for chrom in set(df.CHR):
			d_temp= df.loc[df.CHR== chrom, :]
			positions= d_temp.POS.values
			for pos in positions:
				if pos in d_temp.POS.values:
					df_list.append(d_temp.loc[d_temp.POS== pos, :])
					d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
				else:
					continue
		x= pd.concat(df_list)
		# Window bounds around each lead variant.
		x['pos1']= x.POS - 1.5*10**6
		x['pos2']= x.POS + 1.5*10**6
		x['CHR']= x.CHR.astype(str)
		# Chromosome 23 is written as 'X'.
		x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
		x.to_csv(output[i], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene'])
126
127
128
129
130
131
132
133
134
135
136
137
run:
	for i in range(2):
		d= pd.read_csv(input[i], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'ID', 'pvalue'])
		x= pd.read_csv(input[i+2], sep= '\t', header= 0)
		x['CHR']= np.where(x.CHR== 'X', '23', x.CHR)
		x['CHR']= x.CHR.apply(int)
		d= pd.merge(d, x, on= 'CHR')
		d= d.loc[((d.POS>= d.pos1) & (d.POS <= d.pos2)), ]
		d.sort_values('pvalue', ascending= True, inplace= True)
		d= d.groupby('nearestGene').head(1)
		d= d[['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'nearestGene', 'ID', 'pvalue']]
		d.to_csv(output[i], sep= '\t', header= True, index= False)
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
run:
        # Defines 1.5 Mb windows around independent genome-wide significant
        # variants and keeps each lead variant's position.
        d= pd.read_csv(input[0], sep= '\t',usecols= ['CHR', 'POS', 'pvalue'])
        df= d.loc[d.pvalue< 5*10**-8, :]
        # Ascending p-value, so the first row kept per CHR/POS is the most significant.
        df.sort_values(by= 'pvalue', ascending= True, inplace= True)
        df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
        df_list= list()
        # Greedy clumping per chromosome: walk the variants from most to least
        # significant; a variant still present in d_temp becomes a lead variant,
        # and everything within 1.5 Mb of it (itself included) is removed.
        for chrom in set(df.CHR):
                d_temp= df.loc[df.CHR== chrom, :]
                positions= d_temp.POS.values
                for pos in positions:
                        if pos in d_temp.POS.values:
                                df_list.append(d_temp.loc[d_temp.POS== pos, :])
                                d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
                        else:
                                continue
        x= pd.concat(df_list)
        # Window bounds around each lead variant.
        x['pos1']= x.POS - 1.5*10**6
        x['pos2']= x.POS + 1.5*10**6
        x['CHR']= x.CHR.astype(str)
        # Chromosome 23 is written as 'X'.
        x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
        x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'POS', 'pos1', 'pos2'])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import pandas as pd
import numpy as np

# Build a two-column lookup table (ID, name) where ID is
# chrom:pos:allele1:allele2 with the two alleles in alphabetical order.
# Rows come first from snakemake.input[0] (gzip-compressed, tab-separated) and
# are then complemented with rows from snakemake.input[1] (HRC) whose ID is
# not already present.
d= pd.read_csv(snakemake.input[0], sep= '\t', header=0, compression= 'gzip')
# Drop rows whose chromosome name contains an underscore.
d= d.loc[~d['#chrom'].str.contains('_'), :]
# First and second entries of the comma-separated `alts` field.
d['a1']= d.alts.str.split(',').str[0]
d['a2']= d.alts.str.split(',').str[1]
d['#chrom']= d['#chrom'].str.replace('chr', '')
# Position is chromStart when ref is shorter than the alts string, else chromEnd.
# NOTE(review): alts.str.len() is the length of the whole comma-separated
# string (separators included), not of a single allele - confirm this is intended.
d['POS']= np.where(d.ref.str.len() < d.alts.str.len(), d.chromStart, d.chromEnd)
# Recode length-differing alleles: ref becomes 'I' when shorter than alts,
# 'D' when longer. The second test runs on the already recoded ref column.
d['ref']= np.where(d.ref.str.len()< d.alts.str.len(), 'I', d.ref)
d['ref']= np.where(d.ref.str.len() > d.alts.str.len(), 'D', d.ref)
# The first alternate allele gets the opposite code of ref.
d['a1']= np.where(d.ref== 'I', 'D', d.a1)
d['a1']= np.where(d.ref== 'D', 'I', d.a1)
# df holds the rows used to build IDs from the second alternate allele;
# rows whose second entry is the empty string are dropped.
df= d.copy()
df= df.loc[df.a2!= '', :]
# Swap so that ref <= a1 alphabetically.
d.loc[d.ref > d.a1, ['ref', 'a1']] = d.loc[d.ref > d.a1, ['a1', 'ref']].values

d['ID']= d['#chrom'] + ':' + d['POS'].astype(int).astype(str) + ':' + d.ref + ':' + d.a1
# Same ordering and ID construction, using the second alternate allele.
df.loc[df.ref > df.a2, ['ref', 'a2']] = df.loc[df.ref > df.a2, ['a2', 'ref']].values
df['ID']= df['#chrom'] + ':' + df['POS'].astype(int).astype(str) + ':' + df.ref + ':' + df.a2
df= df[['ID', 'name']]
d= d[['ID', 'name']]
d= pd.concat([d, df])

# Read RSIDs from HRC
x= pd.read_csv(snakemake.input[1], sep= '\t', header=0, usecols= ['#CHROM', 'POS', 'ID', 'REF', 'ALT'])
x.columns= ['CHROM', 'POS', 'name', 'REF', 'ALT']
# Rows whose identifier is '.' are dropped.
x= x.loc[x.name!= '.', :]

# NOTE(review): chromosome X is recoded to '23' here, while the '#chrom'
# values above only have the 'chr' prefix stripped - confirm both sources are
# meant to use the same chromosome coding in ID.
x['CHROM']= np.where(x.CHROM== 'X', '23', x.CHROM)
x['CHROM']= x.CHROM.apply(str)

# Alphabetical allele order, as above.
x.loc[x.REF > x.ALT, ['REF', 'ALT']] = x.loc[x.REF > x.ALT, ['ALT', 'REF']].values
x['ID']= x['CHROM'] + ':' + x['POS'].astype(int).astype(str) + ':' + x.REF + ':' + x.ALT
x= x[['ID', 'name']]
# Keep only HRC variants that the first source did not provide.
x= x.loc[~x.ID.isin(d.ID), :]

d= pd.concat([d, x])

d.to_csv(snakemake.output[0], sep= '\t', header= True, index= False)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import numpy as np
import re

#d= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)

#d['Allele1']= d['Allele1'].str.upper()
#d['Allele2']= d['Allele2'].str.upper()
#d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
#d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
#d['CHR']= d['CHR'].astype(str).astype(int)
#d['POS']= d['POS'].astype(str).astype(int)
#d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
#d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
#d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
#d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)

#d['CHR']= d['CHR'].astype(str).astype(int)
#d['POS']= d['POS'].astype(str).astype(int)

#d['pvalue']= d['pvalue'].astype(str).astype(float)

#d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
#d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF

#d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]

# Keys pulled out of the VEP 'Extra' column (a ';'-separated list of KEY=VALUE).
col_list= ['IMPACT', 'DISTANCE', 'SYMBOL', 'SYMBOL_SOURCE', 'BIOTYPE']
df_list= list()

# Split 'Extra' on ';' only when it is followed by a word character.
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
extra_sep= re.compile(r';(?=\w)')

def parse_extra(extra):
	"""Return the KEY=VALUE pairs of one VEP 'Extra' string as a dict.

	Fields without '=' are ignored; values may themselves contain '='.
	"""
	return dict(field.split('=', 1) for field in extra_sep.split(extra) if '=' in field)

# The VEP output (snakemake.input[1]) is read in chunks of 100000 rows; each
# chunk is reduced to one annotation row per variant ID before being stored.
for vep in pd.read_csv(snakemake.input[1], sep= '\t', header= None, names= ['Variation', 'Location', 'Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'Extra'], comment= '#', chunksize= 100000):
	# Parse 'Extra' once per row, then look each key up; a missing key gives ''.
	# (A plain substring test on the raw string would raise KeyError whenever
	# the key name occurs in the text without being an actual key.)
	extra= vep['Extra'].apply(parse_extra)
	for i in col_list:
		vep[i]= extra.apply(lambda fields: fields.get(i, ''))
	# .copy() so the column assignments below act on an independent frame.
	vep= vep[['Variation', 'Location', 'Existing_variation', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']].copy()
	vep.columns= ['ID', 'Location', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']
	# Ranking used for de-duplication: protein_coding (0) first, then other
	# biotypes (1), then anything containing 'pseudo' (2).
	vep['BIOTYPE1']= np.where(vep.BIOTYPE== 'protein_coding', 0, np.where(vep.BIOTYPE.str.contains('pseudo'), 2, 1))
	vep['DISTANCE']= np.where(vep.DISTANCE== '', 0, vep.DISTANCE)
	# Variation is formatted as <chr>_<pos>_<allele>/<allele>.
	vep[['chr', 'pos', 'All']]= vep.ID.str.split('_', expand= True)
	vep[['EFF', 'REF']]= vep.All.str.split('/', expand= True)
	# Alphabetical allele order, so the ID matches the summary-statistics ID.
	vep.loc[vep.REF > vep.EFF, ['REF', 'EFF']] = vep.loc[vep.REF > vep.EFF, ['EFF', 'REF']].values
	vep[['CHR', 'POS']]= vep['Location'].str.split(':', expand= True)
	vep['CHR']= np.where(vep['CHR']== 'X', '23', vep['CHR'])
	vep['ID']= vep.CHR.astype(int).astype(str) + ':' + vep.POS.astype(int).astype(str) + ':' + vep.REF + ':' + vep.EFF
	vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE', 'BIOTYPE1']]
	# Keep the best-ranked annotation per variant within this chunk.
	vep= vep.sort_values(by= ['BIOTYPE1'], ascending= True)
	vep= vep.drop_duplicates(subset= ['ID'], keep= 'first')
	df_list.append(vep)

vep= pd.concat(df_list)

# A variant can span two chunks, so de-duplicate again across all chunks.
vep.sort_values(by= ['BIOTYPE1'], ascending= True, inplace= True)
vep.drop_duplicates(subset= ['ID'], keep= 'first', inplace= True)
vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE']]


# Format the meta-analysis summary statistics (snakemake.input[0]), align
# every variant to alphabetical allele order, and attach the VEP annotation.
gwas= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)
for allele_col in ['Allele1', 'Allele2']:
	gwas[allele_col]= gwas[allele_col].str.upper()
# Keep variants with more than half of the maximum total sample size.
half_max_n= (gwas['TOTALSAMPLESIZE'].max())/ 2
gwas= gwas.loc[(gwas.TOTALSAMPLESIZE> half_max_n), :]
# MarkerName is CHR:POS:REF:EFF:SNP; only CHR and POS survive the selection below.
gwas[['CHR', 'POS', 'REF','EFF', 'SNP']]= gwas['MarkerName'].str.split(':', expand= True)
for coord_col in ['CHR', 'POS']:
	gwas[coord_col]= gwas[coord_col].astype(str).astype(int)
gwas= gwas[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
gwas= gwas.rename(columns= {'Allele1': 'EFF', 'Allele2': 'REF', 'Freq1': 'EAF', 'Effect': 'BETA', 'StdErr': 'SE', 'P-value': 'pvalue'})
# Variants whose alleles are not in alphabetical order get their effect sign
# and allele frequency flipped before the allele labels are swapped.
needs_flip= gwas.REF > gwas.EFF
gwas['BETA']=np.where(needs_flip, -1* gwas.BETA, gwas.BETA)
gwas['EAF']= np.where(needs_flip, 1 - gwas.EAF, gwas.EAF)
gwas['pvalue']= gwas['pvalue'].astype(str).astype(float)
gwas.loc[needs_flip, ['REF', 'EFF']] = gwas.loc[needs_flip, ['EFF', 'REF']].values
gwas['ID']= gwas.CHR.astype(int).astype(str) + ':' + gwas.POS.astype(int).astype(str) + ':' + gwas.REF + ':' + gwas.EFF
# Drop p-values that are exactly 0 or 1 (or outside that range).
gwas= gwas.loc[((gwas.pvalue>0) & (gwas.pvalue <1)), :]
annotated= pd.merge(gwas, vep, on= ['ID'], how= 'left')
annotated.to_csv(snakemake.output[0], header=True, index= False, sep= '\t')
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
run:
	# Write the VEP input file: every SNP of the summary statistics (input[0])
	# that falls inside one of the locus windows listed in input[1], as
	# CHR, POS, POS, Allele1/Allele2, '+' without a header.
	gwas= pd.read_csv(input[0], sep= '\t', header= 0)
	loci= pd.read_csv(input[1], sep= '\t', header= 0)
	loci['CHR']= np.where(loci['CHR']== '23', 'X', loci['CHR'])
	gwas[['CHR', 'POS', 'REF', 'EFF', 'SNP']]= gwas['MarkerName'].str.split(':', expand= True)
	gwas= gwas.loc[gwas.SNP== 'SNP', :]
	# POS2 keeps the position exactly as it appears in MarkerName (a string).
	gwas['POS2']= gwas['POS']
	gwas['CHR']= np.where(gwas['CHR']== '23', 'X', gwas['CHR'])
	gwas['POS']= gwas['POS'].astype(str).astype(int)
	# One slice per locus; overlapping loci contribute a variant more than once.
	per_locus= [
		gwas.loc[(gwas.CHR== locus['CHR']) & (gwas.POS >= int(locus['pos1'])) & (gwas.POS <= int(locus['pos2'])), :]
		for _, locus in loci.iterrows()
	]
	selected= pd.concat(per_locus)
	selected['Allele']= selected['Allele1'].str.upper() + '/' + selected['Allele2'].str.upper()
	selected['STRAND']= '+'
	selected.sort_values(by= ['CHR', 'POS'], inplace= True)
	selected.to_csv(output[0], sep= '\t', header= False, index= False, columns= ['CHR', 'POS', 'POS2', 'Allele', 'STRAND'])
SnakeMake From line 11 of VEP/Snakefile
37
38
shell:
	# Annotate the variants in input[0] with Ensembl VEP and write output[0],
	# overwriting any existing file (--force_overwrite). Runs against a local
	# cache (--cache --offline). --check_existing, --symbol and --biotype
	# presumably supply the Existing_variation, SYMBOL and BIOTYPE fields that
	# format_VEP.py reads - confirm against the VEP documentation.
	# NOTE(review): the VEP executable path is hard-coded to one user's home directory.
	'/home/pol/software/ensembl-vep/vep -i {input[0]} --check_existing --symbol --biotype --cache -O {output[0]} --offline --force_overwrite'
48
49
script:
	'format_VEP.py'
SnakeMake From line 48 of VEP/Snakefile
58
59
60
61
run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        x= pd.read_csv(input[1], sep= '\t', header= 0)
        d= d.loc[~d.geneSymbol.isin(x.name2), :]
SnakeMake From line 58 of VEP/Snakefile
88
89
90
91
92
93
94
95
96
97
98
99
run:
	# Convert the MarkerName column of input[0] into a headerless 4-column BED
	# file (CHR, start, end, MarkerName) sorted by CHR and start.
	markers= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1'])
	# MarkerName starts with CHR:POS; POS becomes the BED end coordinate.
	fields= markers.MarkerName.str.split(':')
	markers['CHR']= fields.str[0].astype('str').astype('int')
	markers['end']= fields.str[1].astype('str').astype('int')
	markers['start']= markers.end - 1 
	# Strip the variant-type suffix from the marker name.
	for suffix in [':SNP', ':INDEL']:
		markers['MarkerName']= markers.MarkerName.str.replace(suffix, '')
	markers.sort_values(by= ['CHR', 'start'], inplace= True)
	bed= markers[['CHR', 'start', 'end', 'MarkerName']]
	bed.to_csv(output[0], sep= '\t', header= False, index= False)
SnakeMake From line 88 of VEP/Snakefile
108
109
shell:
	# For each interval in input[0] (-a), report the closest interval in
	# input[1] (-b); '-t all' presumably keeps every feature tied for closest -
	# confirm against the bedtools closest documentation.
	'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'
118
119
script:
	'format_dbSNP.py'
SnakeMake From line 118 of VEP/Snakefile
129
130
131
132
133
134
135
136
137
138
139
140
run:
	# Complete the annotated summary statistics (input[0]):
	#   1. fill missing RSIDs from the ID -> name lookup table (input[1]);
	#   2. add the nearest gene of each variant from the bedtools output (input[2]).
	# The result is written gzip-compressed to output[0].
	d= pd.read_csv(input[0], sep= '\t', header=0)
	rs= pd.read_csv(input[1], sep= '\t', header=0)
	d= pd.merge(d, rs, on= 'ID', how= 'left')
	# An RSID counts as missing when it is NaN, empty or '-'.
	missing_rsid= pd.isnull(d['RSID']) | d['RSID'].isin(['', '-'])
	d['RSID']= np.where(missing_rsid, d['name'], d['RSID'])
	# `axis` must be given by keyword: positional use was removed in pandas 2.0.
	d.drop(columns= 'name', inplace= True)
	ne= pd.read_csv(input[2], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene', 'Ensembl_gene'])
	ne= ne[['ID', 'nearestGene']]
	d= pd.merge(d, ne, on= 'ID', how= 'left')
	d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip')
SnakeMake From line 129 of VEP/Snakefile
148
149
150
151
152
153
154
run:
        # Write the variants of input[0] as a headerless 4-column BED file
        # (CHR, start, end, ID) sorted by CHR and start, with end = POS and
        # start = POS - 1.
        variants= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['ID', 'CHR', 'POS'])
        bed= variants.assign(end= variants.POS, start= variants.POS - 1)
        bed= bed.sort_values(by= ['CHR', 'start'])
        bed[['CHR', 'start', 'end', 'ID']].to_csv(output[0], sep= '\t', header= False, index= False)
SnakeMake From line 148 of VEP/Snakefile
163
164
shell:
        # For each interval in input[0] (-a), report the closest intervals in
        # input[1] (-b); '-k 2' presumably asks for the two closest features and
        # '-t all' keeps ties - confirm against the bedtools closest documentation.
        'bedtools closest -t all -k 2 -a {input[0]} -b {input[1]} > {output[0]}'
173
174
175
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['ID', 'nearestGene'])
	ne= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene2', 'Ensembl_gene'])
SnakeMake From line 173 of VEP/Snakefile
188
189
190
run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'P-value'])
	d= d.loc[d['P-value']< 5e-5, :]
SnakeMake From line 188 of VEP/Snakefile
207
208
shell:
        # Annotate the variants in input[0] with Ensembl VEP and write output[0],
        # overwriting any existing file (--force_overwrite). Runs against a local
        # cache (--cache --offline). --check_existing, --symbol and --biotype
        # presumably supply the Existing_variation, SYMBOL and BIOTYPE fields that
        # format_VEP.py reads - confirm against the VEP documentation.
        # NOTE(review): the VEP executable path is hard-coded to one user's home directory.
        '/home/pol/software/ensembl-vep/vep -i {input[0]} --check_existing --symbol --biotype --cache -O {output[0]} --offline --force_overwrite'
217
218
script:
        'format_VEP.py'
SnakeMake From line 217 of VEP/Snakefile
ShowHide 387 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/PerinatalLab/metaGWAS
Name: metagwas
Version: v1.0.0
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...