BioWorkflows: Snakemake workflow: Meta-analysis of GWAS of gestational duration, preterm and post-term deliveries (EGG Consortium)

library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# ---- Output files -----------------------------------------------------------
# Both files are created here holding only their header line; funk() appends
# the per-locus rows afterwards.
pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

cat(
  'nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tlocus\n',
  file = snakemake@output[[1]]
)

cat(
  'snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tlocus\n',
  file = snakemake@output[[2]]
)

# ---- Priors passed to coloc.abf() as p1, p2 and p12 -------------------------
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# ---- First input: summary statistics, upper-case column names ---------------
d <- fread(
  snakemake@input[[1]],
  select = c('ID', 'CHR', 'POS', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue', 'EAF')
)
# Fold the effect allele frequency onto the minor allele.
d$MAF <- with(d, ifelse(EAF > 0.5, 1 - EAF, EAF))

# ---- Second input: summary statistics, renamed to lower case ----------------
x <- fread(
  snakemake@input[[2]],
  select = c('ID', 'N', 'BETA', 'SE', 'pvalue', 'EAF')
)
x$MAF <- with(x, ifelse(EAF > 0.5, 1 - EAF, EAF))
setnames(x, c('ID', 'N', 'beta', 'se', 'p', 'eaf', 'maf'))

# Keep only variants present in both data sets.
d <- inner_join(d, x, by = 'ID')

# When the second data set carries no allele frequency at all, reuse the MAF
# of the first one.
if (all(is.na(d$eaf))) {
  d$maf <- d$MAF
}

# ---- Third input: regions to test; `chr` is stripped of its 'chr' prefix ----
z <- fread(snakemake@input[[3]])
z$CHR <- as.numeric(gsub('chr', '', z$chr))
z$locus <- 1:nrow(z)


# Run a colocalization analysis (coloc.abf) for one region of `z`.
#
# Args:
#   i: row number in the regions table `z`; the columns CHR, start and stop of
#      that row delimit the variants taken from `d`. Also used to build the
#      locus label 'locus_<i>'.
#
# Side effects: appends one row to `pph_outfile` and one or more rows to
# `results_outfile`. When the region holds no variants, or coloc.abf() fails,
# an all-zero placeholder row (snp = 'none') is appended to BOTH files so that
# every locus is represented in each output.
#
# Uses the globals d, z, prior1, prior2, prior12, pph_outfile, results_outfile
# and snakemake. Returns NULL invisibly.
funk <- function(i) {
  append_rows <- function(tbl, path) {
    fwrite(tbl, path, sep = '\t', row.names = FALSE, col.names = FALSE,
           quote = FALSE, append = TRUE)
  }

  # Placeholder rows for a locus without a usable coloc result.
  write_placeholder <- function() {
    append_rows(
      data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                 PP.H3.abf = 0, PP.H4.abf = 0, locus = locus),
      pph_outfile
    )
    append_rows(
      data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                 V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                 internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus),
      results_outfile
    )
    print('next')
    invisible(NULL)
  }

  region <- z[i, ]
  locus <- paste0('locus_', i)
  region_chr <- as.integer(region[['CHR']])
  region_start <- as.integer(region[['start']])
  region_stop <- as.integer(region[['stop']])
  temp_df <- filter(d, CHR == region_chr, POS >= region_start, POS <= region_stop)

  if (nrow(temp_df) == 0) {
    return(write_placeholder())
  }

  # coloc needs strictly positive variances.
  temp_df <- filter(temp_df, SE > 0, se > 0)

  gwas_file <- snakemake@input[[1]]
  trait_file <- snakemake@input[[2]]

  # Data set 1: quantitative unless the file name marks a case-control
  # phenotype, in which case `s` is the proportion of cases.
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2,
                N = temp_df$TOTALSAMPLESIZE, type = 'quant',
                snp = temp_df$ID, MAF = temp_df$MAF)
  if (grepl('allPTD', gwas_file)) {
    data1$type <- 'cc'
    data1$s <- 0.067
  } else if (grepl('postTerm', gwas_file)) {
    data1$type <- 'cc'
    data1$s <- 0.122
  }

  # Data set 2: case-control for the listed phenotypes (s = cases / total),
  # quantitative otherwise. If several names match, the last one wins.
  data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                type = 'quant', snp = temp_df$ID, MAF = temp_df$maf)
  if (grepl('PCOS|miscarriage|POP|endometriosis|Preeclampsia|leiomyoma_uterus', trait_file)) {
    if (grepl('PCOS', trait_file)) {
      s_pheno <- (1184 + 670 + 157 + 658 + 984 + 485 + 462) /
        (1184 + 670 + 157 + 658 + 984 + 485 + 462 + 5799 + 1379 + 2807 + 6774 + 2963 + 407 + 96172)
    }
    if (grepl('miscarriage', trait_file)) s_pheno <- 49996 / (174109 + 49996)
    if (grepl('POP', trait_file)) s_pheno <- 7053 / (57407 + 7053)
    if (grepl('endometriosis', trait_file)) s_pheno <- 1496 / (192678 + 1496)
    if (grepl('Preeclampsia', trait_file)) s_pheno <- 4630 / (4630 + 373345)
    if (grepl('leiomyoma_uterus', trait_file)) s_pheno <- 14569 / (85792 + 14569)
    data2$type <- 'cc'
    data2$s <- s_pheno
  }

  # A coloc failure (e.g. no variant left after filtering) is reported and
  # turned into a placeholder row instead of aborting the worker.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) {
      message('coloc.abf failed for ', locus, ': ', conditionMessage(e))
      NULL
    }
  )
  if (is.null(myres)) {
    return(write_placeholder())
  }

  # First element: posterior summary; second element: per-SNP results.
  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  append_rows(PPH, pph_outfile)

  res <- myres[[2]]
  res$locus <- locus
  append_rows(res, results_outfile)
  invisible(NULL)
}

# Process every locus on 3 forked workers. seq_len() yields no iterations for
# an empty regions table (1:0 would call funk(1) and funk(0)). mclapply()
# returns a 'try-error' object for a worker that failed, so those are
# surfaced as a warning instead of being discarded.
locus_runs <- mclapply(seq_len(nrow(z)), funk, mc.cores = 3)
failed_loci <- which(vapply(locus_runs, inherits, logical(1), what = 'try-error'))
if (length(failed_loci) > 0) {
  warning('funk() failed for locus index: ', paste(failed_loci, collapse = ', '),
          call. = FALSE)
}

R dplyr data.table Quant coloc From line 1 of colocalization/coloc_BW_GA_GW.R

library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# ---- Output files -----------------------------------------------------------
# Header-only files are written first; funk() appends rows locus by locus.
pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

cat(
  'nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tlocus\n',
  file = snakemake@output[[1]]
)

cat(
  'snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tlocus\n',
  file = snakemake@output[[2]]
)

# ---- Priors passed to coloc.abf() as p1, p2 and p12 -------------------------
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# ---- First input: summary statistics, upper-case column names ---------------
d <- fread(
  snakemake@input[[1]],
  select = c('ID', 'CHR', 'POS', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue', 'EAF')
)
d$MAF <- with(d, ifelse(EAF > 0.5, 1 - EAF, EAF))

# ---- Second input: same layout, renamed to lower case before the join -------
x <- fread(
  snakemake@input[[2]],
  select = c('ID', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue', 'EAF')
)
x$MAF <- with(x, ifelse(EAF > 0.5, 1 - EAF, EAF))
setnames(x, c('ID', 'N', 'beta', 'se', 'p', 'eaf', 'maf'))

# Keep only variants present in both data sets.
d <- inner_join(d, x, by = 'ID')

# Fall back to the first data set's MAF when the second has no frequencies.
if (all(is.na(d$eaf))) {
  d$maf <- d$MAF
}

# ---- Third input: regions to test; `chr` is stripped of its 'chr' prefix ----
z <- fread(snakemake@input[[3]])
z$CHR <- as.numeric(gsub('chr', '', z$chr))
z$locus <- 1:nrow(z)


# Run a colocalization analysis (coloc.abf) for one region of `z`.
#
# Args:
#   i: row number in the regions table `z`; the columns CHR, start and stop of
#      that row delimit the variants taken from `d`. Also used to build the
#      locus label 'locus_<i>'.
#
# Data set 1 is treated as quantitative and data set 2 as case-control with a
# case proportion of 0.067.
#
# Side effects: appends one row to `pph_outfile` and one or more rows to
# `results_outfile`. When the region holds no variants, or coloc.abf() fails,
# an all-zero placeholder row (snp = 'none') is appended to BOTH files.
#
# Uses the globals d, z, prior1, prior2, prior12, pph_outfile and
# results_outfile. Returns NULL invisibly.
funk <- function(i) {
  append_rows <- function(tbl, path) {
    fwrite(tbl, path, sep = '\t', row.names = FALSE, col.names = FALSE,
           quote = FALSE, append = TRUE)
  }

  # Placeholder rows for a locus without a usable coloc result.
  write_placeholder <- function() {
    append_rows(
      data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
                 PP.H3.abf = 0, PP.H4.abf = 0, locus = locus),
      pph_outfile
    )
    append_rows(
      data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
                 V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
                 internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus),
      results_outfile
    )
    print('next')
    invisible(NULL)
  }

  region <- z[i, ]
  locus <- paste0('locus_', i)
  region_chr <- as.integer(region[['CHR']])
  region_start <- as.integer(region[['start']])
  region_stop <- as.integer(region[['stop']])
  temp_df <- filter(d, CHR == region_chr, POS >= region_start, POS <= region_stop)

  if (nrow(temp_df) == 0) {
    return(write_placeholder())
  }

  # coloc needs strictly positive variances.
  temp_df <- filter(temp_df, SE > 0, se > 0)

  s_pheno <- 0.067
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2,
                N = temp_df$TOTALSAMPLESIZE, type = 'quant',
                snp = temp_df$ID, MAF = temp_df$MAF)
  data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                type = 'cc', snp = temp_df$ID, s = s_pheno, MAF = temp_df$maf)

  # A coloc failure is reported and turned into a placeholder row instead of
  # aborting the worker.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) {
      message('coloc.abf failed for ', locus, ': ', conditionMessage(e))
      NULL
    }
  )
  if (is.null(myres)) {
    return(write_placeholder())
  }

  # First element: posterior summary; second element: per-SNP results.
  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  append_rows(PPH, pph_outfile)

  res <- myres[[2]]
  res$locus <- locus
  append_rows(res, results_outfile)
  invisible(NULL)
}

# Process every locus on 3 forked workers. seq_len() yields no iterations for
# an empty regions table (1:0 would call funk(1) and funk(0)). mclapply()
# returns a 'try-error' object for a worker that failed, so those are
# surfaced as a warning instead of being discarded.
locus_runs <- mclapply(seq_len(nrow(z)), funk, mc.cores = 3)
failed_loci <- which(vapply(locus_runs, inherits, logical(1), what = 'try-error'))
if (length(failed_loci) > 0) {
  warning('funk() failed for locus index: ', paste(failed_loci, collapse = ', '),
          call. = FALSE)
}

R dplyr data.table Quant coloc From line 1 of colocalization/coloc_GA_vs_PTD_GW.R

library(data.table)
library(dplyr)
library(coloc)

# ---- Priors passed to coloc.abf() as p1, p2 and p12 -------------------------
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# Fold an allele frequency onto the minor allele.
fold_maf <- function(freq) {
  ifelse(freq > 0.5, 1 - freq, freq)
}

# Read a regions table and turn CHR into an integer, with 'X' recoded as 23.
read_regions <- function(path) {
  regions <- fread(path)
  regions$CHR <- as.integer(ifelse(regions$CHR == 'X', '23', regions$CHR))
  regions
}

# ---- First input: summary statistics, upper-case column names ---------------
d <- select(
  fread(snakemake@input[[1]]),
  ID, CHR, POS, TOTALSAMPLESIZE, BETA, SE, pvalue, EAF
)
d$MAF <- fold_maf(d$EAF)

# ---- Second input -----------------------------------------------------------
x <- fread(snakemake@input[[2]])

# Reverse the effect sign where REF sorts after EFF.
flip_sign <- x$REF > x$EFF
x$BETA <- ifelse(flip_sign, -1 * x$BETA, x$BETA)

# Keep the needed columns and rename them to lower case in one step.
x <- select(x, ID, N = TOTALSAMPLESIZE, beta = BETA, se = SE, p = pvalue, eaf = EAF)
x$maf <- fold_maf(x$eaf)

# Keep only variants present in both data sets.
d <- inner_join(d, x, by = 'ID')

# Fall back to the first data set's MAF when the second has no frequencies.
if (all(is.na(d$eaf))) {
  d$maf <- d$MAF
}

# ---- Regions: all rows of input 3 plus the LRP5 / SCML4 rows of input 4 -----
z <- read_regions(snakemake@input[[3]])

z1 <- read_regions(snakemake@input[[4]])
z1 <- filter(z1, nearestGene %in% c('LRP5', 'SCML4'))

z <- rbind(z, z1)

# Per-locus results are collected here by the loop below.
pph_list <- list()
res_list <- list()

# All-zero placeholder rows used for a locus without a usable coloc result.
null_pph <- function(locus) {
  data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
             PP.H3.abf = 0, PP.H4.abf = 0, locus = locus)
}
null_res <- function(locus) {
  data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
             V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
             internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)
}

# One coloc.abf() run per region of `z` (columns CHR, pos1, pos2 and
# nearestGene). seq_len() makes the loop a no-op for an empty table, where
# 1:nrow(z) would iterate over c(1, 0).
for (i in seq_len(nrow(z))) {
  row <- z[i, ]
  locus <- paste0('chr', row[['CHR']], '_', row[['nearestGene']])
  region_chr <- as.integer(row[['CHR']])
  region_start <- as.integer(row[['pos1']])
  region_stop <- as.integer(row[['pos2']])
  temp_df <- filter(d, CHR == region_chr, POS >= region_start, POS <= region_stop)

  if (nrow(temp_df) == 0) {
    pph_list[[i]] <- null_pph(locus)
    res_list[[i]] <- null_res(locus)
    next
  }

  # coloc needs strictly positive variances.
  temp_df <- filter(temp_df, SE > 0, se > 0)

  # Data set 1 is quantitative; data set 2 is case-control with a case
  # proportion of 0.067.
  s_pheno <- 0.067
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2,
                N = temp_df$TOTALSAMPLESIZE, type = 'quant',
                snp = temp_df$ID, MAF = temp_df$MAF)
  data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                type = 'cc', snp = temp_df$ID, s = s_pheno, MAF = temp_df$maf)

  # A coloc failure is reported and recorded as a placeholder row.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) {
      message('coloc.abf failed for ', locus, ': ', conditionMessage(e))
      NULL
    }
  )
  if (is.null(myres)) {
    pph_list[[i]] <- null_pph(locus)
    res_list[[i]] <- null_res(locus)
    next
  }

  # First element: posterior summary; second element: per-SNP results.
  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  pph_list[[i]] <- PPH

  res <- myres[[2]]
  res$locus <- locus
  res_list[[i]] <- res
}

# Stack the per-locus tables and write them with a header row.
pph <- data.frame(do.call('rbind', pph_list))
res <- data.frame(do.call('rbind', res_list))

write.table(pph, snakemake@output[[1]], sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(res, snakemake@output[[2]], sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE)

R dplyr data.table Quant coloc From line 1 of colocalization/coloc_GA_vs_PTD.R

library(data.table)
library(dplyr)
library(coloc)

# ---- Priors passed to coloc.abf() as p1, p2 and p12 -------------------------
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# Fold an allele frequency onto the minor allele.
fold_maf <- function(freq) {
  ifelse(freq > 0.5, 1 - freq, freq)
}

# ---- First input: summary statistics, upper-case column names ---------------
d <- fread(
  snakemake@input[[1]],
  select = c('ID', 'CHR', 'POS', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue', 'EAF')
)
# select() pins the column order regardless of how fread() returned them.
d <- select(d, ID, CHR, POS, TOTALSAMPLESIZE, BETA, SE, pvalue, EAF)
d$MAF <- fold_maf(d$EAF)

# ---- Second input: trait summary statistics ---------------------------------
x <- fread(
  snakemake@input[[2]],
  select = c('ID', 'REF', 'EFF', 'BETA', 'EAF', 'SE', 'N', 'pvalue')
)

# Reverse the effect sign where REF sorts after EFF.
flip_sign <- x$REF > x$EFF
x$BETA <- ifelse(flip_sign, -1 * x$BETA, x$BETA)

# Keep the needed columns and rename them to lower case in one step.
x <- select(x, ID, N, beta = BETA, se = SE, p = pvalue, eaf = EAF)
x$maf <- fold_maf(x$eaf)

# Keep only variants present in both data sets.
d <- inner_join(d, x, by = 'ID')

# Fall back to the first data set's MAF when the second has no frequencies.
if (all(is.na(d$eaf))) {
  d$maf <- d$MAF
}

# ---- Third input: loci to test; chromosome 'X' is recoded as 23 -------------
z <- fread(snakemake@input[[3]])
z$CHR <- as.integer(ifelse(z$CHR == 'X', '23', z$CHR))

# Per-locus results are collected here by the loop below.
pph_list <- list()
res_list <- list()

# Flank (bp) taken on EACH side of the locus position. The upper bound used
# to be POS + 25000 against a lower bound of POS - 250000; the window is now
# symmetric.
# NOTE(review): confirm that a symmetric 250 kb flank is the intended window.
window_bp <- 250000

# All-zero placeholder rows used for a locus without a usable coloc result.
null_pph <- function(locus) {
  data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0,
             PP.H3.abf = 0, PP.H4.abf = 0, locus = locus)
}
null_res <- function(locus) {
  data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0,
             V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0,
             internal.sum.lABF = 0, SNP.PP.H4 = 0, locus = locus)
}

gwas_file <- snakemake@input[[1]]
trait_file <- snakemake@input[[2]]

# One coloc.abf() run per locus of `z` (columns CHR, POS and nearestGene).
# seq_len() makes the loop a no-op for an empty table, where 1:nrow(z) would
# iterate over c(1, 0).
for (i in seq_len(nrow(z))) {
  row <- z[i, ]
  locus <- paste0('chr', row[['CHR']], '_', row[['nearestGene']])
  region_chr <- as.integer(row[['CHR']])
  region_start <- as.integer(row[['POS']]) - window_bp
  region_stop <- as.integer(row[['POS']]) + window_bp
  temp_df <- filter(d, CHR == region_chr, POS >= region_start, POS <= region_stop)

  if (nrow(temp_df) == 0) {
    pph_list[[i]] <- null_pph(locus)
    res_list[[i]] <- null_res(locus)
    next
  }

  # coloc needs strictly positive variances.
  temp_df <- filter(temp_df, SE > 0, se > 0)

  # Data set 1: quantitative unless the file name marks a case-control
  # phenotype, in which case `s` is the proportion of cases.
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE^2,
                N = temp_df$TOTALSAMPLESIZE, type = 'quant',
                snp = temp_df$ID, MAF = temp_df$MAF)
  if (grepl('allPTD', gwas_file)) {
    data1$type <- 'cc'
    data1$s <- 0.067
  } else if (grepl('postTerm', gwas_file)) {
    data1$type <- 'cc'
    data1$s <- 0.122
  }

  # Data set 2: case-control for the listed phenotypes (s = cases / total),
  # quantitative otherwise. If several names match, the last one wins.
  data2 <- list(beta = temp_df$beta, varbeta = temp_df$se^2, N = temp_df$N,
                type = 'quant', snp = temp_df$ID, MAF = temp_df$maf)
  if (grepl('PCOS|miscarriage|POP|endometriosis|Preeclampsia|leiomyoma_uterus', trait_file)) {
    if (grepl('PCOS', trait_file)) {
      s_pheno <- (1184 + 670 + 157 + 658 + 984 + 485 + 462) /
        (1184 + 670 + 157 + 658 + 984 + 485 + 462 + 5799 + 1379 + 2807 + 6774 + 2963 + 407 + 96172)
    }
    if (grepl('miscarriage', trait_file)) s_pheno <- 49996 / (174109 + 49996)
    if (grepl('POP', trait_file)) s_pheno <- 7053 / (57407 + 7053)
    if (grepl('endometriosis', trait_file)) s_pheno <- 1496 / (192678 + 1496)
    if (grepl('Preeclampsia', trait_file)) s_pheno <- 4630 / (4630 + 373345)
    if (grepl('leiomyoma_uterus', trait_file)) s_pheno <- 14569 / (85792 + 14569)
    data2$type <- 'cc'
    data2$s <- s_pheno
  }

  # A coloc failure is reported and recorded as a placeholder row.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) {
      message('coloc.abf failed for ', locus, ': ', conditionMessage(e))
      NULL
    }
  )
  if (is.null(myres)) {
    pph_list[[i]] <- null_pph(locus)
    res_list[[i]] <- null_res(locus)
    next
  }

  # First element: posterior summary; second element: per-SNP results.
  PPH <- data.frame(t(myres[[1]]))
  PPH$locus <- locus
  pph_list[[i]] <- PPH

  res <- myres[[2]]
  res$locus <- locus
  res_list[[i]] <- res
}

# Stack the per-locus tables and write them with a header row.
pph <- data.frame(do.call('rbind', pph_list))
res <- data.frame(do.call('rbind', res_list))

write.table(pph, snakemake@output[[1]], sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(res, snakemake@output[[2]], sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE)

R dplyr data.table Quant coloc From line 1 of colocalization/coloc.R

script:
	'coloc.R'

SnakeMake From line 14 of colocalization/Snakefile

run:
	# Stack all input tables into one file. Each table gets a 'trait' column
	# taken from its path: the text after 'pph_' with '.txt' removed.
	frames= []
	for path in input:
		table= pd.read_csv(path, sep= '\t', header= 0)
		table['trait']= path.split('pph_')[1].replace('.txt', '')
		frames.append(table)
	stacked= pd.concat(frames)
	stacked.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 23 of colocalization/Snakefile

run:
        # Stack all input tables into one file. Each table gets a 'trait' column
        # taken from its path: the text after 'results_' with '.txt' removed.
        frames= []
        for path in input:
                table= pd.read_csv(path, sep= '\t', header= 0)
                table['trait']= path.split('results_')[1].replace('.txt', '')
                frames.append(table)
        stacked= pd.concat(frames)
        stacked.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 39 of colocalization/Snakefile

script:
        'coloc_GA_vs_PTD.R'

SnakeMake From line 61 of colocalization/Snakefile

script:
	'coloc_GA_vs_PTD_GW.R'

SnakeMake From line 75 of colocalization/Snakefile

script:
        'coloc_BW_GA_GW.R'

SnakeMake From line 89 of colocalization/Snakefile

import pandas as pd
import numpy as np


def flip_beta(df):
	'''Orient every variant so that REF sorts before EFF, in place.

	For rows where REF > EFF (string comparison) the two alleles are swapped
	and the sign of BETA is reversed. Expects the columns BETA, REF and EFF.
	Returns the same, modified DataFrame.
	'''
	swap= df.REF > df.EFF
	df['BETA']= np.where(swap, -1 * df.BETA, df.BETA)
	# Both right-hand sides are evaluated before either column is replaced.
	df['REF'], df['EFF']= np.where(swap, df.EFF, df.REF), np.where(swap, df.REF, df.EFF)
	return df

def add_ID(x):
	'''Recode indels as I/D, add an ID column and harmonise effect direction.

	The longer allele of an indel becomes 'I' and the other one 'D'. ID is
	'CHR:POS:a1:a2' with the two alleles in alphabetical order. Finally
	flip_beta() is applied, so BETA refers to the alphabetically larger allele.
	Expects the columns CHR, POS, REF, EFF and BETA; modifies `x` in place and
	returns it.
	'''
	# Measure both alleles before recoding: once REF has been replaced by 'I'
	# its length is 1, which would misclassify indels whose shorter allele has
	# more than one base (e.g. ATG/AT).
	ref_len= x.REF.str.len()
	eff_len= x.EFF.str.len()
	x['REF']= np.where(ref_len > eff_len, 'I', x.REF)
	x['EFF']= np.where(ref_len < eff_len, 'I', x.EFF)
	# The partner of an 'I' allele is the deletion.
	x['REF']= np.where(x.EFF== 'I', 'D', x.REF)
	x['EFF']= np.where(x.REF== 'I', 'D', x.EFF)
	chrom_pos= x.CHR.apply(str) + ':' + x.POS.apply(str) + ':'
	x['ID']= np.where(x.REF> x.EFF, chrom_pos + x.EFF + ':' + x.REF, chrom_pos + x.REF + ':' + x.EFF)
	x= flip_beta(x)
	return x


def format_df(x, reg):
	'''Split a haplotype association file into one table per haplotype.

	x: path to a comma-separated file with the columns chr, pos, ref, alt and,
	   for each of h1, h2 and h3, <h>.coef, <h>.se and <h>.pval.
	reg: DataFrame of regions with the columns CHR, pos1 and pos2.

	Only variants located inside one of the regions are kept. Each haplotype
	table is passed through add_ID() and written tab-separated to
	snakemake.output[0], [1] and [2] for h1, h2 and h3 respectively.
	'''
	d= pd.read_csv(x, sep= ',', header= 0)
	d['chr']= d.chr.apply(str)
	# Compare chromosomes as strings on both sides: pandas refuses to merge a
	# string column with a numeric one. `reg` itself is left untouched.
	reg= reg.assign(CHR= reg.CHR.apply(str))
	d= pd.merge(d, reg, left_on= 'chr', right_on= 'CHR')
	d= d.loc[((d.pos >= d.pos1) & (d.pos<= d.pos2)), :]
	out_cols= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
	haplotypes= []
	for hap in ['h1', 'h2', 'h3']:
		# .copy(): add_ID() modifies the frame it receives.
		hap_df= d.loc[:, ['chr', 'pos', 'ref', 'alt', hap + '.coef', hap + '.se', hap + '.pval']].copy()
		hap_df.columns= ['CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
		haplotypes.append(add_ID(hap_df))
	for idx, hap_df in enumerate(haplotypes):
		hap_df.to_csv(snakemake.output[idx], sep= '\t', header= True, index= False, columns= out_cols)
	print('Completed file:' + x)

# Script entry point: the first Snakemake input is the tab-separated regions
# table, the second is the haplotype file handed to format_df().
regions= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)

format_df(snakemake.input[1], regions)

Python Pandas numpy From line 1 of effect_origin/format_CCHMC_haplotype.py

script:
	'format_CCHMC_haplotype.py'

SnakeMake From line 29 of effect_origin/Snakefile

run:
	# Split one haplotype-effect table into three files (h1, h2, h3), each
	# passed through add_ID(). Source columns map as: h.Bmt -> h1,
	# h.Bmnt -> h2, h.Bft -> h3.
	src_cols= ['contig', 'position', 'testedAllele', 'otherAllele', 'h.Bmnt', 'h.Bmnt.se', 'h.Bmnt.p', 'h.Bmt', 'h.Bmt.se', 'h.Bmt.p', 'h.Bft', 'h.Bft.se', 'h.Bft.p']
	# Re-index after reading so the positional rename below is safe.
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= src_cols)[src_cols]
	d.columns= ['CHR', 'POS', 'EFF', 'REF', 'h2_beta', 'h2_se', 'h2_pvalue', 'h1_beta', 'h1_se', 'h1_pvalue', 'h3_beta', 'h3_se', 'h3_pvalue']
	variant_cols= ['CHR', 'POS', 'EFF', 'REF']
	out_cols= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
	per_haplotype= []
	for hap in ['h1', 'h2', 'h3']:
		hap_df= d.loc[:, variant_cols + [hap + '_beta', hap + '_se', hap + '_pvalue']]
		hap_df.columns= variant_cols + ['BETA', 'SE', 'pvalue']
		per_haplotype.append(hap_df)
	# Harmonise all three tables first, then write them in order.
	per_haplotype= [add_ID(hap_df) for hap_df in per_haplotype]
	for idx, hap_df in enumerate(per_haplotype):
		hap_df.to_csv(output[idx], sep= '\t', header= True, index= False, columns= out_cols)

SnakeMake From line 40 of effect_origin/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d[['CHR', 'POS', 'REF', 'EFF']]= d.snp.str.split(':', expand= True)
	h1= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'beta_h1', 'se_h1', 'pvalue_h1']]

SnakeMake From line 64 of effect_origin/Snakefile

shell:
	'/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'

SnakeMake From line 90 of effect_origin/Snakefile

run:
	# Merge three meta-analysis result tables (h1, h2, h3) on MarkerName and
	# orient effects so that Allele1 is the alphabetically larger allele.
	stat_cols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value']
	het_cols= ['HetISq', 'HetPVal']
	h1_cols= stat_cols + ['Direction'] + het_cols
	# usecols does not fix the column order of the result, so each frame is
	# re-indexed explicitly before the positional renames below.
	h1= pd.read_csv(input[0], sep= '\t', header= 0, usecols= h1_cols)[h1_cols]
	h2= pd.read_csv(input[1], sep= '\t', header= 0, usecols= stat_cols + het_cols)[stat_cols + het_cols]
	h3= pd.read_csv(input[2], sep= '\t', header= 0, usecols= stat_cols + het_cols)[stat_cols + het_cols]
	# Every '?' in Direction is subtracted from a total of 6.
	h1['N_cohorts']= 6 - h1['Direction'].apply(lambda x: str.count(x, '?'))
	h1.columns= ['MarkerName', 'Allele1', 'Allele2', 'beta_h1', 'se_h1', 'pvalue_h1', 'Direction_h1', 'HetISq_h1', 'HetPval_h1', 'n_cohorts']
	h1['beta_h1']= np.where(h1.Allele2> h1.Allele1, -1 * h1.beta_h1, h1.beta_h1)
	h2.columns= ['MarkerName', 'Allele1', 'Allele2', 'beta_h2', 'se_h2', 'pvalue_h2', 'HetISq_h2', 'HetPval_h2']
	h2['beta_h2']= np.where(h2.Allele2> h2.Allele1, -1 * h2.beta_h2, h2.beta_h2)
	h3.columns= ['MarkerName', 'Allele1', 'Allele2', 'beta_h3', 'se_h3', 'pvalue_h3', 'HetISq_h3', 'HetPval_h3']
	h3['beta_h3']= np.where(h3.Allele2> h3.Allele1, -1 * h3.beta_h3, h3.beta_h3)
	d= pd.merge(h1, h2[['MarkerName', 'beta_h2', 'se_h2', 'pvalue_h2', 'HetISq_h2', 'HetPval_h2']], on= ['MarkerName'], how= 'inner')
	d= pd.merge(d, h3[['MarkerName', 'beta_h3', 'se_h3', 'pvalue_h3', 'HetISq_h3', 'HetPval_h3']], on= ['MarkerName'], how= 'inner')
	d= d[['MarkerName', 'Allele1', 'Allele2', 'beta_h1', 'se_h1', 'pvalue_h1', 'Direction_h1', 'HetISq_h1', 'HetPval_h1', 'n_cohorts', 'beta_h2', 'se_h2', 'pvalue_h2', 'HetISq_h2', 'HetPval_h2', 'beta_h3', 'se_h3', 'pvalue_h3', 'HetISq_h3', 'HetPval_h3']]
	# Swap the allele labels for the rows whose effects were flipped above.
	swap= d.Allele2> d.Allele1
	d['Allele1'], d['Allele2']= np.where(swap, d.Allele2, d.Allele1), np.where(swap, d.Allele1, d.Allele2)
	d.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 101 of effect_origin/Snakefile

run:
	# Write the variants with the smallest p-values, one row per ID.
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue'])
	d.drop_duplicates(['ID'], keep= 'first', inplace= True)
	d.sort_values('pvalue', inplace= True, ascending= True)
	# NOTE(review): this keeps 99,999 rows; confirm whether 100,000 was intended.
	d= d.iloc[0:99999, :]
	# Select the columns in the order of the output header and rename by name,
	# so that each header sits above its own data (the former positional
	# relabelling put RSID values under 'CHR', CHR under 'POS' and POS under
	# 'RSID').
	d= d[['CHR', 'POS', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue']]
	d= d.rename(columns= {'pvalue': 'PVALUE'})
	d.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 9 of EGG_sumstats/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue'])

SnakeMake From line 25 of EGG_sumstats/Snakefile

run:
        d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue'])
        d.drop_duplicates(['ID'], keep= 'first', inplace= True)

SnakeMake From line 45 of EGG_sumstats/Snakefile

run:
        d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'RSID', 'REF', 'EFF', 'EAF', 'TOTALSAMPLESIZE', 'BETA', 'SE', 'pvalue'])

SnakeMake From line 65 of EGG_sumstats/Snakefile

run:
	# Split the variants of every locus into a 95% credible set (output[0])
	# and the remaining variants (output[1]), based on the column PP.
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	top_list= list()
	non_top_list= list()
	# groupby(sort= False) visits loci in order of first appearance, which
	# keeps the output order reproducible between runs.
	for lname, locus_df in d.groupby('locus', sort= False):
		# sort_values() returns a new frame, so the input table is not modified.
		df_temp= locus_df.sort_values(['PP'], ascending= False)
		df_temp['PPcum']= df_temp.PP.cumsum()
		# A variant belongs to the set while the probability accumulated BEFORE
		# it is still below 0.95. This includes the variant that crosses the
		# threshold, so a single variant with PP >= 0.95 is kept in its set.
		in_set= (df_temp.PPcum - df_temp.PP) < 0.95
		top_list.append(df_temp.loc[in_set, :])
		non_top_list.append(df_temp.loc[~in_set, :])
	# An input without loci yields header-only outputs.
	empty= d.head(0).assign(PPcum= pd.Series(dtype= 'float64'))
	top= pd.concat(top_list) if top_list else empty
	non_top= pd.concat(non_top_list) if non_top_list else empty
	top.to_csv(output[0], sep= '\t', header= True, index= False)
	non_top.to_csv(output[1], sep= '\t', header= True, index= False)

SnakeMake From line 30 of enrichment/Snakefile

run:
	# One-sided Fisher exact test: are genes with pLI >= 0.9 over-represented
	# among the nearest genes of the loci? Writes a single tab-separated line:
	# label, a, b, c, d, d/(d+c), b/(a+b), odds ratio, p-value.
	loci= pd.read_csv(input[0], sep='\t', header= 0)
	pli_cols= ['gene_id', 'gene', 'chromosome', 'start_position', 'end_position', 'pLI']
	pli= pd.read_csv(input[1], header= 0, sep= '\t', usecols= pli_cols)[pli_cols]
	# Loci whose nearest gene is absent from the pLI table are dropped.
	loci= loci.loc[loci.nearestGene.isin(pli.gene.values), :]
	pli.columns= ['EID', 'gene', 'CHR', 'start', 'end', 'pLI']
	pli.dropna(subset= ['pLI'], inplace= True)
	pli_genes= pli.loc[pli.pLI>= 0.9, 'gene'].values.tolist()
	n_hits= loci.loc[loci.nearestGene.isin(pli_genes), :].shape[0]
	# 2x2 table: d = loci in the gene set, c = loci outside it,
	# b = gene-set genes without a locus, a = all remaining genes.
	b= len(pli_genes) - n_hits
	c= loci.shape[0] - n_hits
	d= n_hits
	a= pli.shape[0] - b - d - c
	oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater')
	fields= ['pli', a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue]
	with open(output[0], 'w') as file_handler:
		file_handler.write('\t'.join(str(item) for item in fields) + '\n')

SnakeMake From line 54 of enrichment/Snakefile

run:
	# Per tissue, one-sided Mann-Whitney U test of the expression rank of the
	# loci's nearest genes against all other genes.
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	rna= pd.read_csv(input[1], sep= '\t', header= 0)
	# GA = 1 for genes that are the nearest gene of a locus.
	rna['GA']= np.where(rna['Gene name'].isin(d.nearestGene.values), 1, 0)
	# NX is ranked within each gene, i.e. across that gene's rows.
	rna['NX_rk']= rna.groupby('Gene name')['NX'].rank('average', ascending= True)
	records= []
	for tissue in set(rna.Tissue):
		in_tissue= rna.Tissue == tissue
		ilist= rna.loc[(rna.GA== 1) & in_tissue, 'NX_rk']
		base= rna.loc[(rna.GA== 0) & in_tissue, 'NX_rk']
		mannw_pvalue= st.mannwhitneyu(ilist, base, alternative= 'greater')[1]
		records.append([tissue, np.median(ilist), np.median(base), mannw_pvalue])
	z= pd.DataFrame.from_records(records)
	z.to_csv(output[0], sep= '\t', header= ['tissue', 'i_listmedian', 'base_list_median', 'MannW_pvalue'], index= False)

SnakeMake From line 78 of enrichment/Snakefile

run:
	# One-sided Fisher exact tests: are genes from the 'dominant' list
	# (input[2]) and the 'recessive' list (input[3]) over-represented among
	# the nearest genes of the loci? Writes one tab-separated line per list:
	# label, a, b, c, d, d/(d+c), b/(a+b), odds ratio, p-value.
	pheno= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'nearestGene'])
	x= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'start', 'end', 'gene', 'EnsembleID'])
	rows= []
	for label, gene_file in [('dominant', input[2]), ('recessive', input[3])]:
		# The with-block closes the gene list once it has been read.
		with open(gene_file, 'r') as gene_handle:
			genes= [line.strip() for line in gene_handle]
		df= pheno.loc[pheno.nearestGene.isin(genes), :]
		# 2x2 table: d = loci in the gene list, c = loci outside it,
		# b = listed genes without a locus, a = all remaining genes in `x`.
		b= len(genes) - df.shape[0]
		c= pheno.shape[0] - df.shape[0]
		d= df.shape[0]
		a= x.shape[0] - b - d - c
		oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater')
		rows.append([label, a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue])
	with open(output[0], 'w') as file_handler:
		for z in rows:
			file_handler.write('\t'.join([str(item) for item in z]) + '\n')

SnakeMake From line 103 of enrichment/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'nearestGene'])
	stc= pd.read_csv(input[1], header= 0, sep= '\t', usecols= ['geneid', 'log2FoldChange', 'pvalue'])[['geneid', 'log2FoldChange', 'pvalue']]

SnakeMake From line 134 of enrichment/Snakefile

run:
	pheno= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID', 'nearestGene'])
	x= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'start', 'end', 'gene', 'EnsembleID'])
	lab= pd.read_csv(input[2], sep= '\t', header= 0)
	for i in set(lab.Cell_type):
		temp_df= lab.loc[lab.Cell_type== i, :]
		df= pheno.loc[pheno.nearestGene.isin(temp_df.gene_name.to_list()), :]
		b= len(temp_df.gene_name.to_list()) - df.shape[0]
		c= pheno.shape[0] - df.shape[0]
		d= df.shape[0]
		a= x.shape[0] - b - d - c
		oddsratio, pvalue = st.fisher_exact([[a, b],[c, d]], alternative= 'greater')
		z= [i, a, b, c, d, (d / (d+c)), (b / (a + b)), oddsratio, pvalue]
		with open(output[0], 'a') as file_handler:
			file_handler.write('\t'.join([str(item) for item in z]) + '\n')
	df= pheno.loc[pheno.nearestGene.isin(lab.gene_name.to_list()), :]

SnakeMake From line 159 of enrichment/Snakefile

library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# ---- First input: GWAS summary statistics, one row per RSID -----------------
df <- fread(snakemake@input[[1]], select = c('RSID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF'))

df <- filter(df, !duplicated(RSID))

# Fold the effect allele frequency onto the minor allele.
df$MAF <- ifelse(df$EAF > 0.5, 1 - df$EAF, df$EAF)

# ---- Second input: eQTL statistics (columns used later: SNP, Gene, Freq, b,
# se) ------------------------------------------------------------------------
z <- fread(snakemake@input[[2]])
# Passed to coloc as N of the eQTL data set.
# NOTE(review): presumably the eQTL sample size - confirm.
z$n <- 206
# Minor allele frequency: frequencies above 0.5 are folded to 1 - Freq, the
# same rule as for `MAF` above. (The comparison used to be `Freq < 0.5`,
# which returned the major allele frequency instead.)
z$maf <- ifelse(z$Freq > 0.5, 1 - z$Freq, z$Freq)
df <- inner_join(df, z, by = c('RSID' = 'SNP'))

rm(z)

# ---- Output files: header-only here, rows are appended per gene -------------
pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]


cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = snakemake@output[[1]])

cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file = snakemake@output[[2]])

# ---- Priors passed to coloc.abf() as p1, p2 and p12 -------------------------
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6


# split() below works on a plain data.frame.
df <- data.frame(df)

colocalization_eqtl= function(temp_df){
	protein= unique(temp_df$Gene)
        if (nrow(temp_df)== 0) {

        PPH= data.frame(nsnps= 0, PP.H0.abf= 0,PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, protein= protein)
        fwrite(PPH, pph_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, protein= protein)
        fwrite(res, results_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        print('next')

        } else {
	temp_df = filter(temp_df, SE>0, se> 0)

	if (grepl('allPTD', snakemake@input[[1]])) {
        data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N=temp_df$TOTALSAMPLESIZE, type= 'cc', snp= temp_df$RSID, s= 0.067, MAF= temp_df$MAF)
        } else if (grepl('postTerm', snakemake@input[[1]])) {
        data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N=temp_df$TOTALSAMPLESIZE, type= 'cc', snp= temp_df$RSID, s= 0.122, MAF= temp_df$MAF)
        } else {data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N= temp_df$TOTALSAMPLESIZE, type= 'quant', snp= temp_df$RSID, MAF= temp_df$MAF) }

        data2= list(beta= temp_df$b, varbeta= temp_df$se**2, N=temp_df$n, type= 'quant', snp= temp_df$RSID, MAF= temp_df$maf)
        myres= tryCatch({suppressWarnings(coloc.abf(data1, data2, p1= prior1, p2= prior2, p12= prior12))}, error= function(e) { return(0)}
)
        if (length(myres)==1 ) { 
        PPH= data.frame(nsnps= 0, PP.H0.abf= 0, PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, protein= protein)
        fwrite(PPH, pph_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, protein= protein)
        fwrite(res, results_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        print('next')
        } else {
        PPH= data.frame(t(myres[[1]]))
        PPH$protein= protein
        fwrite(PPH, pph_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
        res= myres[[2]]
        res$protein= protein
        fwrite(res, results_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
}
}
}



# Apply colocalization_eqtl to each subset of df obtained by splitting on the Gene column, with mc.cores= 3.
mclapply(split(df, df$Gene), colocalization_eqtl, mc.cores= 3)

R dplyr data.table Quant coloc From line 1 of eQTLs/coloc_endometrium.R

library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# ---- GWAS summary statistics (first Snakemake input) ----
# Keep the first row of every variant ID and fold the effect allele frequency
# into a minor allele frequency.
gwas_columns <- c('ID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF')
df <- fread(snakemake@input[[1]], select = gwas_columns)
df <- filter(df, !duplicated(ID))
df$MAF <- with(df, ifelse(EAF > 0.5, 1 - EAF, EAF))

# ---- eQTL summary statistics (second Snakemake input) ----
eqtl_path <- snakemake@input[[2]]
z <- fread(eqtl_path, select = c('gene_id', 'ID', 'maf', 'slope', 'slope_se'))

# The value of n depends on the tissue name found in the eQTL file path.
z$n <- if (grepl('Ovary', eqtl_path)) {
  167
} else if (grepl('Uterus', eqtl_path)) {
  269
} else {
  141
}

# One row per variant/gene pair present in both tables.
df <- inner_join(df, z, by = 'ID')
rm(z)

# ---- Output files: write the header lines now, rows are appended later ----
pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = pph_outfile)
cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file = results_outfile)

# ---- Priors passed to coloc.abf as p1, p2 and p12 ----
prior1 <- 1 * 10^-4
prior2 <- 1 * 10^-4
prior12 <- 5 * 10^-6

# Plain data.frame so that split() hands data.frames to the worker function.
df <- data.frame(df)

# Colocalization (coloc.abf) between the GWAS and the eQTL columns of one gene.
#
# temp_df: rows of the joined GWAS/eQTL table for a single gene_id. The columns
#   read here are gene_id, ID, BETA, SE, TOTALSAMPLESIZE, MAF, slope, slope_se,
#   n and maf.
#
# Side effects: appends one row to pph_outfile and one or more rows to
# results_outfile (tab-separated, no header). An all-zero placeholder row is
# written to both files when temp_df is empty or when coloc.abf() fails.
#
# Returns the value of the last write/print call, as the original code did.
#
# NOTE(review): this function is run through mclapply and every worker appends
# to the same two files -- confirm that rows from different workers cannot
# interleave.
colocalization_eqtl <- function(temp_df) {
  protein <- unique(temp_df$gene_id)

  # Append a table to one of the output files in the shared headerless layout.
  append_rows <- function(tbl, path) {
    fwrite(tbl, path, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE, append = TRUE)
  }

  # Write the all-zero placeholder rows used when no result is available.
  write_placeholder <- function() {
    PPH <- data.frame(nsnps = 0, PP.H0.abf = 0, PP.H1.abf = 0, PP.H2.abf = 0, PP.H3.abf = 0, PP.H4.abf = 0, protein = protein)
    append_rows(PPH, pph_outfile)
    res <- data.frame(snp = 'none', V.df1 = 0, z.df1 = 0, r.df1 = 0, lABF.df1 = 0, V.df2 = 0, z.df2 = 0, r.df2 = 0, lABF.df2 = 0, internal.sum.lABF = 0, SNP.PP.H4 = 0, protein = protein)
    append_rows(res, results_outfile)
    print('next')
  }

  if (nrow(temp_df) == 0) {
    return(write_placeholder())
  }

  # Variants with a non-positive standard error in either study are dropped.
  temp_df <- filter(temp_df, SE > 0, slope_se > 0)

  # The GWAS dataset is 'cc' with a fixed case proportion s when the GWAS file
  # name contains 'allPTD' or 'postTerm', and 'quant' otherwise.
  gwas_path <- snakemake@input[[1]]
  data1 <- list(beta = temp_df$BETA, varbeta = temp_df$SE**2, N = temp_df$TOTALSAMPLESIZE, type = 'quant', snp = temp_df$ID, MAF = temp_df$MAF)
  if (grepl('allPTD', gwas_path)) {
    data1$type <- 'cc'
    data1$s <- 0.067
  } else if (grepl('postTerm', gwas_path)) {
    data1$type <- 'cc'
    data1$s <- 0.122
  }

  data2 <- list(beta = temp_df$slope, varbeta = temp_df$slope_se**2, N = temp_df$n, type = 'quant', snp = temp_df$ID, MAF = temp_df$maf)

  # A failing coloc.abf() call is reported on stderr and mapped to the scalar 0,
  # which is recognised below by its length.
  myres <- tryCatch(
    suppressWarnings(coloc.abf(data1, data2, p1 = prior1, p2 = prior2, p12 = prior12)),
    error = function(e) {
      message('coloc.abf failed for ', paste(protein, collapse = ','), ': ', conditionMessage(e))
      0
    }
  )

  if (length(myres) == 1) {
    write_placeholder()
  } else {
    # First element: summary vector, written as a single row.
    PPH <- data.frame(t(myres[[1]]))
    PPH$protein <- protein
    append_rows(PPH, pph_outfile)
    # Second element: per-SNP results table.
    res <- myres[[2]]
    res$protein <- protein
    append_rows(res, results_outfile)
  }
}



# Apply colocalization_eqtl to each per-gene subset of df (split on gene_id), with mc.cores= 3.
mclapply(split(df, df$gene_id), colocalization_eqtl, mc.cores= 3)

R dplyr data.table Quant coloc From line 1 of eQTLs/coloc_GTEx.R

# Execute the external script 'coloc_endometrium.R' for this rule.
script:
        'coloc_endometrium.R'

SnakeMake From line 11 of eQTLs/Snakefile

run:
	# Collect the ID column of every input table, keep each ID once (the first
	# occurrence wins) and write the unique IDs to the first output file.
	id_frames= [pd.read_csv(path, header= 0, sep= '\t', usecols= ['ID']) for path in input]
	unique_ids= pd.concat(id_frames).drop_duplicates('ID', keep= 'first')
	unique_ids.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 20 of eQTLs/Snakefile

run:
	# Stream the gzipped eQTL table (input[0]) in chunks, build an ID with the two
	# alleles in alphabetical order, keep the rows whose ID is listed in input[1]
	# and append them to output[0] under a header written up front.
	out_cols= ['ID', 'gene_id', 'maf', 'slope', 'slope_se']
	with open(output[0], mode="w") as handle:
		handle.write("\t".join(out_cols) + "\n")
	keep_ids= pd.read_csv(input[1], sep= '\t', header= 0)
	reader= pd.read_csv(input[0], sep= '\t', header= 0, chunksize= 500000, compression= 'gzip', usecols= ['gene_id', 'variant_id', 'maf', 'slope', 'slope_se'])
	for block in reader:
		# variant_id is split on '_' into five fields.
		block[['CHR', 'POS', 'REF', 'EFF', 'build']]= block.variant_id.str.split('_', expand= True)
		swap= block.REF> block.EFF
		first_allele= np.where(swap, block.EFF, block.REF)
		second_allele= np.where(swap, block.REF, block.EFF)
		block['ID']= block.CHR + ':' + block.POS + ':' + first_allele + ':' + second_allele
		block= block[out_cols]
		block= pd.merge(block, keep_ids, on= 'ID')
		block.to_csv(output[0], sep= '\t', header= False, index= False, mode= 'a')

SnakeMake From line 36 of eQTLs/Snakefile

# Execute the external script 'coloc_GTEx.R' for this rule.
script:
        'coloc_GTEx.R'

SnakeMake From line 60 of eQTLs/Snakefile

import pandas as pd
import numpy as np


def flip_beta(df):
        '''Put REF and EFF in alphabetical order and flip BETA to match.

        Rows where REF sorts after EFF get their two alleles swapped and the sign
        of BETA reversed; other rows are left unchanged. Columns read and
        written: BETA, REF, EFF. The frame is modified in place and returned.
        '''
        swap= df.REF> df.EFF
        df['BETA']= np.where(swap, -1 * df.BETA, df.BETA)
        # Copy both columns first so the second assignment still sees the original REF.
        ref, eff= df.REF.copy(), df.EFF.copy()
        df['REF']= np.where(swap, eff, ref)
        df['EFF']= np.where(swap, ref, eff)
        return df

def add_ID(x):
        '''Recode indels as I/D, add an allele-sorted ID and align BETA to it.

        The longer allele of an indel becomes 'I' and the shorter one 'D'; alleles
        of equal length are kept as they are. ID is CHR:POS:<a1>:<a2> with the two
        alleles in alphabetical order. Finally flip_beta() orders REF/EFF the same
        way and flips BETA where they were swapped. Columns read: CHR, POS, REF,
        EFF, BETA. The frame is modified in place and returned.
        '''
        # Lengths are taken before any recoding: once REF has been replaced by the
        # one-character 'I', comparing lengths again would misclassify deletions
        # whose shorter allele has more than one base.
        ref_len= x.REF.str.len()
        eff_len= x.EFF.str.len()
        x['REF']= np.where(ref_len > eff_len, 'I', x.REF)
        x['EFF']= np.where(ref_len < eff_len, 'I', x.EFF)
        x['REF']= np.where(x.EFF== 'I', 'D', x.REF)
        x['EFF']= np.where(x.REF== 'I', 'D', x.EFF)
        x['ID']= np.where(x.REF> x.EFF, x.CHR.apply(str) + ':' + x.POS.apply(str) + ':' + x.EFF + ':' + x.REF, x.CHR.apply(str) + ':' + x.POS.apply(str) + ':' + x.REF + ':' + x.EFF)
        x= flip_beta(x)
        return x


def format_df(x):
	'''Write one formatted table per haplotype (h1, h2, h3) for a single variant.

	Reads the comma-separated file x, keeps the rows with chr '2' and pos
	113521754, and for each haplotype renames its coef/se/pval columns to
	BETA/SE/pvalue, runs add_ID() and writes the result tab-separated to
	snakemake.output[0], [1] and [2] respectively.
	'''
	out_cols= ['ID', 'CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
	table= pd.read_csv(x, sep= ',', header= 0)
	table['chr']= table.chr.apply(str)
	table= table.loc[(table.chr== '2') & (table.pos== 113521754), :]
	# Slice all haplotypes first, then add IDs, then write, in that order.
	frames= []
	for hap in ('h1', 'h2', 'h3'):
		sub= table.loc[:, ['chr', 'pos', 'ref', 'alt', hap + '.coef', hap + '.se', hap + '.pval']]
		sub.columns= ['CHR', 'POS', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
		frames.append(sub)
	frames= [add_ID(sub) for sub in frames]
	for idx, sub in enumerate(frames):
		sub.to_csv(snakemake.output[idx], sep= '\t', header= True, index= False, columns= out_cols)
	print('Completed file:' + x)


# Format the first Snakemake input; format_df writes its tables to snakemake.output[0], [1] and [2].
format_df(snakemake.input[0])

Python Pandas numpy From line 1 of fetal_SNP/format_CCHMC_haplotype.py

# Execute the external script 'format_CCHMC_haplotype.py' for this rule.
script:
        'format_CCHMC_haplotype.py'

SnakeMake From line 9 of fetal_SNP/Snakefile

run:
	# Read the space-delimited table and set the same chromosome, position and
	# alleles on every row.
	d= pd.read_csv(input[0], sep= ' ', header= 0)
	d['CHR']= 2
	d['POS']= 113521754
	d['REF']= 'C'
	d['EFF']= 'T'

SnakeMake From line 20 of fetal_SNP/Snakefile

run:
        # Read the space-delimited table and split the 'snp' column on ':' into
        # CHR, POS, REF and EFF.
        d= pd.read_csv(input[0], sep= ' ', header= 0)
        d[['CHR', 'POS', 'REF', 'EFF']]= d.snp.str.split(':', expand= True)
        # One table per haplotype (h1, h2), both with the same column names.
        h1= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'beta_h1', 'se_h1', 'pvalue_h1']]
        h1.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue']
        h2= d.loc[:, ['CHR', 'POS', 'EFF', 'REF', 'beta_h2', 'se_h2', 'pvalue_h2']]
        h2.columns= ['CHR', 'POS', 'EFF', 'REF', 'BETA', 'SE', 'pvalue']

SnakeMake From line 47 of fetal_SNP/Snakefile

# Run the metal binary on the first input and append its standard output to the second output file.
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'

SnakeMake From line 73 of fetal_SNP/Snakefile

run:
	# Read six columns of the first input, rename them and flip the sign of
	# beta_MT where REF sorts after EFF.
	# NOTE(review): usecols keeps the file's own column order, so the rename
	# below assumes MarkerName, Allele1, Allele2, Effect, StdErr, P-value -- confirm.
	h1= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value'])
	h1.columns= ['ID', 'EFF', 'REF', 'beta_MT', 'se_MT', 'pvalue_MT']
	h1['beta_MT']= np.where(h1.REF > h1.EFF, -1 * h1.beta_MT, h1.beta_MT)
	# Same columns from the second input.
	h2= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value'])

SnakeMake From line 84 of fetal_SNP/Snakefile

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# Suppress warnings for the rest of the script.
options(warn=-1)

# Eight-colour palette; entries are picked by index in the plots below.
colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Register the Arial font files under the family name "arial".
font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

# Enable showtext rendering with its dpi option set to 300 (the figures are saved at dpi= 300).
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)




# ---- First cohort: association table (input 1) and colocalization summary (input 2) ----
d <- fread(snakemake@input[[1]])
d1 <- fread(snakemake@input[[2]])

# beta, se and pvalue are coerced to numeric before they are used.
d <- mutate(d, beta = as.numeric(beta), se = as.numeric(se), pvalue = as.numeric(pvalue))

# Keep the phenotypes whose colocalization PP.H4 exceeds 0.75.
d1 <- d1 %>% filter(PP.H4.abf > 0.75)
d <- d %>% filter(pheno %in% d1$pheno_FINNGEN)

# Attach the phenotype description from the manifest (input 3).
mani <- fread(snakemake@input[[3]], select = c('phenocode', 'name'))
names(mani) <- c('pheno', 'description')
d <- d %>% inner_join(mani, by = 'pheno')

# ---- Second cohort: association table (input 4) and colocalization summary (input 5) ----
x <- fread(snakemake@input[[4]])
x1 <- fread(snakemake@input[[5]]) %>% filter(PP.H4.abf > 0.75)
x <- x %>% filter(pheno %in% x1$pheno_PAN_UKBB)

# In this manifest (input 6) the phenotype key is '<trait_type>_<phenocode>'.
mani <- fread(snakemake@input[[6]], select = c('phenocode', 'trait_type', 'description'))
mani$pheno <- paste(mani$trait_type, mani$phenocode, sep = '_')
x <- x %>% inner_join(mani, by = 'pheno')

# ---- z-scores and a common column set, then stack both cohorts ----
d <- d %>%
  mutate(zscore = beta / se) %>%
  select(pheno, description, zscore, pvalue, trait)
x <- x %>%
  mutate(zscore = beta / se) %>%
  select(pheno, description, zscore, pvalue, trait)
d <- bind_rows(d, x)




# Clip z-scores to the range [-10, 10].
d$zscore <- with(d, ifelse(zscore > 10, 10, ifelse(zscore < -10, -10, zscore)))

# Replace the trait name by a variant label; the factor levels are reversed so
# that the birth-weight variant is the first level.
trait_labels <- c('rs28654158 (gestational duration)', 'rs11708067 (birth weight)')
d$trait <- factor(
  ifelse(d$trait == 'Gestational duration', trait_labels[1], trait_labels[2]),
  levels = rev(trait_labels)
)

# Shorten long phenotype descriptions. Each rule is c(pattern, replacement) and
# the rules are applied in this order.
description_rules <- list(
  c('Other diabetes', 'Other diabetes'),
  c('Non-insulin-dep', 'Non-insulin dependent diabetes'),
  c('Diabetes, varying def', 'Diabetes, wide'),
  c('Intestinal adhesions', 'Intestinal adhesions'),
  c('Type 2 diabetes, strict', 'Type 2 diabetes'),
  c('Type 2 diabetes with other specified/multiple/unspecified complications', 'Type 2 diabetes with complications'),
  c('Diabetes, insuline treatment', 'Diabetes, insuline treatment'),
  c('Creatinine', 'Creatinine in urine')
)
for (rule in description_rules) {
  d$description <- ifelse(grepl(rule[1], d$description), rule[2], d$description)
}

# Order the rows by hierarchical clustering of the z-scores and freeze that
# order in the levels of description.
ord <- hclust(dist(d$zscore, method = "euclidean"), method = "ward.D")$order
d <- d[ord, ]
d$description <- factor(d$description, levels = unique(d$description))


# Heat map: one tile per phenotype (x) and variant (y), filled by the rounded
# z-score. Alpha is mapped to whether pvalue < 5e-6, over the range 0.3 to 1.
# Both legends are switched off with guide= 'none' (guide= FALSE is deprecated
# in ggplot2).
p1= ggplot(d, aes(y= trait, x= description, fill= round(zscore), alpha= factor(as.numeric(pvalue< 5e-6)))) +
geom_tile(colour = "white", size= 1) +
theme_cowplot(font_size= 8) +
scale_alpha_discrete(guide= 'none', range= c(0.3, 1)) +
scale_fill_gradient2(low= colorBlindBlack8[3], high= colorBlindBlack8[8], mid= 'white', guide= 'none') +
theme(  axis.text.x= element_text(hjust= 1, angle= 45),
        axis.text.y= element_text(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        plot.margin= unit(c(t= 0, r= 0, b= 0, l= 0), units= 'cm'),
        axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3)) +
coord_equal() +
labs(x = NULL, y = NULL)




# Save the figure (first output) and the plotted table (second output).
ggsave(snakemake@output[[1]], p1, height= 100, width= 127, units= 'mm', dpi= 300)

fwrite(d, snakemake@output[[2]], sep= '\t')


# Second version of the heat map: the fill legend is kept, the alpha legend is
# hidden (guide= 'none' replaces the deprecated guide= FALSE) and the x axis is
# drawn at the top. Saved to the third output.
p1= ggplot(d, aes(description, trait, fill= round(zscore), alpha= factor(as.numeric(pvalue< 5e-6)))) +
geom_tile(colour = "white", size= 1) +
theme_cowplot(font_size= 8) +
scale_alpha_discrete(guide= 'none') +
scale_fill_gradient2(low= colorBlindBlack8[3], high= colorBlindBlack8[8], mid= 'white') +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        plot.margin= margin(t= 0, r= 0, l= 0, b= 0, unit= 'pt')) +
scale_x_discrete(position = "top")


ggsave(snakemake@output[[3]], p1, height= 100, width= 140, units= 'mm', dpi= 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 2 of figures/ADCY5_effect_direction.R

library(dplyr)
library(data.table)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(tidyr)
library(showtext)
# Eight-colour palette; entries are picked by index in the plots below.
colorBlindBlack8  <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Register the Arial font files under the family name "arial".
font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')
# Text sizes used by the annotations below (passed as size= as/ .pt and as1/ .pt).
as= 8
as1= 9
# Enable showtext rendering with its dpi option set to 300 (the figures are saved at dpi= 300).
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# Read one pairwise FST table and name its three columns CHR, POS and `fst_name`.
read_fst <- function(path, fst_name) {
  fst <- fread(path)
  names(fst) <- c('CHR', 'POS', fst_name)
  fst
}

# FST for the three population pairs, restricted to positions present in all
# three tables, with a 'CHR:POS' identifier.
d <- read_fst(snakemake@input[[1]], 'FST_EUR_AFR') %>%
  inner_join(read_fst(snakemake@input[[2]], 'FST_EUR_EAS'), by = c('CHR', 'POS')) %>%
  inner_join(read_fst(snakemake@input[[3]], 'FST_AFR_EAS'), by = c('CHR', 'POS'))
d$v_ids <- paste(d$CHR, d$POS, sep = ':')

# Control sets (input 4): one column per set (Set_1 ... Set_10000), reshaped to
# long format with the set name in control_set and the variant in v_ids.
z <- fread(snakemake@input[[4]])
zl <- gather(z, control_set, v_ids, Set_1:Set_10000)

bw_pos <- c(123065778)
ga_pos <- c(123112292)

# Attach FST to the control variants and keep the rows of the two input SNPs.
bw_id <- '3:123065778'
ga_id <- '3:123112292'
zl <- zl %>%
  inner_join(d[, c('v_ids', 'FST_EUR_AFR', 'FST_EUR_EAS', 'FST_AFR_EAS')], by = 'v_ids') %>%
  filter(Input_SNP == bw_id | Input_SNP == ga_id)
zl$haplotype <- with(zl, ifelse(Input_SNP == bw_id, 'Birth weight', 'Gestational duration'))

# Each control variant is used once; both tables continue as plain data.frames.
zl <- data.frame(zl[!duplicated(zl$v_ids), ])
d <- data.frame(d)
# Summary row for one haplotype and one FST column: the FST of the lead variant,
# the mean and median FST of its control variants and the p-value of a one-sided
# one-sample Wilcoxon test (alternative 'less') of the controls against the lead
# variant's FST.
fst_summary <- function(haplotype, lead_id, fst_col) {
  controls <- zl[zl$haplotype == haplotype, fst_col]
  lead_fst <- d[d$v_ids == lead_id, fst_col]
  data.frame(
    haplotype = haplotype,
    ancestries = fst_col,
    FST = lead_fst,
    FST_mean_controls = mean(controls, na.rm = TRUE),
    FST_median_controls = median(controls, na.rm = TRUE),
    pvalue = wilcox.test(controls, mu = lead_fst, alternative = 'less')$p.value
  )
}

# One pair of rows (gestational duration first, then birth weight) per FST column.
df_list <- lapply(c('FST_EUR_AFR', 'FST_AFR_EAS', 'FST_EUR_EAS'), function(fst_col) {
  rbind(
    fst_summary('Gestational duration', '3:123112292', fst_col),
    fst_summary('Birth weight', '3:123065778', fst_col)
  )
})

xp <- do.call('rbind', df_list)

# Ratio of the lead variant's FST to the median FST of its controls.
xp$enrichment <- with(xp, FST / FST_median_controls)

# FST columns of the control variants, one table per haplotype.
bw= filter(zl, haplotype== 'Birth weight') %>% select(FST_EUR_AFR, FST_EUR_EAS, FST_AFR_EAS)
ga= filter(zl, haplotype== 'Gestational duration') %>% select(FST_EUR_AFR, FST_EUR_EAS, FST_AFR_EAS)

# Suffix the birth-weight columns so that both sets can sit side by side in df1.
names(bw)= c('FST_EUR_AFR_bw', 'FST_EUR_EAS_bw', 'FST_AFR_EAS_bw')

# NOTE(review): cbind() pairs rows by position, so bw and ga need the same number of rows -- confirm.
df1= cbind(bw, ga)

# FST_EUR_AFR of the two lead variants, and their rows of the 'enrichment'
# column of xp (the *_pvalue names hold enrichment ratios, not p-values).
ga_fst= d[d$v_ids== '3:123112292', 'FST_EUR_AFR']
bw_fst= d[d$v_ids== '3:123065778', 'FST_EUR_AFR']
ga_fst_pvalue= xp[xp$haplotype== 'Gestational duration' & xp$ancestries== 'FST_EUR_AFR', 'enrichment']
bw_fst_pvalue= xp[xp$haplotype== 'Birth weight' & xp$ancestries== 'FST_EUR_AFR', 'enrichment']


# Mirrored density plot: gestational-duration controls above the x axis,
# birth-weight controls below it (negated density), with a vertical segment and
# rs label at each lead variant's FST. The y axis labels show absolute values.
p1= ggplot(df1, aes(x=x) ) +
  geom_density( aes(x = FST_EUR_AFR, y = ..density..), fill= colorBlindBlack8[4], colour= colorBlindBlack8[4]) +
annotate('text', x=0.6, y= 10, label="Gestational \nduration", color= colorBlindBlack8[4], size= as1/ .pt, fontface = 'bold') +
annotate('text', x=0.6, y= -10 - 0.5, label="Birth weight", color= colorBlindBlack8[2], size= as1/ .pt, fontface = 'bold') +
  annotate('text', x=ga_fst, y=5 + 0.5, label="rs28654158", color= colorBlindBlack8[4], size= as/ .pt) +
  annotate('text', x=bw_fst, y= -10 - 0.5, label="rs11708067", color= colorBlindBlack8[2], hjust= 0, size= as/ .pt) +
  annotate('text', x= 0.6, y= 1, label= paste0('Enrichment x', round(ga_fst_pvalue, 1)), color= colorBlindBlack8[4], size= as/ .pt) +
  annotate('text', x= 0.6, y= -1, label= paste0('Enrichment x', round(bw_fst_pvalue, 1)), color= colorBlindBlack8[2], size= as/ .pt) +
  geom_density(aes(x = FST_EUR_AFR_bw, y = -..density..), fill= colorBlindBlack8[2], colour= colorBlindBlack8[2]) +
  theme_cowplot(font_size = 8) +
scale_x_continuous(expand= c(0, 0)) +
scale_y_continuous(limits= c(-11, 11), breaks= c(-10, -5, 0, 5, 10), labels= c(10, 5, 0, 5, 10)) +
  xlab("Fst Africans - Europeans") +
ylab('Density') +
geom_segment(aes(x = ga_fst, y = 0, xend = ga_fst, yend = 5)) +
geom_segment(aes(x = bw_fst, y = 0, xend = bw_fst, yend = -10))+
geom_hline(yintercept= 0, colour= 'grey') +
theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3))



# Saved as the first output (63 x 63 mm).
ggsave(snakemake@output[[1]], p1, width= 63, height= 63, units= 'mm', dpi= 300)

# FST_EUR_EAS of the two lead variants, and their rows of the 'enrichment'
# column of xp (the *_pvalue names hold enrichment ratios, not p-values).
ga_fst= d[d$v_ids== '3:123112292', 'FST_EUR_EAS']
bw_fst= d[d$v_ids== '3:123065778', 'FST_EUR_EAS']
ga_fst_pvalue= xp[xp$haplotype== 'Gestational duration' & xp$ancestries== 'FST_EUR_EAS', 'enrichment']
bw_fst_pvalue= xp[xp$haplotype== 'Birth weight' & xp$ancestries== 'FST_EUR_EAS', 'enrichment']

# Mirrored density plot for the EUR-EAS comparison: gestational-duration
# controls above the x axis, birth-weight controls below it (negated density).
p1= ggplot(df1, aes(x=x) ) +
  geom_density( aes(x = FST_EUR_EAS, y = ..density..), fill= colorBlindBlack8[4], colour= colorBlindBlack8[4]) +
annotate('text', x=0.57, y= 9, label="Gestational \nduration", color= colorBlindBlack8[4], size= as1/ .pt, fontface = 'bold') +
annotate('text', x=0.57, y= -10, label="Birth weight", color= colorBlindBlack8[2], size= as1/ .pt, fontface = 'bold') +
  annotate('text', x=ga_fst, y= 10 + 0.5, label="rs28654158", color= colorBlindBlack8[4], hjust= 0, size= as/ .pt) +
  annotate('text', x=bw_fst, y= -5 - 0.5, label="rs11708067", color= colorBlindBlack8[2], size= as/ .pt) +
  annotate('text', x= 0.6, y= 1, label= paste0('Enrichment x', round(ga_fst_pvalue, 1)), color= colorBlindBlack8[4], size= as/ .pt) +
  annotate('text', x= 0.6, y= -1, label= paste0('Enrichment x', round(bw_fst_pvalue, 1)), color= colorBlindBlack8[2], size= as/ .pt) +
  geom_density( aes(x = FST_EUR_EAS_bw, y = -..density..), fill= colorBlindBlack8[2], colour= colorBlindBlack8[2]) +
scale_x_continuous(expand= c(0, 0)) +
scale_y_continuous(limits= c(-11, 11), breaks= c(-10, -5, 0, 5, 10), labels= c(10, 5, 0, 5, 10)) +
  theme_cowplot(font_size = 8) +
  xlab("Fst East Asians - Europeans") +
ylab('Density') +
geom_segment(aes(x = ga_fst, y = 0, xend = ga_fst, yend = 10)) +
geom_segment(aes(x = bw_fst, y = 0, xend = bw_fst, yend = -5)) +
geom_hline(yintercept= 0, colour= 'grey') +
theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3))


# Saved as the second output (63 x 63 mm).
ggsave(snakemake@output[[2]], p1, width= 63, height= 63, units= 'mm', dpi= 300)

# FST_AFR_EAS of the two lead variants, and their rows of the 'enrichment'
# column of xp (the *_pvalue names hold enrichment ratios, not p-values).
# The identifier column is v_ids; it is spelled out here instead of relying on
# partial matching of `$` through the shorter name v_id.
ga_fst= d[d$v_ids== '3:123112292', 'FST_AFR_EAS']
bw_fst= d[d$v_ids== '3:123065778', 'FST_AFR_EAS']
ga_fst_pvalue= xp[xp$haplotype== 'Gestational duration' & xp$ancestries== 'FST_AFR_EAS', 'enrichment']
bw_fst_pvalue= xp[xp$haplotype== 'Birth weight' & xp$ancestries== 'FST_AFR_EAS', 'enrichment']

# Mirrored density plot for the AFR-EAS comparison: gestational-duration
# controls above the x axis, birth-weight controls below it (negated density),
# with a vertical segment and rs label at each lead variant's FST.
p1= ggplot(df1, aes(x=x) ) +
geom_density( aes(x = FST_AFR_EAS, y = ..density..), fill= colorBlindBlack8[4], colour= colorBlindBlack8[4]) +
annotate('text', x=0.72, y=7, label="Gestational \nduration", color= colorBlindBlack8[4], size= as1/ .pt, fontface = 'bold') +
annotate('text', x=0.72, y= -7, label="Birth weight", color= colorBlindBlack8[2], size= as1/ .pt, fontface = 'bold') +
annotate('text', x=ga_fst, y=5 + 0.5, label="rs28654158", color= colorBlindBlack8[4], size= as/ .pt) +
annotate('text', x=bw_fst, y= -5 - 0.5, label="rs11708067", color= colorBlindBlack8[2], hjust= 0, size= as/ .pt) +
annotate('text', x= 0.75, y= 1, label= paste0('Enrichment x', round(ga_fst_pvalue, 1)), color= colorBlindBlack8[4], size= as/ .pt) +
annotate('text', x= 0.75, y= -1, label= paste0('Enrichment x', round(bw_fst_pvalue, 1)), color= colorBlindBlack8[2], size= as/ .pt) +
geom_density( aes(x = FST_AFR_EAS_bw, y = -..density..), fill= colorBlindBlack8[2], colour= colorBlindBlack8[2]) +
scale_x_continuous(expand= c(0, 0)) +
theme_cowplot(font_size = 8) +
xlab("Fst Africans - East Asians") +
scale_y_continuous(limits= c(-11, 11), breaks= c(-10, -5, 0, 5, 10), labels= c(10, 5, 0, 5, 10)) +
ylab('Density') +
geom_segment(aes(x = ga_fst, y = 0, xend = ga_fst, yend = 5)) +
geom_segment(aes(x = bw_fst, y = 0, xend = bw_fst, yend = -5))+
geom_hline(yintercept= 0, colour= 'grey') +
theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3))



# Saved as the third output (63 x 63 mm).
ggsave(snakemake@output[[3]], p1, width= 63, height= 63, units= 'mm', dpi= 300)

# Write the control FST table (fourth output) and the per-variant summary (fifth output), tab-separated.
fwrite(df1, snakemake@output[[4]], sep= '\t')
fwrite(xp, snakemake@output[[5]], sep= '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel showtext From line 1 of figures/ADCY5_FST_AFR_EUR.R

library(dplyr)
library(data.table)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(tidyr)
library(showtext)
# Eight-colour palette; entries are picked by index in the plot below.
colorBlindBlack8  <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Register the Arial font files under the family name "arial".
font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

# Enable showtext rendering with its dpi option set to 300 (the figure is saved at dpi= 300).
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# ---- UKBB colocalization results (input 1) and manifest (input 2) ----
d <- fread(snakemake@input[[1]])
names(d)[8] <- 'phenocode'

# Manifest: keep the listed trait types with saige_heritability_EUR above 0.01;
# for repeated phenocodes the row with the highest heritability is kept.
trait_list <- c('biomarkers', 'continuous', 'icd10')
mani <- fread(snakemake@input[[2]]) %>%
  filter(trait_type %in% trait_list, saige_heritability_EUR > 0.01)
mani <- mani[order(mani$saige_heritability_EUR, decreasing = TRUE), ]
mani <- mani[!duplicated(mani$phenocode), ]

# The key used for the join is '<trait_type>_<phenocode>'; each description is kept once.
mani$phenocode <- paste(mani$trait_type, mani$phenocode, sep = '_')
mani <- mani[, c('phenocode', 'description')]
mani <- mani[!duplicated(mani$description), ]

d <- inner_join(d, mani, by = 'phenocode')
d$cohort <- 'UKBB'

# ---- FINNGEN colocalization results (input 3) and manifest (input 4) ----
x <- fread(snakemake@input[[3]])
names(x)[8] <- 'phenocode'
mani <- fread(snakemake@input[[4]])[, c('phenocode', 'name')]
names(mani) <- c('phenocode', 'description')
mani <- mani[!duplicated(mani$description), ]

x <- inner_join(x, mani, by = 'phenocode')
x$cohort <- 'FINNGEN'

# ---- Stack both cohorts, sort by PP.H4 (ascending) and keep the rows with
# PP.H4 > 0.01 and PP.H3 + PP.H4 > 0.75 ----
d <- rbind(d, x)
d <- d[order(d$PP.H4.abf, decreasing = FALSE), ]
d <- filter(d, PP.H4.abf > 0.01, PP.H4.abf + PP.H3.abf > 0.75)

# Append empty_bar all-NA rows per preg_trait level; after sorting by preg_trait
# they leave a gap of empty bars at the end of each group.
d$preg_trait= factor(d$preg_trait)
empty_bar <- 5
to_add <- data.frame( matrix(NA, empty_bar*nlevels(d$preg_trait), ncol(d)) )
colnames(to_add) <- colnames(d)
to_add$preg_trait <- rep(levels(d$preg_trait), each=empty_bar)
d <- rbind(d, to_add)
d <- d %>% arrange(preg_trait)


# Running bar index, used as the x position.
d$id= seq(1, nrow(d))

# Label angle per bar: the centre of bar `id` as a fraction of the full circle,
# measured from 90 degrees. Labels with an angle below -90 are rotated by 180
# degrees and right-aligned (hjust 1) instead of left-aligned (hjust 0).
label_data= d
number_of_bar <- nrow(label_data)
angle <-  90 - 360 * (label_data$id-0.5) /number_of_bar
label_data$hjust<-ifelse( angle < -90, 1, 0)


label_data$angle<-ifelse(angle < -90, angle+180, angle)

#d$id= factor(d$id, levels= d$id[order(d$PP.H4.abf)])

# First and last id of the NA (gap) rows of each preg_trait group, plus their midpoint.
base_data= d %>%
  group_by(preg_trait) %>%
  filter(is.na(PP.H4.abf)) %>%
  summarize(start=min(id), end=max(id) ) %>%
  rowwise() %>%
  mutate(title=mean(c(start, end)))

# y positions of the grey reference lines drawn in the gaps.
# NOTE(review): length 2 presumably matches two preg_trait groups -- confirm.
arc100= rep(1, 2)
arc75= rep(0.75, 2)
arc50= rep(0.50, 2)
arc25= rep(0.25, 2)

# Shorten long phenotype descriptions used as bar labels. Each rule is
# c(pattern, replacement) and the rules are applied in this order.
label_rules <- list(
  c('Other diabetes', 'Other diabetes'),
  c('Non-insulin-dep', 'Non-insulin dependent diabetes'),
  c('Diabetes, varying def', 'Diabetes, wide'),
  c('Intestinal adhesions', 'Intestinal adhesions'),
  c('Type 2 diabetes, strict', 'Type 2 diabetes'),
  c('Type 2 diabetes with other specified/multiple/unspecified complications', 'Type 2 diabetes with complications'),
  c('and lymph nodes, not elsewhere classified', 'Diseases of veins'),
  c('Diabetes, insuline treatment', 'Diabetes, insuline treatment'),
  c('Creatinine', 'Creatinine in urine')
)
for (rule in label_rules) {
  label_data$description <- ifelse(grepl(rule[1], label_data$description), rule[2], label_data$description)
}

# Circular bar plot of PP.H4 per phenotype, coloured by preg_trait, with alpha
# mapped to PP.H4 (range 0.4 to 1). Grey reference lines at 0.25/0.50/0.75/1 are
# drawn over the gap rows of each group and labelled at the middle of the first
# two gaps. Only bars with PP.H4 > 0.75 get a text label. All legends are
# switched off with guide= 'none' (guide= FALSE is deprecated in ggplot2).
p1= ggplot(d, aes(as.factor(id), PP.H4.abf, fill= preg_trait, alpha= PP.H4.abf)) +
geom_bar(stat="identity", colour= NA) +
scale_alpha_continuous(range= c(0.4, 1), guide= 'none') +
geom_segment(data=base_data, aes(x = end, y = arc100, xend = start, yend = arc100), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) +
  geom_segment(data=base_data, aes(x = end, y = arc75, xend = start, yend = arc75), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) +
  geom_segment(data=base_data, aes(x = end, y = arc50, xend = start, yend = arc50), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) +
  geom_segment(data=base_data, aes(x = end, y = arc25, xend = start, yend = arc25), colour = "grey", alpha=1, size=0.3 , inherit.aes = FALSE ) +
  annotate("text", x = ((base_data$end[1] + base_data$start[1]) / 2), y = c((0.25 + 0.075) , (0.50 + 0.075), (0.75 + 0.075) , (1 + 0.075)), label = c("0.25", "0.50", "0.75", "1") , color="grey", size=2.5 , angle=13, fontface="bold", hjust= 0.5) +
   annotate("text", x = ((base_data$end[2] + base_data$start[2]) / 2), y = c((0.25 + 0.075) , (0.50 + 0.075), (0.75 + 0.075) , (1 + 0.075) ), label = c("0.25", "0.50", "0.75", "1") , color="grey", size=2.5, angle=13, fontface="bold", hjust=0.5) +
ylim(-0.2, 2) + # Limits of the plot = very important. The negative value controls the size of the inner circle, the positive one is useful to add size over each bar
theme_cowplot() +
scale_fill_manual(values=colorBlindBlack8[c(2,4)], guide= 'none') +
scale_colour_manual(values=colorBlindBlack8[c(2,4)], guide= 'none') +
#    plot.margin = margin(t= -200, r= -40, b= -200, l=-70, unit= 'mm')   ) +
labs(x=NULL, y=NULL)  +
  coord_polar(start = 0) +
geom_text(data=filter(label_data, PP.H4.abf> 0.75), aes(x= factor(id), y=PP.H4.abf + 0.01, label=description, hjust=hjust), color="black", fontface="bold",alpha=0.6, size=6/ .pt, angle= filter(label_data, PP.H4.abf> 0.750)$angle, inherit.aes = FALSE) +
theme(axis.line=element_blank(),axis.text.x=element_blank(),
          axis.text.y=element_blank(),axis.ticks=element_blank(),
          axis.title.x=element_blank(),
          axis.title.y=element_blank(),legend.position="none",
          panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
          panel.grid.minor=element_blank(),plot.background=element_blank(),
axis.ticks.length = unit(0, "mm"))

# Save the figure (first output) and the plotted table, gap rows included (second output).
ggsave(snakemake@output[[1]], plot= p1, width= 127, height= 127, dpi= 300, units= 'mm')

fwrite(d, snakemake@output[[2]], sep= '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel showtext From line 1 of figures/ADCY5_pheWAS.R

library(scales)
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)
library(fmsb)

# Eight-colour palette; entries are picked by index in the charts below.
colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Register the Arial font files under the family name "arial".
font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

# Enable showtext rendering with its dpi option set to 300 (the charts are saved at res= 300).
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# All Snakemake inputs are processed; the trailing comment shows an earlier local alternative.
flist= snakemake@input #list.files('/mnt/hdd/common/pol/metaGWAS/colocalization/GAraw/', 'pph_BW_', full.names=T)

# Read one colocalization summary file and return its PP.H4 per locus.
#
# x: path to a 'pph_<name>.txt' file with the columns PP.H0.abf ... PP.H4.abf
#    and locus.
#
# Returns a table with three columns: <name> (PP.H4), <name>_sloc (PP.H3 + PP.H4)
# and locus, where <name> is the file name without the 'pph_' and '.txt' parts
# and locus is the part of the original locus string after the '_' separator.
funk= function(x){
d= fread(x)
# Drop rows in which all posterior probabilities are zero.
d= filter(d, PP.H1.abf + PP.H2.abf + PP.H3.abf + PP.H4.abf + PP.H0.abf> 0)
# The name is taken from the file name itself rather than from a fixed position
# in the path, so it no longer depends on the directory depth; both pieces are
# matched as fixed strings instead of regular expressions.
fname= gsub('.txt', '', gsub('pph_', '', basename(x), fixed= TRUE), fixed= TRUE)
d= separate(d, locus, into= c('chrom', 'locus'), sep= '_')
d$sloc= d$PP.H4.abf + d$PP.H3.abf
d= select(d, PP.H4.abf, sloc, locus)

names(d)= c(fname, paste0(fname, '_sloc'), 'locus')
return(d)
}

# One table per input file, merged on locus (loci missing from a file get NA),
# then sorted by the BW_maternal_effect column.
d= lapply(flist, funk)

d= reduce(d, full_join, by = "locus")

d= arrange(d, BW_maternal_effect)

# Spider plot maternal

# One column per locus; first data row is PP.H4, second is PP.H3 + PP.H4.
x= as.data.frame(matrix(d$BW_maternal_effect, ncol= nrow(d)))
x=rbind(x, as.data.frame(matrix(d$BW_maternal_effect_sloc, ncol= nrow(d))))
names(x)= d$locus

# The second row name differs from the first only by a trailing space.
rownames(x)= c('BW maternal effect', 'BW maternal effect ')

# Prepend a row of 1s and a row of 0s.
# NOTE(review): presumably these are the axis maximum and minimum rows that radarchart() expects -- confirm.
x= rbind(rep(1,nrow(d)) , rep(0,nrow(d)) , x)


# Draw the chart into the first output (60 x 60 mm PNG at 300 dpi) with no margins.
png(snakemake@output[[1]], width= 60, height= 60, res= 300, units= 'mm')
par(mar=c(0,0,0,0))

radarchart(x, axistype= 0, 

    #custom polygon
    pcol= c(colorBlindBlack8[4], colorBlindBlack8[2]) , pfcol= c(alpha(colorBlindBlack8[4], 0.4), alpha(colorBlindBlack8[2], 0.4)) , plwd=1, pty= 32, plty= 1,
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="#525252", caxislabels= seq(0, 1, 0.25), caxisoffset= 0.1, cglwd=0.8, calcex= 0.4,

    #custom labels
    vlcex= 0.43
    )

dev.off()


# Spider plot fetal

# One column per locus; first data row is PP.H4, second is PP.H3 + PP.H4.
x= as.data.frame(matrix(d$BW_fetal_effect, ncol= nrow(d)))
x=rbind(x, as.data.frame(matrix(d$BW_fetal_effect_sloc, ncol= nrow(d))))
names(x)= d$locus

# The second row name differs from the first only by a trailing space.
rownames(x)= c('BW fetal effect', 'BW fetal effect ')

# Prepend a row of 1s and a row of 0s.
# NOTE(review): presumably these are the axis maximum and minimum rows that radarchart() expects -- confirm.
x= rbind(rep(1,nrow(d)) , rep(0,nrow(d)) , x)


# Draw the chart into the second output (60 x 60 mm PNG at 300 dpi) with no margins.
png(snakemake@output[[2]], width= 60, height= 60, res= 300, units= 'mm')
par(mar=c(0,0,0,0))

radarchart(x, axistype= 0,

    #custom polygon
    pcol= c(colorBlindBlack8[4], colorBlindBlack8[2]) , pfcol= c(alpha(colorBlindBlack8[4], 0.4), alpha(colorBlindBlack8[2], 0.4)) , plwd=1, pty= 32, plty= 1,
    #custom the grid
    cglcol="grey", cglty=1, axislabcol="#525252", caxislabels= seq(0, 1, 0.25), caxisoffset= 0.1, cglwd=0.8, calcex= 0.4,

    #custom labels
    vlcex= 0.43
    )

dev.off()

R tidyverse dplyr data.table tidyr cowplot ggrepel knitr scales showtext From line 1 of figures/BW_coloc_spider.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# Suppress warnings for the rest of the script.
options(warn=-1)

# Eight-colour palette; entries are picked by index in the plots below.
colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Register the Arial font files under the family name "arial".
font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

# Enable showtext rendering with its dpi option set to 300 (the figures are saved at dpi= 300).
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Text sizes used by the annotations below (passed as size= as1/ .pt).
as= 8
as1= 8

# Locus table (input 1): chr is stripped of its 'chr' prefix and stored as a
# character, and every row gets a running locus number.
z= fread(snakemake@input[[1]])
z$chr= as.numeric(gsub('chr', '', z$chr))
z$chr= as.character(z$chr)
z$locus= 1:nrow(z)


# Read one conditional-analysis result file and return, for every locus of the
# global table z, the variant with the smallest p inside the locus.
#
# infile: path to the result file. Its header is shifted by one column, so the
#   names of columns 2-12 are moved onto columns 1-11 and only those are kept.
#   The columns used afterwards are SNP ('chr:POS:REF:EFF'), p, b and bC.
#
# Returns an ungrouped table with one row per locus, the columns of z joined on,
# and a GWAS column set to 'BW_maternal_GA' or 'BW_fetal_GA' depending on the
# file name.
funk= function(infile){
d= fread(infile)
names(d)[1:11]= names(d)[2:12]
d=d[, 1:11]

d= filter(d, p<5e-6)

# Orient every variant so that b is non-negative; bC gets the same sign change.
d$bC= ifelse(d$b< 0, -1 * d$bC, d$bC)
d$b= ifelse(d$b< 0, -1 * d$b, d$b)

d= separate(d, SNP, into= c('chr', 'POS', 'REF', 'EFF'), sep= ':')

d$POS= as.numeric(d$POS)
d$chr= as.character(d$chr)
d$GWAS= ifelse(grepl('BW_maternal_effect_GA', infile), 'BW_maternal_GA', 'BW_fetal_GA')
# Pair every variant with the loci on its chromosome, then keep the pairs where
# the variant lies inside the locus. The join key is passed as `by`; `on` is not
# an argument of inner_join.
d= inner_join(d, z, by= 'chr')
d= d %>% filter(POS>= start, POS< stop)

# Smallest p per locus; the grouping is dropped before returning.
d= group_by(d, locus) %>% arrange(p) %>% filter(row_number()== 1) %>% ungroup()

return(d)

}

# Process every Snakemake input whose path contains 'BW' and stack the results.
df_list= lapply(snakemake@input[grepl('BW', snakemake@input)], funk)

d= do.call('rbind', df_list)

# Relative change of the effect size: (bC - b) / b.
d$beta_dif= with(d, (bC - b) / b)


# Relative changes split by GWAS label, for the mirrored density plot below.
mor= filter(d, GWAS== 'BW_maternal_GA') %>% pull(beta_dif)
barn= filter(d, GWAS== 'BW_fetal_GA') %>% pull(beta_dif)

# Mirrored density plot of the relative change in effect size: maternal values
# above the x axis, fetal values below it (negated density). Saved to the first output.
p1= ggplot() +
geom_density( mapping=aes(x = mor, y = ..density..), fill= colorBlindBlack8[3], colour= colorBlindBlack8[3]) +
annotate('text', x=0.1, y= 3, label= "Maternal", color= colorBlindBlack8[3], size= as1/ .pt, fontface = 'bold') +
annotate('text', x=0.1, y= -15, label="Fetal", color= colorBlindBlack8[8], size= as1/ .pt, fontface = 'bold') +
geom_density(mapping= aes(x = barn, y = -..density..), fill= colorBlindBlack8[8], colour= colorBlindBlack8[8]) +
  theme_cowplot(font_size = 8) +
scale_x_continuous(expand= c(0, 0)) +
  xlab("Relative difference in effect size on \nbirth weight after conditioning") +
ylab('Density') +
geom_hline(yintercept= 0, colour= 'grey') +
theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3))


ggsave(snakemake@output[[1]], plot= p1, width= 70, height= 70, units= 'mm', dpi= 300)

# Overlaid, semi-transparent densities of the relative change in effect size,
# one per GWAS label. The figure is saved to the third output and the underlying
# table to the second output.
p1= ggplot(d, aes(beta_dif, group= GWAS, fill= GWAS)) +
geom_hline(yintercept= 0, colour= 'black') +
geom_density(color= NA) +
annotate('text', x=-0.75, y= 1, label= "Maternal", color= colorBlindBlack8[3], size= as1/ .pt, fontface = 'bold') +
annotate('text', x=0.1, y= 15, label="Fetal", color= colorBlindBlack8[8], size= as1/ .pt, fontface = 'bold') +
theme_cowplot(font_size= 8) +
#scale_colour_manual(values= alpha(colorBlindBlack8[c(8,3)], 0.5), guide= 'none') +
scale_fill_manual(values= alpha(colorBlindBlack8[c(8,3)], 0.5), guide= 'none') +
scale_x_continuous(expand= c(0, 0)) +
scale_y_continuous(expand=c(0, 0.5)) +
  xlab("Relative difference in effect size on \nbirth weight after conditioning") +
ylab('Density') +
theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks= element_line(size= 0.3))

ggsave(snakemake@output[[3]], plot= p1, width= 70, height= 70, units= 'mm', dpi= 300)

fwrite(d, snakemake@output[[2]], sep= '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/BW_conditioning.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# Silence all warnings for the rest of the script.
options(warn=-1)

# Eight-colour palette; the plots below pick colours from it by position.
colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Register Arial (regular + bold files) with showtext and turn showtext
# rendering on at 300 dpi, the same dpi that is passed to ggsave() below.
font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# as1 is the label size used as `as1/ .pt` in the annotate() calls below;
# `as` is not referenced in the visible part of this script.
as= 8
as1= 8

# Variant table; funk() below reads its `origin` and `SNP` columns.
z= fread(snakemake@input[[1]])

# Variant ID of the form chr:pos:allele:allele, with the two alleles placed
# in ascending string order (ref and eff are swapped when ref > eff).
z$SNP= with(z, ifelse(ref> eff, paste(chr, pos, eff, ref, sep= ':'), paste(chr, pos, ref, eff, sep= ':')))

# Load one result file and keep the variants that the global table `z` lists
# for the matching effect origin.
#
# infile: path to the result file. A path containing 'BW_maternal_effect_GA'
#         is labelled 'BW_maternal_GA' and matched against origin
#         'Maternal Only'; any other path is labelled 'BW_fetal_GA' and
#         matched against origin 'Fetal Only'.
# Returns the filtered table, with `b` and `bC` sign-flipped together on rows
# where `b` was negative, plus a `GWAS` label column.
funk <- function(infile) {
  d <- fread(infile)

  # Names 2..12 are moved onto columns 1..11 and the remaining columns are
  # dropped (presumably the header is offset by one column - TODO confirm).
  names(d)[1:11] <- names(d)[2:12]
  d <- d[, 1:11]

  # Flip both estimates on the same rows; the mask is taken before `b` changes.
  negative_b <- d$b < 0
  d$bC <- ifelse(negative_b, -1 * d$bC, d$bC)
  d$b <- ifelse(negative_b, -1 * d$b, d$b)

  if (grepl('BW_maternal_effect_GA', infile)) {
    gwas_label <- 'BW_maternal_GA'
    origin_label <- 'Maternal Only'
  } else {
    gwas_label <- 'BW_fetal_GA'
    origin_label <- 'Fetal Only'
  }
  d$GWAS <- gwas_label

  origin_rows <- z[z$origin == origin_label, ]
  filter(d, SNP %in% origin_rows$SNP)
}

# Load every Snakemake input whose path contains 'BW' with funk() and stack
# the per-file tables row-wise.
bw_inputs <- snakemake@input[grepl('BW', snakemake@input)]
d <- do.call('rbind', lapply(bw_inputs, funk))

# Relative change of the effect estimate: (bC - b) / b.
d$beta_dif <- with(d, (bC - b) / b)

# One vector of relative differences per GWAS label.
maternal_dif <- d %>%
  filter(GWAS == 'BW_maternal_GA') %>%
  pull(beta_dif)
fetal_dif <- d %>%
  filter(GWAS == 'BW_fetal_GA') %>%
  pull(beta_dif)

# Pieces shared by both figures.
maternal_col <- colorBlindBlack8[3]
fetal_col <- colorBlindBlack8[8]
x_title <- "Relative difference in effect size on \nbirth weight after conditioning"
thin_axes <- theme(
  axis.line.x = element_line(size = 0.3),
  axis.line.y = element_line(size = 0.3),
  axis.ticks = element_line(size = 0.3)
)

# Figure 1: mirrored densities, maternal drawn above zero and fetal below.
mirror_plot <- ggplot() +
  geom_density(
    mapping = aes(x = maternal_dif, y = ..density..),
    fill = maternal_col, colour = maternal_col
  ) +
  annotate('text', x = 0.1, y = 3, label = "Maternal",
           color = maternal_col, size = as1 / .pt, fontface = 'bold') +
  annotate('text', x = 0.1, y = -15, label = "Fetal",
           color = fetal_col, size = as1 / .pt, fontface = 'bold') +
  geom_density(
    mapping = aes(x = fetal_dif, y = -..density..),
    fill = fetal_col, colour = fetal_col
  ) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab(x_title) +
  ylab('Density') +
  geom_hline(yintercept = 0, colour = 'grey') +
  thin_axes

ggsave(snakemake@output[[1]], plot = mirror_plot,
       width = 70, height = 70, units = 'mm', dpi = 300)

# Figure 2: both densities overlaid on the same axis, semi-transparent fill.
overlay_plot <- ggplot(d, aes(beta_dif, group = GWAS, fill = GWAS)) +
  geom_hline(yintercept = 0, colour = 'black') +
  geom_density(color = NA) +
  annotate('text', x = -0.55, y = 1, label = "Maternal",
           color = maternal_col, size = as1 / .pt, fontface = 'bold') +
  annotate('text', x = 0.1, y = 10, label = "Fetal",
           color = fetal_col, size = as1 / .pt, fontface = 'bold') +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5), guide = 'none') +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.5)) +
  xlab(x_title) +
  ylab('Density') +
  thin_axes

ggsave(snakemake@output[[3]], plot = overlay_plot,
       width = 70, height = 70, units = 'mm', dpi = 300)

# Table behind both figures.
fwrite(d, snakemake@output[[2]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/BW_conditioning_top.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Rows of the first input whose p1 contains 'GAraw' and whose p2 contains 'BW'.
d <- fread(snakemake@input[[1]])
d <- filter(d, grepl('GAraw', p1), grepl('BW', p2))
d$p1 <- 'Gestational duration (maternal)'

# Rows of the second input whose p1 contains 'GA_fetal' and whose p2 contains 'BW'.
x <- fread(snakemake@input[[2]])
x <- filter(x, grepl('GA_fetal', p1), grepl('BW', p2))
x$p1 <- 'Gestational duration (fetal)'

d <- rbind(d, x)

# Trait id: the text after 'LDSC/' in the p2 path, minus the sumstats suffix.
# vapply over strsplit works on the column directly (no data.frame -> matrix
# coercion as with apply()) and returns character(0) for an empty table.
after_ldsc <- vapply(
  strsplit(d$p2, 'LDSC/'),
  function(parts) parts[2],
  character(1)
)
d$p2 <- gsub('.txt.sumstats.gz', '', after_ldsc)

# Display labels per trait id. Any id not listed is labelled 'Endometriosis'
# and a missing id stays NA, exactly as the nested ifelse() chain this table
# replaces.
trait_labels <- c(
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal \nonly',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal \nonly',
  PCOS = 'Polycystic ovary syndrome'
)
d$trait <- ifelse(d$p2 %in% names(trait_labels), trait_labels[d$p2], 'Endometriosis')
d$trait[is.na(d$p2)] <- NA

# Point estimate with a +/- 1.96 * se interval per trait, the two
# gestational-duration series dodged side by side.
p1 <- ggplot(d, aes(trait, rg, colour = p1)) +
  geom_pointrange(
    aes(ymin = rg - se * 1.96, ymax = rg + se * 1.96),
    position = position_dodge(0.3), width = 1/10, size = 0.4, fatten = 0.6
  ) +
  # guide = 'none' replaces the deprecated guide = FALSE.
  scale_colour_manual(values = colorBlindBlack8[c(8, 3)], guide = 'none') +
  theme_cowplot(font_size = 8) +
  scale_y_continuous(limits = c(-0.2, 0.8), breaks = seq(-0.2, 0.8, 0.2)) +
  ylab('Genetic correlation') +
  xlab('Effect on birth weight') +
  geom_hline(yintercept = 0, size = 0.3) +
  geom_hline(yintercept = c(-0.2, seq(0.2, 0.8, 0.2)), colour = 'grey',
             linetype = 'dashed', alpha = 0.5, size = 0.2) +
  theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks = element_line(size = 0.3))

ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 60, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[2]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/BW_genetic_correlations.R

library(data.table)
library(dplyr)
library(cowplot)
library(ggrepel)
library('showtext')


colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])
x <- fread(snakemake@input[[2]])
d <- inner_join(d, x, by = 'Name')

# Shuffle the rows, then sort by Category; the factor levels below freeze the
# resulting row order as the x-axis order.
d <- d[sample(nrow(d)), ]
d <- d[order(d$Category, decreasing = FALSE), ]

d$Name <- factor(d$Name, levels = unique(d$Name))

# Display name: everything up to the last '.' removed, underscores -> spaces.
d$Name2 <- gsub('_', ' ', gsub("^.*\\.", "", d$Name))
d$Name2 <- factor(d$Name2, levels = unique(d$Name2))

# Both output figures share every layer; they differ only in whether the
# legend is drawn.
enrichment_plot <- ggplot(d, aes(Name2, -log10(Coefficient_P_value), colour = Category, fill = Category)) +
  geom_point(size = 2, shape = 21, stroke = 0.1) +
  xlab('Tissues') +
  ylab('-log10(Enrichment)') +
  theme_cowplot(font_size = 8) +
  geom_hline(yintercept = -log10(0.05), colour = '#d9d9d9') +
  theme(axis.text.x = element_blank(),
        axis.ticks = element_blank(),
        panel.grid.major = element_line(colour = 'grey', size = 0.05),
        panel.grid.major.x = element_blank()) +
  # Label rows with Coefficient_P_value < 0.05. show.legend is a layer
  # argument (it must sit outside aes()) and is the current name of the
  # deprecated show_guide.
  geom_text_repel(
    data = filter(d, Coefficient_P_value < 0.05),
    aes(Name2, -log10(Coefficient_P_value), colour = Category, label = Name2),
    show.legend = FALSE
  )

# Figure 1: without legend.
p1 <- enrichment_plot + theme(legend.position = "none")
ggsave(snakemake@output[[1]], plot = p1, width = 120, height = 90, units = 'mm', dpi = 300)

# Figure 2: with legend.
p2 <- enrichment_plot
ggsave(snakemake@output[[2]], plot = p2, width = 120, height = 90, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table cowplot ggrepel showtext From line 1 of figures/cell_type_enrichment.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(ggtern)
options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

shbg <- fread(snakemake@input[[1]])
testo <- fread(snakemake@input[[2]])

#shbg$locus= gsub("^.*\\_","", shbg$locus)
#testo$locus= gsub("^.*\\_","", testo$locus)

# Colours of the top, right and left ternary axes.
colT <- colorBlindBlack8[4]
colR <- colorBlindBlack8[1]
colL <- colorBlindBlack8[2]

# Ternary plot of colocalisation posteriors, one point per row of `pp`.
#
# pp: table with columns PP.H0.abf .. PP.H4.abf.
# The three plotted coordinates are H0 + H1 + H2 (left axis), H4 (top axis)
# and H3 (right axis). Returns the ggtern plot object.
plot_coloc_ternary <- function(pp) {
  pp$One_or_Other <- pp$PP.H0.abf + pp$PP.H1.abf + pp$PP.H2.abf
  pp$coloc <- pp$PP.H4.abf
  pp$shared_locus <- pp$PP.H3.abf

  ggtern(pp, aes(One_or_Other, coloc, shared_locus)) +
    geom_point(colour = colorBlindBlack8[8], fill = colorBlindBlack8[8], shape = 21) +
    # guide = 'none' replaces the deprecated guide = F.
    scale_alpha_continuous(range = c(0.6, 1), guide = 'none') +
    scale_size_continuous(range = c(.001, 10), guide = 'none') +
    theme_custom(tern.plot.background = NULL, tern.panel.background = 'white',
                 col.T = colT, col.L = colL, col.R = colR, col.grid.minor = "white") +
    Tarrowlab("Probability of shared causal variant") +
    Larrowlab("Probability of locus not shared") +
    Rarrowlab("Probability of shared locus (distinct causal variant)") +
    theme_showarrows() +
    theme_notitles() +
    theme(text = element_text(family = "arial", size = 10),
          tern.axis.arrow.T = element_blank(),
          tern.axis.arrow.L = element_blank(),
          tern.axis.arrow.R = element_blank(),
          tern.axis.text.T = element_text(color = colT),
          tern.axis.text.L = element_text(color = colL),
          tern.axis.text.R = element_text(color = colR),
          tern.axis.arrow.text.T = element_text(color = colT),
          plot.margin = margin(0, 0, 0, 0, "cm"),
          tern.axis.arrow.text.L = element_text(color = colL),
          tern.axis.arrow.text.R = element_text(color = colR),
          tern.panel.grid.major = element_line(linetype = 6, size = 0.3))
}

p1 <- plot_coloc_ternary(shbg)
p2 <- plot_coloc_ternary(testo)

ggsave(snakemake@output[[1]], plot = p1, width = 95, height = 95, units = 'mm', dpi = 300)

ggsave(snakemake@output[[2]], plot = p2, width = 95, height = 95, units = 'mm', dpi = 300)

R dplyr data.table tidyr cowplot ggrepel knitr showtext ggtern From line 1 of figures/coloc_sex_hormones.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(ggdendro)
library(gridExtra)
library(dendextend)
library(plyr)
library(ggtree)
library(scales)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Per-variant table joined to its nearest-gene annotation on rsid.
d <- fread(snakemake@input[[1]])
gene_lookup <- fread(snakemake@input[[2]], select = c('nearestGene', 'RSID'))
d <- inner_join(d, gene_lookup, by = c('rsid' = 'RSID'))

# Four loci are displayed with two gene names.
d$GENE <- d$nearestGene
d$GENE <- with(d,
  ifelse(GENE == 'CDC42', 'CDC42/ WNT4',
  ifelse(GENE == 'HIVEP3', 'HIVEP3/ EDN2',
  ifelse(GENE == 'TET3', 'TET3/ DGUOK-AS1',
  ifelse(GENE == 'TCEA2', 'TCEA2/ OPRL1', GENE)))))
d$nearestGene <- d$GENE

# Per-rsid overrides of the gene name.
d$nearestGene <- with(d,
  ifelse(rsid == 'rs3129768', 'HLA-DQA1',
  ifelse(rsid == 'rs5991030', 'AGTR2',
  ifelse(rsid == 'rs5930554', 'RAP2C', nearestGene))))
d$nearestGene <- with(d, ifelse(rsid == 'rs6780427', 'KCNAB1', nearestGene))
d$nearestGene <- with(d, ifelse(rsid == 'rs6879092', 'EBF1', nearestGene))

# Axis label "rsid (GENE)": spaces are stripped from the gene name so the
# label splits into exactly two space-separated parts further down.
d$nearestGene <- paste0("(", gsub(' ', '', d$nearestGene), ")")
d$rsid_lab <- with(d, paste(rsid, nearestGene))

# Flip all three haplotype effects on rows where beta_MT is negative; the
# mask is taken before beta_MT itself is changed.
flip <- d$beta_MT < 0
d$beta_PT <- ifelse(flip, -1 * d$beta_PT, d$beta_PT)
d$beta_MNT <- ifelse(flip, -1 * d$beta_MNT, d$beta_MNT)
d$beta_MT <- ifelse(flip, -1 * d$beta_MT, d$beta_MT)

# Long format: one row per variant and haplotype.
d <- gather(d, haplotype, beta, c('beta_MT', 'beta_MNT', 'beta_PT'))

max_beta <- max(d$beta)
min_beta <- min(d$beta)

d$haplotype <- with(d,
  ifelse(haplotype == 'beta_MT', 'Maternal\ntransmitted',
  ifelse(haplotype == 'beta_MNT', 'Maternal\nnon-transmitted',
         'Paternal\ntransmitted')))
d$rsid_lab <- factor(d$rsid_lab, levels = unique(d$rsid_lab))

d$class_name <- factor(d$class_name, levels = c("Maternal", "MF SD", "MF OD", "Fetal MatT", "Fetal"))

# Order by class, then by decreasing probability, and freeze that order as the
# x-axis order. (plyr is attached after dplyr in this script, so arrange() and
# desc() presumably resolve to plyr's versions here - confirm.)
d <- d %>% arrange(class_name, desc(probability)) %>% ungroup()
d$rsid_lab <- factor(d$rsid_lab, levels = unique(d$rsid_lab))

# Plotmath labels: rsid in upright type followed by the gene part in italics.
labs <- sapply(
  strsplit(levels(d$rsid_lab), " "),
  function(x) parse(text = paste0(x[1], "~italic('", x[2], "')"))
)

# Diverging fill scale with white pinned at zero, shared by both figures.
fill_colours <- c(colorBlindBlack8[4], 'white', colorBlindBlack8[2])
fill_stops <- rescale(c(min_beta, 0, max_beta))
paternal_rows <- filter(d, haplotype == 'Paternal\ntransmitted')

# Figure 1: compact heat map, no fill legend, probabilities printed at y = 4.
p1 <- ggplot(d, aes(rsid_lab, haplotype, fill = beta)) +
  theme_cowplot(8) +
  geom_tile() +
  scale_fill_gradientn(colours = fill_colours, values = fill_stops,
                       limits = c(min_beta, max_beta), guide = 'none') +
  coord_equal() +
  scale_x_discrete(labels = labs) +
  theme(axis.title = element_blank(),
        axis.ticks = element_blank(),
        plot.margin = margin(0, 0, 0, 0, "mm"),
        text = element_text(size = 9 / .pt),
        axis.text.y = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.line = element_line(colour = 'black', size = 0.2)) +
  geom_text_repel(data = paternal_rows,
                  aes(x = rsid_lab, y = 4, label = round(probability, 2)),
                  direction = 'y', size = 8 / .pt, box.padding = 0.01)

ggsave(snakemake@output[[1]], plot = p1, width = 180, height = 60, units = 'mm', dpi = 300)

# Figure 2: same tiles with the fill legend at the bottom and the
# probabilities printed at y = -0.05.
p2 <- ggplot(d, aes(rsid_lab, haplotype, fill = beta)) +
  theme_cowplot(8) +
  geom_tile() +
  scale_fill_gradientn(colours = fill_colours, values = fill_stops,
                       limits = c(min_beta, max_beta), name = 'Effect size') +
  coord_equal() +
  scale_x_discrete(labels = labs) +
  theme(axis.title = element_blank(),
        axis.ticks = element_blank(),
        plot.margin = margin(0, 9, 0, 0, "mm"),
        text = element_text(size = 9 / .pt),
        axis.text.y = element_text(hjust = 0.5),
        axis.line = element_line(colour = 'black', size = 0.2),
        legend.position = 'bottom') +
  geom_text_repel(data = paternal_rows,
                  aes(x = rsid_lab, y = -0.05, label = round(probability, 2)),
                  direction = "y", size = 6.5 / .pt)

ggsave(snakemake@output[[2]], plot = p2, width = 180, height = 100, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[3]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel gridExtra knitr scales plyr showtext ggtree dendextend From line 1 of figures/effect_origin_dendrogram.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(ggtern)
options(warn = -1)

x <- fread(snakemake@input[[1]], header = TRUE)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Two rows carry a chrX position instead of an rsid; map them to the rsid used
# for the join below.
x$rsid <- with(x,
  ifelse(rsid == 'chrX:116013571', 'rs5991030',
  ifelse(rsid == 'chrX:132178061', 'rs5930554', rsid)))

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Nearest-gene annotation, joined on rsid.
d <- fread(snakemake@input[[2]], header = TRUE, select = c('RSID', 'ID', 'nearestGene'))

x <- inner_join(x, d, by = c('rsid' = 'RSID'))

# Per-rsid overrides of the gene name.
x$nearestGene <- with(x,
  ifelse(rsid == 'rs3129768', 'HLA-DQA1',
  ifelse(rsid == 'rs5991030', 'AGTR2',
  ifelse(rsid == 'rs5930554', 'RAP2C', nearestGene))))
x$nearestGene <- with(x, ifelse(rsid == 'rs6780427', 'KCNAB1', nearestGene))
x$nearestGene <- with(x, ifelse(rsid == 'rs6879092', 'EBF1', nearestGene))

# Colours of the top, right and left ternary axes.
colT <- colorBlindBlack8[4]
colR <- colorBlindBlack8[1]
colL <- colorBlindBlack8[2]

# Collapse the five class columns into the three plotted coordinates.
x$MF <- x$MF_OD + x$MF_SD
x$Fet <- x$Fetal_MatT + x$Fetal

p1 <- ggtern(x, aes(Maternal, Fet, MF)) +
  geom_point(colour = colorBlindBlack8[8], fill = colorBlindBlack8[8], shape = 21) +
  # guide = 'none' replaces the deprecated guide = F.
  scale_alpha_continuous(range = c(0.6, 1), guide = 'none') +
  scale_size_continuous(range = c(.001, 10), guide = 'none') +
  theme_custom(tern.plot.background = NULL, tern.panel.background = 'white',
               col.T = colT, col.L = colL, col.R = colR, col.grid.minor = "white") +
  Tarrowlab("Fetal only effect") +
  Larrowlab("Maternal only effect") +
  Rarrowlab("Maternal and fetal effect") +
  theme_showarrows() +
  theme_notitles() +
  theme(text = element_text(family = "arial", size = 10),
        tern.axis.arrow.T = element_blank(),
        tern.axis.arrow.L = element_blank(),
        tern.axis.arrow.R = element_blank(),
        tern.axis.text.T = element_text(color = colT),
        tern.axis.text.L = element_text(color = colL),
        tern.axis.text.R = element_text(color = colR),
        tern.axis.arrow.text.T = element_text(color = colT),
        plot.margin = margin(0, 0, 0, 0, "cm"),
        tern.axis.arrow.text.L = element_text(color = colL),
        tern.axis.arrow.text.R = element_text(color = colR),
        tern.panel.grid.major = element_line(linetype = 6, size = 0.3))

ggsave(snakemake@output[[1]], plot = p1, width = 95, height = 95, units = 'mm', dpi = 300)

# NOTE(review): the original also built select(x, rsid, ID, MF, Maternal, Fetal)
# here and never used it; the full table `x` is what is written. The unused
# assignment was dropped - confirm that writing the full table is intended.
fwrite(x, snakemake@output[[2]], sep = '\t')

R dplyr data.table tidyr cowplot ggrepel knitr showtext ggtern From line 1 of figures/effect_origin_ternary.R

library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# Four lead SNP positions are replaced by another position before joining.
d$lead_snp <- with(d,
  ifelse(lead_snp == '1:50958027', '1:50959262',
  ifelse(lead_snp == '9:116929327', '9:116935764',
  ifelse(lead_snp == '5:157896786', '5:157895049',
  ifelse(lead_snp == '1:22511594', '1:22462111', lead_snp)))))

x <- fread(snakemake@input[[2]])
x$lead_snp <- paste(x$CHR, x$POS, sep = ':')

d <- inner_join(d, x, by = 'lead_snp')

# Cap z-scores at 3.5.
d$z_score <- ifelse(d$z_score > 3.5, 3.5, d$z_score)

# Four loci are displayed with two gene names.
d$nearestGene <- with(d,
  ifelse(nearestGene == 'CDC42', 'CDC42/ WNT4',
  ifelse(nearestGene == 'HIVEP3', 'HIVEP3/ EDN2',
  ifelse(nearestGene == 'TET3', 'TET3/ DGUOK-AS1',
  ifelse(nearestGene == 'TCEA2', 'TCEA2/ OPRL1', nearestGene)))))

d <- filter(d, !(annotation %in% c('B2', 'geva_allele_age')))

# Display label per annotation key. A key not listed here is labelled
# 'XPEHH EAS-EUR' and a missing key stays NA.
# NOTE(review): keys are copied verbatim (including 'linsigh'); a differently
# spelled key in the input would silently get the 'XPEHH EAS-EUR' label -
# confirm against the input file.
annotation_labels <- c(
  argweave = 'ARGWEAVE',
  betascore = 'Beta score',
  B2 = '',
  fst_eas_afr = 'Fst AFR-EAS',
  fst_eur_afr = 'Fst AFR-EUR',
  fst_eur_eas = 'Fst EAS-EUR',
  gerp = 'GERP',
  geva_allele_age = 'Alelle age',
  iES_Sabeti = 'iES',
  linsigh = 'LINSIGHT',
  phastCon100 = 'phastCONS100',
  phyloP100 = 'PhyloP',
  xpehh_afr2_eas = 'XPEHH AFR-EAS',
  xpehh_afr2_eur = 'XPEHH AFR-EUR'
)
known_key <- d$annotation %in% names(annotation_labels)
d$annotation <- ifelse(
  is.na(d$annotation), NA,
  ifelse(known_key, annotation_labels[d$annotation], 'XPEHH EAS-EUR')
)

# Tiles coloured by z-score, one row per gene and one column per annotation;
# a '*' marks cells whose pvalue.x is below 0.05.
p1 <- ggplot(d, aes(annotation, nearestGene, fill = z_score)) +
  geom_tile(colour = "white", size = 1) +
  theme_cowplot(font_size = 9) +
  scale_fill_gradient2(low = colorBlindBlack8[2], high = colorBlindBlack8[4],
                       mid = 'white', limits = c(-2, 4)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 0),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()) +
  scale_x_discrete(position = "top") +
  geom_text(data = filter(d, pvalue.x < 0.05),
            aes(annotation, nearestGene, label = '*'), size = 8 / .pt) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA, size = 1),
        plot.margin = unit(c(0, 1, 0, 0), "cm"),
        axis.line = element_blank(),
        axis.text.y = element_text(face = "italic")) +
  coord_equal()

ggsave(snakemake@output[[1]], plot = p1, width = 140, height = 120, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/evo.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn = -1)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Per-cohort estimates (input 1) and the locus table (input 3).
d <- fread(snakemake@input[[1]])
z <- fread(snakemake@input[[3]])

# Meta-analysis estimates (input 2), restricted to the variants present in
# the per-cohort table.
df <- fread(snakemake@input[[2]], select = c('MarkerName', 'Effect', 'StdErr', 'HetISq', 'HetPVal', 'TOTALSAMPLESIZE', 'P-value', 'Allele1', 'Allele2'))
names(df) <- c('SNP', 'BETA', 'SE', 'HetISq', 'HetPval', 'N', 'pvalue', 'A1', 'A2')
df <- filter(df, SNP %in% d$SNP)

df <- separate(df, SNP, into = c('CHR', 'POS', 'Ax1', 'Ax2', 'ID'), sep = ':', remove = FALSE)
# The effect sign is flipped when A2 sorts after A1.
df$BETA <- ifelse(df$A2 > df$A1, -1 * df$BETA, df$BETA)
df$CHR <- ifelse(df$CHR == 'X', '23', df$CHR)
df$CHR <- as.integer(df$CHR)
df$POS <- as.integer(df$POS)
df <- select(df, -c(A1, A2, ID, Ax1, Ax2))

df$cohort <- 'Meta-analysis'
d <- bind_rows(d, df)

z$CHR <- ifelse(z$CHR == 'X', '23', z$CHR)
z$CHR <- as.integer(z$CHR)

# Keep the variants lying strictly inside a locus window (pos1, pos2).
d <- inner_join(d, z, by = 'CHR') %>% filter(POS > pos1, POS < pos2)

d$locus <- paste0('Chr ', d$CHR, ': ', d$nearestGene)

d <- filter(d, !(cohort %in% c('PGPII', 'PGPIII', 'BIB', 'DNBCPTD', 'STORK', 'STORKGROR')))

d$cohort <- paste0(d$cohort, ' (n= ', d$N, ')')

# Rows of the locus named by the `prev_locus` wildcard, largest N first.
prev_locus <- snakemake@wildcards[['prev_locus']]
temp_df <- d[d$nearestGene == prev_locus, ]
temp_df <- temp_df[order(temp_df$N, decreasing = TRUE), ]

# rsid shown in the title; any locus not listed gets 'rs28654158'.
rsid <- switch(prev_locus,
  EEFSEC = 'rs2659685',
  WNT4 = 'rs12037376',
  EBF1 = 'rs2963463',
  AGTR2 = 'rs5991030',
  'rs28654158'
)

# Cohorts are drawn in the row order of temp_df. unique() keeps that order
# and avoids a duplicated-level error should two rows share a cohort label.
# Colour and shape both encode whether HetISq is present.
p1 <- ggplot(temp_df,
             aes(x = factor(cohort, levels = unique(cohort)),
                 y = BETA, ymin = BETA - 1.96 * SE, ymax = BETA + 1.96 * SE,
                 colour = !is.na(HetISq), shape = !is.na(HetISq))) +
  geom_pointrange(size = 0.4) +
  # guide = 'none' replaces the deprecated guide = F.
  scale_shape_manual(values = c(15, 18), guide = 'none') +
  geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(sec.axis = dup_axis()) +
  ggtitle(parse(text = paste0(rsid, ' - ', "~italic('", unique(temp_df$nearestGene), "')"))) +
  coord_flip() +
  scale_colour_manual(values = c(colorBlindBlack8[3], colorBlindBlack8[4]), guide = 'none') +
  theme_cowplot(8) +
  xlab('') +
  ylab('Beta [95% CI]') +
  geom_vline(xintercept = 0, linetype = "dotted", colour = 'grey')

# Figure height grows with the number of cohorts.
ggsave(snakemake@output[[1]], plot = p1, width = 140, height = 30.5 + 50/13 * nrow(temp_df), units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/forest_plot_EEFSEC.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# Two-level label: 'fetal_effect_PGS' -> 'Fetal', every other term -> 'Maternal'.
d$term <- with(d, ifelse(term == 'fetal_effect_PGS', 'Fetal', 'Maternal'))
# Facet titles without the ' PGS' part.
d$outcome <- gsub(' PGS', '', d$outcome)

# Facet styling: no strip background, black frame around each panel.
framed_panels <- theme(
  strip.background = element_blank(),
  panel.border = element_rect(colour = "black", fill = NA)
)

# Estimate with its lo95..up95 interval per term, one facet per outcome.
p1 <- ggplot(d, aes(term, estimate, colour = term)) +
  geom_pointrange(aes(ymin = lo95, ymax = up95)) +
  facet_wrap(vars(outcome)) +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(2, 4)]) +
  theme_cowplot(10) +
  geom_hline(yintercept = 0, colour = 'grey', size = 0.5, linetype = 'dashed') +
  framed_panels +
  ylab('Effect on gestational duration \ngenetic score (95% CI), days') +
  xlab('Birth weight genetic score')

ggsave(snakemake@output[[1]], plot = p1, width = 180, height = 100, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/GA_BW_PGS_correlations.R

library(MendelianRandomization)
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Variant IDs from the first input, plus two IDs added by hand.
top_ga <- fread(snakemake@input[[1]])
top_ga <- c(pull(top_ga, ID), '5:158058432:G:T', '3:156697097:A:G')

# Variant IDs from the second input.
top_ptd <- fread(snakemake@input[[2]])
top_ptd <- pull(top_ptd, ID)

top <- unique(c(top_ga, top_ptd))

# Effect and standard error of those variants in inputs 3 and 4.
ga <- fread(snakemake@input[[3]], select = c('ID', 'BETA', 'SE'))
ga <- filter(ga, ID %in% top)

ptd <- fread(snakemake@input[[4]], select = c('ID', 'BETA', 'SE'))
ptd <- filter(ptd, ID %in% top)
names(ptd) <- c('ID', 'BETA_ptd', 'SE_ptd')

d <- inner_join(ga, ptd, by = 'ID')

# Which list each variant came from; one ID is labelled as shared.
d$GWAS <- with(d,
  ifelse(ID == '5:157895049:C:T', 'Both phenotypes',
  ifelse(ID %in% top_ptd, 'Preterm delivery', 'Gestational duration')))

# Scatter of BETA against BETA_ptd with +/- 1 SE bars in both directions.
#
# dat:        table with BETA, SE, BETA_ptd, SE_ptd and GWAS columns.
# fill_guide: guide for the fill scale ('none' hides the legend, 'legend'
#             draws it).
# The error-bar layers inherit colour from the plot-level mapping; `fill` is
# not an error-bar aesthetic, so it is no longer mapped on those layers.
plot_effects <- function(dat, fill_guide) {
  ggplot(dat, aes(BETA, BETA_ptd, colour = GWAS, fill = GWAS)) +
    geom_errorbarh(aes(xmin = BETA - SE, xmax = BETA + SE), size = 0.1, alpha = 0.7) +
    geom_errorbar(aes(ymin = BETA_ptd - SE_ptd, ymax = BETA_ptd + SE_ptd), size = 0.1, alpha = 0.7) +
    geom_point(size = 2, shape = 21, stroke = 0.1, alpha = 0.7) +
    scale_colour_manual(values = colorBlindBlack8[c(4, 2, 1)], guide = 'none') +
    scale_fill_manual(values = colorBlindBlack8[c(4, 2, 1)], guide = fill_guide) +
    xlab('Maternal effect on gestational duration, days') +
    ylab('Maternal effect on preterm delivery, log(OR)') +
    theme_cowplot(font_size = 8) +
    geom_hline(yintercept = 0, size = 0.1) +
    geom_vline(xintercept = 0, size = 0.1) +
    theme(axis.line.x = element_blank(),
          axis.line.y = element_blank(),
          axis.ticks = element_blank(),
          panel.grid.major = element_line(colour = 'grey', size = 0.05))
}

# Figure 1: without legend.
p1 <- plot_effects(d, fill_guide = 'none')
ggsave(snakemake@output[[1]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

# Figure 2: with the fill legend.
p1 <- plot_effects(d, fill_guide = 'legend')
ggsave(snakemake@output[[2]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/GAraw_vs_allPTD.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn = -1)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Input 1 is read once here; the original read it a second time into `d`,
# which was overwritten by the join below before being used.
pph <- fread(snakemake@input[[1]])
supp_table <- pph
geneb <- fread(snakemake@input[[2]])

gene_dict <- fread(snakemake@input[[3]])

names(gene_dict) <- c('CHR', 'POS1', 'POS2', 'Gene', 'EnsembleID')

# EID: the part of EnsembleID before the first '.'. vapply keeps the result a
# character vector even when the table has no rows.
gene_dict$EID <- vapply(
  strsplit(as.character(gene_dict$EnsembleID), ".", fixed = TRUE),
  function(parts) parts[[1]],
  character(1)
)

d <- inner_join(pph, gene_dict, by = c('protein' = 'EID')) %>% inner_join(., geneb, by = 'Gene')

#supp_table= full_join(pph, gene_dict, by= c('protein'= 'EID')) %>% full_join(., geneb, by= 'Gene') %>% filter(Pvalue< 0.05/ nrow(geneb) | PP.H4.abf>= 0.9)

# Per protein, keep the single SNP with the highest SNP.PP.H4.
z <- fread(snakemake@input[[5]], select = c('z.df1', 'z.df2', 'SNP.PP.H4', 'protein', 'snp'))
z <- arrange(z, desc(SNP.PP.H4))

z <- group_by(z, protein) %>% filter(row_number() == 1)

d <- left_join(d, z, by = c('protein'))

d <- separate(d, snp, into = c('CHR', 'POS', 'REF', 'EFF'), sep = ':', remove = FALSE)

#aa= fread(snakemake@input[[6]])
#names(aa)= c('CHR', 'POS', 'REF', 'ALT', 'AA')
#aa= filter(aa, AA!= '.')
#aa= filter(aa, POS %in% d$POS)

#aa$ID= with(aa, ifelse(REF> ALT, paste(CHR, POS, ALT, REF, sep= ':'), paste(CHR, POS, REF, ALT, sep= ':')))

#d= left_join(d,aa[, c('ID', 'AA')], by= c('snp'= 'ID'))

#d$z.df1= with(d, ifelse(d$AA== d$EFF, -1 * d$z.df1, d$z.df1))
#d$z.df2= with(d, ifelse(d$AA== d$EFF, -1 * d$z.df2, d$z.df2))

#d$direction= with(d, ifelse(z.df1>0 & z.df2 > 0, 'Positive', ifelse(z.df1<0 & z.df2< 0, 'Negative', 'Opposite')))
#d$direction= with(d, ifelse(is.na(d$AA), 'Missing', d$direction))

# 'Same direction' when the two z-scores have the same sign, else 'Opposite'.
d$direction <- with(d, ifelse((z.df1 * z.df2) > 0, 'Same direction', 'Opposite'))

# Gene-based significance threshold: 0.05 divided by the number of rows in
# the gene-based table.
gene_p_threshold <- 0.05 / nrow(geneb)

d$gene_group <- with(d,
  ifelse(PP.H4.abf > 0.9 & Pvalue < gene_p_threshold, 'Colocalize and gene-based significant',
  ifelse(Pvalue < gene_p_threshold & PP.H4.abf <= 0.9, 'Gene based significant',
  ifelse(PP.H4.abf > 0.9 & Pvalue > gene_p_threshold, 'Colocalize',
         'No colocalize and not significant'))))

ga <- fread(snakemake@input[[4]], select = c('ID', 'BETA'))

d <- inner_join(d, ga, by = c('snp' = 'ID'))

p1= ggplot(d, aes(-log10(Pvalue), PP.H4.abf, fill= direction, alpha= (1 + PP.H4.abf) * -log10(Pvalue))) +
geom_point(shape=21, colour= 'black', size= 4) +
theme_cowplot(font_size= 10) +
scale_alpha_continuous(guide= F) +
scale_size_continuous(range = c(.001, 10), guide= F) +
scale_fill_manual(values= c(colorBlindBlack8[c(2, 4)]), guide= F) +
geom_text_repel(data= filter(d, PP.H4.abf> 0.9 | Pvalue< 0.05 / nrow(geneb)), aes(label= Gene), max.overlaps= 20, colour= 'black', size= 6/ .pt, max.time= 10, alpha= 1) +
geom_hline(yintercept= 0.9, colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
geom_vline(xintercept= -log10(0.05/nrow(geneb)), colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
scale_y_continuous(breaks= c(seq(0, 1, 0.25), 0.9), limits= c(0, 1), expand= expansion(mult= c(0.05,0))) +
ylab('Posterior probability of colocalization') +
xlab('-log10(Gene based p-value)')

ggsave(snakemake@output[[1]], plot= p1, width= 95, height= 95, units= 'mm', dpi= 300)

d= select(d, Gene, BETA, direction, Pvalue, PP.H4.abf, Pvalue, z.df1, z.df2)

fwrite(d, snakemake@output[[2]], sep= '\t')

p1= ggplot(d, aes(-log10(Pvalue), PP.H4.abf, fill= direction, alpha= (1 + PP.H4.abf) * -log10(Pvalue))) +
geom_point(shape=21, colour= 'black', size= 4) +
theme_cowplot(font_size= 10) +
scale_alpha_continuous('Legend') +
scale_size_continuous('Legend', range = c(.001, 10)) +
scale_fill_manual('Legend', values= c(colorBlindBlack8[c(2, 4)])) +
geom_text_repel(data= filter(d, PP.H4.abf> 0.9 | Pvalue< 0.05 / nrow(geneb)), aes(label= Gene), max.overlaps= 20, colour= 'black', size= 6/ .pt, max.time= 10, alpha= 1) +
geom_hline(yintercept= 0.9, colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
geom_vline(xintercept= -log10(0.05/nrow(geneb)), colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
scale_y_continuous(breaks= c(seq(0, 1, 0.25), 0.9), limits= c(0, 1), expand= expansion(mult= c(0.05,0))) +
ylab('Posterior probability of colocalization') +
xlab('-log10(Gene based p-value)')

ggsave(snakemake@output[[3]], plot= p1, width= 90, height= 90, units= 'mm', dpi= 300)

fwrite(supp_table, snakemake@output[[4]], sep= '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/gene_based_vs_coloc_iPSC.R

library(scales)
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)
library(fmsb)

# Genetic correlation (rg) with 95% confidence intervals, one point per
# phenotype in column p2 after relabelling.
# Input [[1]]: table with columns p2, rg and se. Output [[1]]: the figure.

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

d <- fread(snakemake@input[[1]])

d$p1 <- 'Gestational\nduration'
# Relabel p2 by substring match; anything else is tagged 'GAnrm' and dropped.
d$p2 <- with(d, ifelse(grepl('postTerm', p2), 'Post-term\ndelivery', ifelse(grepl('allPTD', p2), 'Preterm\ndelivery', 'GAnrm')))

d <- filter(d, p2 != 'GAnrm')

p1 <- ggplot(d, aes(p2, rg, colour = p2)) +
  geom_point() +
  # The lower bound used to be wrapped in I(). ggplot2 >= 3.5.0 treats I() as
  # "bypass the scale", which would misplace that end of the bar, so both
  # bounds are now passed as ordinary data values.
  geom_errorbar(aes(ymin = rg - 1.96 * se, ymax = rg + 1.96 * se), width = .2, position = position_dodge(.9)) +
  theme_cowplot(font_size = 9) +
  scale_fill_manual(values = colorBlindBlack8[c(8, 3, 2)], guide = 'none') +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(8, 3, 2)]) +
  xlab('Phenotype') +
  ylab('Genetic correlation [95% CI]') +
  theme(legend.position = 'none') +
  # Always show at least [-1, 1], widening if an interval extends beyond it.
  ylim(pmin(-1, min(d$rg - 1.96 * d$se)), pmax(1, max(d$rg + 1.96 * d$se))) +
  geom_hline(yintercept = 0, linetype = 'dashed', colour = 'grey', size = 0.5)


ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 80, units = 'mm', dpi = 300)

R ggplot2 tidyverse dplyr data.table tidyr cowplot ggrepel knitr scales showtext From line 1 of figures/genet_correlations_meta.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Heritability estimate (h2) with 95% confidence intervals per phenotype.
# Input [[1]]: table with columns pheno, h2 and se. Output [[1]]: the figure.

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# The input used to be read twice, with the first result discarded; one read
# is enough.
d <- fread(snakemake@input[[1]])

d <- filter(d, pheno != 'GAnrm')

# Any phenotype other than GAraw / allPTD is labelled as post-term delivery.
d$pheno <- with(d, ifelse(pheno == 'GAraw', 'Gestational\nduration', ifelse(pheno == 'allPTD', 'Preterm\ndelivery', 'Post-term\ndelivery')))

p1 <- ggplot(d, aes(pheno, h2, colour = pheno)) +
  geom_point() +
  # The lower bound used to be wrapped in I(). ggplot2 >= 3.5.0 treats I() as
  # "bypass the scale", which would misplace that end of the bar.
  geom_errorbar(aes(ymin = h2 - 1.96 * se, ymax = h2 + 1.96 * se), width = .2, position = position_dodge(.9)) +
  theme_cowplot(font_size = 9) +
  scale_fill_manual(values = colorBlindBlack8[c(8, 3, 2)], guide = 'none') +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(8, 3, 2)]) +
  xlab('Phenotype') +
  ylab('Common SNP heritability [95% CI]') +
  theme(legend.position = 'none',
        axis.text.x = element_text(angle = 45, hjust = 1))


ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 80, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/h2_allphenos.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Per-cohort heritability estimate (h2) with 95% confidence intervals, one
# facet per trait.
# Inputs [[1]] and [[2]]: tables with columns cohort, h2 and se; the first is
# tagged as gestational duration, the second as preterm delivery.
# Output [[1]]: the figure.

d <- fread(snakemake@input[[1]], header = TRUE)
x <- fread(snakemake@input[[2]], header = TRUE)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)
d$trait <- 'Gestational\nduration'
x$trait <- 'Preterm delivery'

d <- rbind(d, x)

p1 <- ggplot(d, aes(cohort, h2, colour = cohort)) +
  geom_point() +
  # The lower bound used to be wrapped in I(). ggplot2 >= 3.5.0 treats I() as
  # "bypass the scale", which would misplace that end of the bar.
  geom_errorbar(aes(ymin = h2 - 1.96 * se, ymax = h2 + 1.96 * se), width = .2, position = position_dodge(.9)) +
  theme_cowplot(font_size = 9) +
  facet_wrap(vars(trait), ncol = 1) +
  scale_fill_manual(values = colorBlindBlack8[c(8, 3, 2, 6, 7, 4, 1)], guide = 'none') +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(8, 3, 2, 6, 7, 4, 1)]) +
  xlab('Cohort') +
  ylab('Common SNP heritability [95% CI]') +
  theme(legend.position = 'none',
        strip.background = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))


ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 120, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/h2_cohorts.R

library(dplyr)
library(data.table)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(tidyr)
library(showtext)
# Circular bar plot of colocalization posterior probabilities (PP.H4.abf)
# across phenotypes from two sources, grouped by preg_trait.
# Inputs (snakemake@input):
#   [[1]] results tagged 'UKBB'    [[2]] its phenotype manifest
#   [[3]] results tagged 'FINNGEN' [[4]] its phenotype manifest
# Outputs: [[1]] the figure, [[2]] the plotted table (including spacer rows).
colorBlindBlack8  <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


d <- fread(snakemake@input[[1]])
# The 8th column holds the phenotype code; rename it so it can be joined.
names(d)[8] <- 'phenocode'
mani <- fread(snakemake@input[[2]])

trait_list <- c('biomarkers', 'continuous', 'icd10')
mani <- mani[mani$trait_type %in% trait_list, ]

# For each phenocode keep the manifest row with the highest heritability.
mani <- filter(mani, saige_heritability_EUR > 0.01)
mani <- mani[order(mani$saige_heritability_EUR, decreasing = TRUE), ]
mani <- mani[!duplicated(mani$phenocode), ]

mani$phenocode <- paste(mani$trait_type, mani$phenocode, sep = '_')
mani <- mani[, c('phenocode', 'description')]
mani <- mani[!duplicated(mani$description), ]

d <- inner_join(d, mani[, c('description', 'phenocode')], by = 'phenocode')
d$cohort <- 'UKBB'

x <- fread(snakemake@input[[3]])
names(x)[8] <- 'phenocode'
mani <- fread(snakemake@input[[4]])
mani <- mani[, c('phenocode', 'name')]
names(mani) <- c('phenocode', 'description')
mani <- mani[!duplicated(mani$description), ]

x <- inner_join(x, mani, by = 'phenocode')
x$cohort <- 'FINNGEN'

d <- rbind(d, x)
d <- d[order(d$PP.H4.abf, decreasing = FALSE), ]
d <- filter(d, PP.H4.abf > 0.01, PP.H4.abf + PP.H3.abf > 0.75)

# Append `empty_bar` all-NA rows per preg_trait level; they become the gaps
# between groups on the circle.
d$preg_trait <- factor(d$preg_trait)
empty_bar <- 6
to_add <- data.frame(matrix(NA, empty_bar * nlevels(d$preg_trait), ncol(d)))
colnames(to_add) <- colnames(d)
to_add$preg_trait <- rep(levels(d$preg_trait), each = empty_bar)
d <- rbind(d, to_add)
d <- d %>% arrange(preg_trait)


d$id <- seq(1, nrow(d))

# Label angle per bar; labels past -90 degrees are flipped by 180 and
# right-aligned.
label_data <- d
number_of_bar <- nrow(label_data)
angle <- 90 - 360 * (label_data$id - 0.5) / number_of_bar
label_data$hjust <- ifelse(angle < -90, 1, 0)


label_data$angle <- ifelse(angle < -90, angle + 180, angle)

#d$id= factor(d$id, levels= d$id[order(d$PP.H4.abf)])

# Span (first/last id) of the spacer rows of each group.
base_data <- d %>%
  group_by(preg_trait) %>%
  filter(is.na(PP.H4.abf)) %>%
  summarize(start = min(id), end = max(id)) %>%
  rowwise() %>%
  mutate(title = mean(c(start, end)))

# Heights of the grey reference arcs. NOTE(review): the length-2 vectors, the
# base_data[2] annotation and the two fill colours all assume exactly two
# preg_trait groups - confirm before reusing with other data.
arc100 <- rep(1, 2)
arc75 <- rep(0.75, 2)
arc50 <- rep(0.50, 2)
arc25 <- rep(0.25, 2)

# guide = 'none' replaces the deprecated guide = FALSE.
p1 <- ggplot(d, aes(as.factor(id), PP.H4.abf, fill = preg_trait, alpha = PP.H4.abf)) +
  geom_bar(stat = "identity", colour = NA) +
  scale_alpha_continuous(range = c(0.4, 1), guide = 'none') +
  geom_segment(data = base_data, aes(x = end, y = arc100, xend = start, yend = arc100), colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE) +
  geom_segment(data = base_data, aes(x = end, y = arc75, xend = start, yend = arc75), colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE) +
  geom_segment(data = base_data, aes(x = end, y = arc50, xend = start, yend = arc50), colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE) +
  geom_segment(data = base_data, aes(x = end, y = arc25, xend = start, yend = arc25), colour = "grey", alpha = 1, size = 0.3, inherit.aes = FALSE) +
  annotate("text", x = ((base_data$end[1] + base_data$start[1]) / 2), y = c((0.25 + 0.05), (0.50 + 0.05), (0.75 + 0.05), (1 + 0.05)), label = c("0.25", "0.50", "0.75", "1"), color = "grey", size = 3, angle = 0, fontface = "bold", hjust = 0.5) +
  annotate("text", x = ((base_data$end[2] + base_data$start[2]) / 2), y = c((0.25 + 0.05), (0.50 + 0.05), (0.75 + 0.05), (1 + 0.05)), label = c("0.25", "0.50", "0.75", "1"), color = "grey", size = 3, angle = 15, fontface = "bold", hjust = 0.5) +
  ylim(-0.2, 2) + # Limits of the plot = very important. The negative value controls the size of the inner circle, the positive one is useful to add size over each bar
  theme_cowplot() +
  scale_fill_manual(values = colorBlindBlack8[c(2, 4)], guide = 'none') +
  scale_colour_manual(values = colorBlindBlack8[c(2, 4)], guide = 'none') +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.margin = unit(rep(-2, 4), "cm")) +
  coord_polar(start = 0) +
  # Only bars with PP.H4.abf above 0.75 get a text label.
  geom_text(data = filter(label_data, PP.H4.abf > 0.75), aes(x = factor(id), y = PP.H4.abf + 0.01, label = description, hjust = hjust), color = "black", fontface = "bold", alpha = 0.6, size = 2.5, angle = filter(label_data, PP.H4.abf > 0.750)$angle, inherit.aes = FALSE) +
  theme(panel.grid = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank())

# The return value is not needed, so p1 is no longer overwritten with it.
save_plot(snakemake@output[[1]], p1, base_width = 8, base_height = 8)

fwrite(d, snakemake@output[[2]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel showtext From line 1 of figures/KCNAB1_pheWAS.R

library(data.table)
library(dplyr)
library(cowplot)
library(ggrepel)
library('showtext')


# Enrichment per category: three scatter plots of -log10(Enrichment_p)
# against either the enrichment or the gene-set size.
# Input [[1]]: table with Category, Enrichment, Enrichment_p and n_genes.
# Outputs: [[1]] enrichment vs p-value, [[2]] gene-set size vs p-value
# (no legend), [[3]] same as [[2]] with the legend kept.

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)



d <- fread(snakemake@input[[1]])

# Keep the categories in file order.
d$Category <- factor(d$Category, levels = unique(d$Category))

# Point size flags categories passing the 0.05 / (nrow(d) - 1) threshold.
# That expression stays inline because its text becomes the legend title in
# the third plot.
p1 <- ggplot(d, aes(Enrichment, -log10(Enrichment_p))) +
  geom_point(aes(size = Enrichment_p < 0.05 / (nrow(d) - 1)), shape = 21, stroke = 0.1, fill = colorBlindBlack8[4]) +
  xlab('Heritability enrichment') +
  ylab('-log10(P-value)') +
  theme_cowplot(font_size = 8) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(panel.grid.major = element_line(colour = 'grey', size = 0.05),
        legend.position = "none") +
  geom_text_repel(data = filter(d, Enrichment_p < 0.05), aes(Enrichment, -log10(Enrichment_p), label = Category), size = 8 / .pt)


ggsave(snakemake@output[[1]], plot = p1, width = 120, height = 90, units = 'mm', dpi = 300)


# The y axis shows -log10 of the enrichment p-value, so it now carries the
# same label as the first plot (it used to read '-log10(Enrichment)').
p2 <- ggplot(d, aes(n_genes, -log10(Enrichment_p))) +
  geom_point(aes(size = Enrichment_p < 0.05 / (nrow(d) - 1)), shape = 21, stroke = 0.1, fill = colorBlindBlack8[4]) +
  xlab('Size of gene set') +
  ylab('-log10(P-value)') +
  theme_cowplot(font_size = 8) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(panel.grid.major = element_line(colour = 'grey', size = 0.05),
        legend.position = "none") +
  geom_text_repel(data = filter(d, Enrichment_p < 0.05), aes(n_genes, -log10(Enrichment_p), label = Category), size = 8 / .pt)

ggsave(snakemake@output[[2]], plot = p2, width = 90, height = 90, units = 'mm', dpi = 300)

# Same as p2 but with the legend kept.
p3 <- ggplot(d, aes(n_genes, -log10(Enrichment_p))) +
  geom_point(aes(size = Enrichment_p < 0.05 / (nrow(d) - 1)), shape = 21, stroke = 0.1, fill = colorBlindBlack8[4]) +
  xlab('Size of gene set') +
  ylab('-log10(P-value)') +
  theme_cowplot(font_size = 8) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(panel.grid.major = element_line(colour = 'grey', size = 0.05)) +
  geom_text_repel(data = filter(d, Enrichment_p < 0.05), aes(n_genes, -log10(Enrichment_p), label = Category), size = 8 / .pt)

ggsave(snakemake@output[[3]], plot = p3, width = 90, height = 90, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table cowplot ggrepel showtext From line 1 of figures/labor_deg.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')



# Scatter plots of BETA (y) against three per-variant estimates (x):
# beta_MNT, beta_PT and beta_MT, coloured by effect-origin class.
# Inputs: [[1]] table with rsid, class_name and beta_*/se_* columns;
#         [[2]] table with RSID and BETA.
# Outputs: [[1]]-[[3]] one plot per estimate, [[4]] the beta_MNT plot with a
# horizontal legend at the bottom, [[5]] the plotted table.

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

x <- fread(snakemake@input[[2]], select = c('RSID', 'BETA'))


d <- inner_join(d, x, by = c('rsid' = 'RSID'))

# Flip signs so BETA is non-negative; the three estimates are flipped using
# the original sign of BETA, so BETA itself must be updated last.
d$beta_MNT <- with(d, ifelse(BETA < 0, -1 * beta_MNT, beta_MNT))
d$beta_PT <- with(d, ifelse(BETA < 0, -1 * beta_PT, beta_PT))
d$beta_MT <- with(d, ifelse(BETA < 0, -1 * beta_MT, beta_MT))
d$BETA <- with(d, ifelse(BETA < 0, -1 * BETA, BETA))

# 95% confidence limits; not plotted, but written to the output table.
d$lo95_MT <- d$beta_MT - 1.96 * d$se_MT
d$up95_MT <- d$beta_MT + 1.96 * d$se_MT

d$lo95_MNT <- d$beta_MNT - 1.96 * d$se_MNT
d$up95_MNT <- d$beta_MNT + 1.96 * d$se_MNT

d$lo95_PT <- d$beta_PT - 1.96 * d$se_PT
d$up95_PT <- d$beta_PT + 1.96 * d$se_PT

# Expand the short class codes; unrecognised codes become ''.
d$class_name <- with(d, ifelse(class_name == 'MF SD', 'Maternal and fetal (same direction)',
  ifelse(class_name == 'Fetal MatT', 'Fetal effect, maternal transmitted only',
    ifelse(class_name == 'Maternal', 'Maternal',
      ifelse(class_name == 'Fetal', 'Fetal',
        ifelse(class_name == 'MF OD', 'Maternal and fetal (opposite direction)', ''))))))

p1 <- ggplot(d, aes(beta_MNT, BETA, colour = class_name)) +
  geom_point(size = 0.5) +
  #geom_errorbarh(data= filter(d, (lo95_h2 >0 & up95_h2>0) | (lo95_h2<0 & up95_h2 <0)), aes(xmax = lo95_h2, xmin = up95_h2), size= 0.05) +
  theme_cowplot(font_size = 8) +
  scale_colour_manual(values = c('grey', colorBlindBlack8[c(8, 2, 4, 3)])) +
  geom_vline(xintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
  geom_hline(yintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
  xlab('Effect size maternal \nnon-transmitted alleles, days') +
  ylab('Effect size maternal genome, days')
#theme(legend.direction = "horizontal", legend.position = "bottom")
#scale_x_continuous(breaks = round(seq(-1.5, 3, by= 0.5), 1)) +
#  scale_y_continuous(breaks = round(seq(-1.5, 3, by= 0.5), 1))


ggsave(snakemake@output[[1]], plot = p1, width = 60, height = 60, units = 'mm', dpi = 300)

print('plot1')
p1 <- ggplot(d, aes(beta_PT, BETA, colour = class_name)) +
  geom_point(size = 0.5) +
  #geom_errorbarh(data= filter(d, (lo95_h3 >0 & up95_h3>0) | (lo95_h3<0 & up95_h3 <0)), aes(xmax = lo95_h3, xmin = up95_h3), size= 0.05) +
  theme_cowplot(font_size = 8) +
  scale_colour_manual(values = c('grey', colorBlindBlack8[c(8, 2, 4, 3)])) +
  geom_vline(xintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
  geom_hline(yintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
  xlab('Effect size paternal \ntransmitted alleles, days') +
  ylab('Effect size maternal genome, days')
#scale_x_continuous(breaks = round(seq(-1.5, 3, by= 0.5), 1)) +
#  scale_y_continuous(breaks = round(seq(-1.5, 3, by= 0.5), 1))

ggsave(snakemake@output[[2]], plot = p1, width = 60, height = 60, units = 'mm', dpi = 300)

print('plot2')
# guide = 'none' replaces the deprecated guide = FALSE.
p1 <- ggplot(d, aes(beta_MT, BETA, colour = class_name)) +
  geom_point(size = 0.5) +
  #geom_errorbarh(data= filter(d, (lo95_h3 >0 & up95_h3>0) | (lo95_h3<0 & up95_h3 <0)), aes(xmax = lo95_h3, xmin = up95_h3), size= 0.05) +
  theme_cowplot(font_size = 8) +
  scale_colour_manual(values = c('grey', colorBlindBlack8[c(8, 2, 4, 3)]), guide = 'none') +
  geom_vline(xintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
  geom_hline(yintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
  xlab('Effect size maternal \ntransmitted alleles, days') +
  ylab('Effect size maternal genome, days')
#scale_x_continuous(breaks = round(seq(-1.5, 3, by= 0.5), 1)) +
#  scale_y_continuous(breaks = round(seq(-1.5, 3, by= 0.5), 1))

ggsave(snakemake@output[[3]], plot = p1, width = 60, height = 60, units = 'mm', dpi = 300)

# Wide version of the first plot with the legend laid out along the bottom.
# The `+` before theme() was missing, so the legend settings were evaluated
# as a separate statement and never reached the plot.
p1 <- ggplot(d, aes(beta_MNT, BETA, colour = class_name)) +
  geom_point(size = 0.5) +
  #geom_errorbarh(data= filter(d, (lo95_h2 >0 & up95_h2>0) | (lo95_h2<0 & up95_h2 <0)), aes(xmax = lo95_h2, xmin = up95_h2), size= 0.05) +
  theme_cowplot(font_size = 8) +
  scale_colour_manual(values = c('grey', colorBlindBlack8[c(8, 2, 4, 3)])) +
  geom_vline(xintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
  geom_hline(yintercept = 0, colour = colorBlindBlack8[1], linetype = 'dashed', size = 0.2, alpha = 0.6) +
  xlab('Effect size maternal \nnon-transmitted alleles, days') +
  ylab('Effect size maternal genome, days') +
  theme(legend.direction = "horizontal", legend.position = "bottom")
#scale_x_continuous(breaks = round(seq(-1.5, 3, by= 0.5), 1)) +
#  scale_y_continuous(breaks = round(seq(-1.5, 3, by= 0.5), 1))

ggsave(snakemake@output[[4]], plot = p1, width = 120, height = 60, units = 'mm', dpi = 300)
fwrite(d, snakemake@output[[5]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/lm_effect_origin.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Horizontal bar chart of -log10(p-value) for three gene-set enrichment
# results (rows named 'pli', 'dominant' and anything else, shown as
# 'Recessive'), with a dashed line at 0.05 / number of rows.
# Inputs [[1]] and [[2]] are stacked; output [[1]] is the figure and
# output [[2]] the plotted table.

colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


d <- fread(snakemake@input[[1]])
x <- fread(snakemake@input[[2]])
d <- rbind(d, x)

# Columns are renamed by position, so the inputs must have nine columns.
names(d) <- c(
  'Name', 'no_no', 'no_yes', 'yes_no', 'yes_yes',
  'candidate_gene', 'rest_genes', 'OR', 'pvalue'
)
d$enrichment <- d$candidate_gene / d$rest_genes

# Largest p-value first; the factor below freezes this order for the bars.
d <- arrange(d, desc(pvalue))

d$description <- with(
  d,
  ifelse(
    Name == 'pli',
    'Loss-of-function intolerant',
    ifelse(Name == 'dominant', 'Dominant', 'Recessive')
  )
)

d$description <- factor(d$description, levels = unique(d$description))



p1 <- ggplot(data = d, aes(x = description, y = -log10(pvalue))) +
  geom_col(fill = colorBlindBlack8[2], alpha = 0.6) +
  theme_cowplot(font_size = 10) +
  labs(y = 'Enrichment -log10(pvalue)') +
  theme(axis.title.y = element_blank()) +
  geom_hline(
    yintercept = -log10(0.05 / nrow(d)),
    linetype = 'dashed',
    colour = 'grey'
  ) +
  coord_flip()


ggsave(snakemake@output[[1]], plot = p1, height = 35, width = 90, dpi = 300, units = 'mm')

fwrite(d, snakemake@output[[2]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/MacArthurlab_enrichment.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Manhattan plot for a single GWAS.
# Inputs: [[1]] summary statistics (needs ID, CHR, POS, pvalue, nearestGene);
#         [[2]] table of top variants (needs ID, nearestGene, CHR, POS, BETA).
# Output [[1]]: the figure.

d <- fread(snakemake@input[[1]], header = TRUE)


colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Hard-coded positions whose +/- `lims` window is coloured as a previous
# discovery.
ge <- data.frame(CHR = c(5, 3, 1, 23, 3, 23), pos_ge = c(157895049, 127881613, 22470407, 115164770, 123068359, 131300571))

df <- arrange(d, pvalue)

dg <- fread(snakemake@input[[2]])
dg$GENE <- dg$nearestGene


# Cumulative base-pair position so chromosomes sit side by side on the x axis.
don <- df %>%
    group_by(CHR)      %>%
    summarise(chr_len = max(POS)) %>%
    mutate(tot = cumsum(as.numeric(chr_len)) - chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(df, ., by = 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate(BPcum = POS + tot) %>%
         ungroup()

# Axis tick positions: the middle of each chromosome.
axisdf <- don %>% group_by(CHR) %>% summarize(center = (max(BPcum) + min(BPcum)) / 2)
names(axisdf) <- c('CHR', 'center')
HC <- -log10(5*10**-8)
dg <- dg %>% ungroup() %>% select(ID, GENE, CHR, POS, BETA)

# disc codes: 0 = not significant, 1 = previous discovery, 2 = new discovery.
don$disc <- ifelse(don$pvalue > 5*10**-8, 0, 2)

don <- left_join(don, select(dg, ID, GENE), by = 'ID')
names(dg) <- c('ID', 'GENE', 'CHR', 'POS_new', 'BETA')

lims <- 250000

don <- data.frame(don)
dg <- data.frame(dg)


# Everything within `lims` bp of a top variant is marked as a new discovery.
for (i in seq_len(nrow(dg))) {
  don <- mutate(don, disc = ifelse(CHR == as.integer(dg[i, 'CHR']) & POS >= as.integer(dg[i, 'POS_new']) - lims & POS <= as.integer(dg[i, 'POS_new']) + lims, 2, disc))
}


# The previous-discovery windows run last so they override the code above.
for (i in seq_len(nrow(ge))) {
  don <- mutate(don, disc = ifelse(CHR == as.integer(ge[i, 'CHR']) & POS >= as.integer(ge[i, 'pos_ge']) - lims & POS <= as.integer(ge[i, 'pos_ge']) + lims, 1, disc))
}

# Sort by disc so rows with higher codes are drawn last, on top of the rest.
don <- don[order(don$disc, decreasing = FALSE, na.last = TRUE), ]
don$disc <- factor(don$disc, levels = c(0, 1, 2), labels = c('Not significant', 'Previous discovery', 'New discovery'))

cols <- c('Not significant' = 'grey', 'Previous discovery' = colorBlindBlack8[3], 'New discovery' = colorBlindBlack8[8])

don$GENE <- ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


don$GENE <- with(don, ifelse(GENE == 'CDC42', 'CDC42/ WNT4', ifelse(GENE == 'HIVEP3', 'HIVEP3/ EDN2', ifelse(GENE == 'TET3', 'TET3/ DGUOK-AS1', ifelse(GENE == 'TCEA2', 'TCEA2/ OPRL1', GENE)))))

don$logpval <- -log10(don$pvalue)

# Rows that carry a gene label; computed once and reused for the label layer.
labelled <- filter(don, GENE != '')

# Changes from the previous version: `labels` is spelled out instead of the
# partially matched `label`, guide = 'none' replaces the deprecated
# guide = FALSE, and the stray empty argument in geom_hline() is gone.
p1 <- ggplot(data = don, aes(x = BPcum, y = logpval, colour = disc)) +
  geom_point(size = 0.07) +   # Show all points
  theme_cowplot(font_size = 9) +
  scale_colour_manual(values = cols, guide = 'none') +
  # NOTE(review): the 23 hard-coded labels assume chromosomes 1-23 are all present.
  scale_x_continuous(labels = c(1:19, '', 21, '', 'X'), breaks = axisdf$center, expand = c(0.03, 0.03)) + # label = ifelse(axisdf$CHR== 23, 'X', axisdf$CHR)
  scale_y_continuous(expand = c(0, 0), limits = c(min(don$logpval) - 2, max(don$logpval) + 2), breaks = seq(0, 10, 5), labels = c(abs(seq(0, 10, 5)))) + # , sec.axis = sec_axis(~ ., name = derive())) +
  ylab('-log10(pvalue)') +
  xlab('Chromosome') +
  geom_hline(yintercept = 0, size = 0.25, colour = 'black') +
  geom_hline(yintercept = c(HC, -HC), size = 0.2, linetype = 2, colour = '#878787') +
  coord_cartesian(clip = "off") +
  geom_text_repel(data = labelled, aes(x = BPcum, y = logpval, label = GENE),
                  size = 6 / .pt,
                  force_pull = 0, # do not pull toward data points
                  force = 0.1,
                  nudge_y = ifelse(labelled$logpval > 0, 1, -1), #43 - ((-log10(filter(don, GENE!= '')$pvalue))),
                  direction = "both",
                  hjust = 0,
                  vjust = 0.5,
                  box.padding = 0.1,
                  angle = 0,
                  segment.size = 0.1,
                  segment.square = TRUE,
                  segment.inflect = FALSE,
                  segment.colour = colorBlindBlack8[8],
                  colour = ifelse(labelled$disc == 'New discovery', colorBlindBlack8[8], colorBlindBlack8[3]),
                  segment.linetype = 4,
                  ylim = c(-Inf, 50),
                  xlim = c(-Inf, Inf)) +
  theme(legend.position = 'none',
        plot.margin = unit(c(t = 0, r = 0, b = 0, l = 0), 'cm'),
        text = element_text(family = "arial", size = 9),
        axis.line = element_line(size = 0.1))

save_plot(snakemake@output[[1]], plot = p1, base_height = 90, base_width = 185, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/manhattan_plot_postTerm.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Mirrored Manhattan plot: rows tagged 'GAraw' are drawn upwards
# (-log10 p) and rows tagged 'allPTD' downwards (log10 p).
# Inputs: [[1]] / [[3]] summary statistics for the two phenotypes;
#         [[2]] / [[4]] the matching tables of top variants.
# Output [[1]]: the figure.

d <- fread(snakemake@input[[1]], header = TRUE, select = c('ID', 'CHR', 'POS', 'pvalue', 'nearestGene'))
d$pheno <- 'GAraw'
x <- fread(snakemake@input[[3]], header = TRUE, select = c('ID', 'CHR', 'POS', 'pvalue', 'nearestGene'))
x$pheno <- 'allPTD'

d <- rbind(d, x)

rm(x)

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Hard-coded positions whose +/- `lims` window is coloured as a previous
# discovery.
ge <- data.frame(CHR = c(5, 3, 1, 23, 3, 23), pos_ge = c(157895049, 127881613, 22470407, 115164770, 123068359, 131300571))

df <- arrange(d, pvalue)


dg <- fread(snakemake@input[[2]])
dg$GENE <- dg$nearestGene

ptd <- fread(snakemake@input[[4]])
ptd$GENE <- ptd$nearestGene

# Cumulative base-pair position so chromosomes sit side by side on the x axis.
don <- df %>%
    group_by(CHR)      %>%
    summarise(chr_len = max(POS)) %>%
    mutate(tot = cumsum(as.numeric(chr_len)) - chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(df, ., by = 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate(BPcum = POS + tot) %>%
         ungroup()

# Axis tick positions: the middle of each chromosome.
axisdf <- don %>% group_by(CHR) %>% summarize(center = (max(BPcum) + min(BPcum)) / 2)
names(axisdf) <- c('CHR', 'center')
HC <- -log10(5*10**-8)
dg <- dg %>% ungroup() %>% select(ID, GENE, CHR, POS, BETA)
ptd <- ptd %>% ungroup %>% select(ID, GENE, CHR, POS, BETA)

# disc codes: 0 = not significant, 1 = previous discovery, 2 = new discovery.
don$disc <- ifelse(don$pvalue > 5*10**-8, 0, 2)

# Split by phenotype so each half is annotated with its own top variants.
don1 <- filter(don, pheno == 'GAraw') %>% left_join(., select(dg, ID, GENE), by = 'ID')
don2 <- filter(don, pheno != 'GAraw') %>% left_join(., select(ptd, ID, GENE), by = 'ID')
names(dg) <- c('ID', 'GENE', 'CHR', 'POS_new', 'BETA')
names(ptd) <- c('ID', 'GENE', 'CHR', 'POS_new', 'BETA')

lims <- 250000

# (The former `don= data.frame(don)` was dead code: `don` is rebuilt from
# don1 and don2 below before it is read again.)
dg <- data.frame(dg)
ptd <- data.frame(ptd)


# Everything within `lims` bp of a top variant is marked as a new discovery.
for (i in seq_len(nrow(dg))) {
  don1 <- mutate(don1, disc = ifelse(CHR == as.integer(dg[i, 'CHR']) & POS >= as.integer(dg[i, 'POS_new']) - lims & POS <= as.integer(dg[i, 'POS_new']) + lims, 2, disc))
}

for (i in seq_len(nrow(ptd))) {
  don2 <- mutate(don2, disc = ifelse(CHR == as.integer(ptd[i, 'CHR']) & POS >= as.integer(ptd[i, 'POS_new']) - lims & POS <= as.integer(ptd[i, 'POS_new']) + lims, 2, disc))
}

don <- rbind(don1, don2)
rm(don1) ; rm(don2)

# The previous-discovery windows run last so they override the code above.
for (i in seq_len(nrow(ge))) {
  don <- mutate(don, disc = ifelse(CHR == as.integer(ge[i, 'CHR']) & POS >= as.integer(ge[i, 'pos_ge']) - lims & POS <= as.integer(ge[i, 'pos_ge']) + lims, 1, disc))
}

# Sort by disc so rows with higher codes are drawn last, on top of the rest.
don <- don[order(don$disc, decreasing = FALSE, na.last = TRUE), ]
don$disc <- factor(don$disc, levels = c(0, 1, 2), labels = c('Not significant', 'Previous discovery', 'New discovery'))

cols <- c('Not significant' = 'grey', 'Previous discovery' = colorBlindBlack8[3], 'New discovery' = colorBlindBlack8[8])

don$GENE <- ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


don$GENE <- with(don, ifelse(GENE == 'CDC42', 'CDC42/ WNT4', ifelse(GENE == 'HIVEP3', 'HIVEP3/ EDN2', ifelse(GENE == 'TET3', 'TET3/ DGUOK-AS1', ifelse(GENE == 'TCEA2', 'TCEA2/ OPRL1', GENE)))))

# allPTD p-values are plotted below zero, GAraw above.
don$logpval <- with(don, ifelse(pheno == 'allPTD', log10(pvalue), -log10(pvalue)))

# Rows that carry a gene label; computed once and reused for the label layer.
labelled <- filter(don, GENE != '')

# Changes from the previous version: `labels` is spelled out instead of the
# partially matched `label`, guide = 'none' replaces the deprecated
# guide = FALSE, and the stray empty argument in geom_hline() is gone.
p1 <- ggplot(data = don, aes(x = BPcum, y = logpval, colour = disc)) +
  geom_point(size = 0.07) +   # Show all points
  theme_cowplot(font_size = 9) +
  scale_colour_manual(values = cols, guide = 'none') +
  # NOTE(review): the 23 hard-coded labels assume chromosomes 1-23 are all present.
  scale_x_continuous(labels = c(1:19, '', 21, '', 'X'), breaks = axisdf$center, expand = c(0.03, 0.03)) + # label = ifelse(axisdf$CHR== 23, 'X', axisdf$CHR)
  scale_y_continuous(expand = c(0, 0), limits = c(min(don$logpval) - 2, max(don$logpval) + 2), breaks = seq(-30, 45, 10), labels = c(abs(seq(-30, 45, 10)))) + # , sec.axis = sec_axis(~ ., name = derive())) +
  ylab('-log10(pvalue)') +
  xlab('Chromosome') +
  geom_hline(yintercept = 0, size = 0.25, colour = 'black') +
  geom_hline(yintercept = c(HC, -HC), size = 0.2, linetype = 2, colour = '#878787') +
  coord_cartesian(clip = "off") +
  geom_text_repel(data = labelled, aes(x = BPcum, y = logpval, label = GENE),
                  size = 6 / .pt,
                  force_pull = 0, # do not pull toward data points
                  force = 0.1,
                  # Nudge labels away from the zero line on either side.
                  nudge_y = ifelse(labelled$logpval > 0, 1, -1), #43 - ((-log10(filter(don, GENE!= '')$pvalue))),
                  direction = "both",
                  hjust = 0,
                  vjust = 0.5,
                  box.padding = 0.1,
                  angle = 0,
                  segment.size = 0.1,
                  segment.square = TRUE,
                  segment.inflect = FALSE,
                  segment.colour = colorBlindBlack8[8],
                  colour = ifelse(labelled$disc == 'New discovery', colorBlindBlack8[8], colorBlindBlack8[3]),
                  segment.linetype = 4,
                  ylim = c(-Inf, 50),
                  xlim = c(-Inf, Inf)) +
  theme(legend.position = 'none',
        plot.margin = unit(c(t = 0, r = 0, b = 0, l = 0), 'cm'),
        text = element_text(family = "arial", size = 9),
        axis.line = element_line(size = 0.1))

save_plot(snakemake@output[[1]], plot = p1, base_height = 90, base_width = 180, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/manhattan_plot.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Density plots of the relative change between two effect estimates
# (Beta1 and Beta2) for variants tagged as fetal or maternal effects.
# Inputs: [[1]] rows tagged 'fetal_effect', [[2]] rows tagged
# 'maternal_effect'.
# Outputs: [[1]] mirrored density plot, [[2]] the table, [[3]] overlaid
# density plot.

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Annotation text sizes in points (converted with .pt where used).
as <- 8
as1 <- 8

d <- fread(snakemake@input[[1]])
d$effect <- 'fetal_effect'
x <- fread(snakemake@input[[2]])
x$effect <- 'maternal_effect'

d <- rbind(d, x)

d <- filter(d, !(rsID %in% c('rs7819593', 'rs41311445')))
# Flip signs so Beta1 is non-negative; Beta2 is flipped using the original
# sign of Beta1, so Beta1 must be updated last.
d$Beta2 <- ifelse(d$Beta1 < 0, -1 * d$Beta2, d$Beta2)
d$Beta1 <- ifelse(d$Beta1 < 0, -1 * d$Beta1, d$Beta1)

# Relative difference of Beta2 with respect to Beta1.
d$beta_dif <- with(d, (Beta2 - Beta1) / Beta1)

mor <- filter(d, effect == 'maternal_effect') %>% pull(beta_dif)
barn <- filter(d, effect == 'fetal_effect') %>% pull(beta_dif)

# Mirrored densities: maternal above the axis, fetal below it.
p1 <- ggplot() +
  geom_density(mapping = aes(x = mor, y = ..density..), fill = colorBlindBlack8[3], colour = colorBlindBlack8[3]) +
  annotate('text', x = 0.35, y = 0.6, label = "Maternal", color = colorBlindBlack8[3], size = as1 / .pt, fontface = 'bold') +
  annotate('text', x = 0.35, y = -1, label = "Fetal", color = colorBlindBlack8[8], size = as1 / .pt, fontface = 'bold') +
  geom_density(mapping = aes(x = barn, y = -..density..), fill = colorBlindBlack8[8], colour = colorBlindBlack8[8]) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab("Relative difference in effect size on \nbirth weight with or without adjusting for gestational duration") +
  ylab('Density') +
  geom_hline(yintercept = 0, colour = 'grey') +
  theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks = element_line(size = 0.3))


ggsave(snakemake@output[[1]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)



# Overlaid, semi-transparent densities for both groups.
p1 <- ggplot(d, aes(beta_dif, group = effect, fill = effect)) +
  geom_hline(yintercept = 0, colour = 'black') +
  geom_density(color = NA) +
  annotate('text', x = -1.5, y = 0.8, label = "Maternal", color = colorBlindBlack8[3], size = as1 / .pt, fontface = 'bold') +
  annotate('text', x = 1, y = 0.8, label = "Fetal", color = colorBlindBlack8[8], size = as1 / .pt, fontface = 'bold') +
  theme_cowplot(font_size = 8) +
  #scale_colour_manual(values= alpha(colorBlindBlack8[c(8,3)], 0.5), guide= 'none') +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5), guide = 'none') +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.05)) +
  xlab("Relative difference in effect size on birth weight\nwith or without adjusting for gestational duration") +
  ylab('Density') +
  theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks = element_line(size = 0.3)) +
  # `linetype` was misspelled as `linetpye`, so the argument was ignored and
  # the reference line was drawn solid instead of dashed.
  geom_vline(xintercept = 0, linetype = 'dashed', colour = 'grey')

ggsave(snakemake@output[[3]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[2]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/mediation_BW_GA_individual_level_data_decode.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Colour-blind-friendly palette used for the plot colours below.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial (regular + bold) with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Font sizes in points; `as1` is divided by `.pt` where annotations are drawn.
as <- 8
as1 <- 8

d <- fread(snakemake@input[[1]])

# Orient each pair of estimates so the unadjusted beta is non-negative.
# The *_GA column is flipped first, while the unadjusted column still
# carries its original sign.
d$beta_h2_GA <- ifelse(d$beta_h2 < 0, -1 * d$beta_h2_GA, d$beta_h2_GA)
d$beta_h2 <- ifelse(d$beta_h2 < 0, -1 * d$beta_h2, d$beta_h2)

d$beta_h3_GA <- ifelse(d$beta_h3 < 0, -1 * d$beta_h3_GA, d$beta_h3_GA)
d$beta_h3 <- ifelse(d$beta_h3 < 0, -1 * d$beta_h3, d$beta_h3)

# Relative change between the *_GA estimate and the plain estimate.
d$beta_dif_h2 <- with(d, (beta_h2_GA - beta_h2) / beta_h2)
d$beta_dif_h3 <- with(d, (beta_h3_GA - beta_h3) / beta_h3)

# Maternal-effect rows use the h2 difference, fetal-effect rows the h3 one.
mor <- d %>% filter(effect == 'maternal_effect') %>% pull(beta_dif_h2)
barn <- d %>% filter(effect == 'fetal_effect') %>% pull(beta_dif_h3)

# Mirrored density plot: the maternal distribution is drawn above the x axis
# and the fetal distribution is reflected below it (negated density).
p1 <- ggplot() +
  geom_density(
    mapping = aes(x = mor, y = ..density..),
    fill = colorBlindBlack8[3],
    colour = colorBlindBlack8[3]
  ) +
  annotate(
    'text', x = 0.35, y = 0.6, label = "Maternal",
    color = colorBlindBlack8[3], size = as1 / .pt, fontface = 'bold'
  ) +
  annotate(
    'text', x = 0.35, y = -1, label = "Fetal",
    color = colorBlindBlack8[8], size = as1 / .pt, fontface = 'bold'
  ) +
  geom_density(
    mapping = aes(x = barn, y = -..density..),
    fill = colorBlindBlack8[8],
    colour = colorBlindBlack8[8]
  ) +
  theme_cowplot(font_size = 8) +
  scale_x_continuous(expand = c(0, 0)) +
  xlab("Relative difference in effect size on \nbirth weight with or without adjusting for gestational duration") +
  ylab('Density') +
  geom_hline(yintercept = 0, colour = 'grey') +
  theme(
    axis.line.x = element_line(size = 0.3),
    axis.line.y = element_line(size = 0.3),
    axis.ticks = element_line(size = 0.3)
  )

# Write the figure as a 70 x 70 mm image at 300 dpi.
ggsave(
  snakemake@output[[1]],
  plot = p1, width = 70, height = 70, units = 'mm', dpi = 300
)



# Build one long table with a single `beta_dif` column: maternal-effect rows
# contribute beta_dif_h2, fetal-effect rows contribute beta_dif_h3.
moms <- filter(d, effect == 'maternal_effect') %>% gather(key, beta_dif, beta_dif_h2) %>% select(beta_dif, effect)
fets <- filter(d, effect == 'fetal_effect') %>% gather(key, beta_dif, beta_dif_h3) %>% select(beta_dif, effect)

d <- rbind(moms, fets)


# Overlaid, semi-transparent density curves of the relative difference in
# effect size, one per `effect` group, with a dashed vertical reference line
# at zero.
p1 <- ggplot(d, aes(beta_dif, group = effect, fill = effect)) +
  geom_hline(yintercept = 0, colour = 'black') +
  geom_density(color = NA) +
  annotate('text', x = -2, y = 0.4, label = "Maternal", color = colorBlindBlack8[3], size = as1 / .pt, fontface = 'bold') +
  annotate('text', x = 1, y = 0.8, label = "Fetal", color = colorBlindBlack8[8], size = as1 / .pt, fontface = 'bold') +
  theme_cowplot(font_size = 8) +
  #scale_colour_manual(values= alpha(colorBlindBlack8[c(8,3)], 0.5), guide= 'none') +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(8, 3)], 0.5), guide = 'none') +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0.05)) +
  xlab("Relative difference in effect size on birth weight\nwith or without adjusting for gestational duration") +
  ylab('Density') +
  theme(axis.line.x = element_line(size = 0.3),
        axis.line.y = element_line(size = 0.3),
        axis.ticks = element_line(size = 0.3)) +
  # `linetype` was previously misspelled (`linetpye`), so ggplot2 ignored it
  # and drew a solid line; spelled correctly the reference line is dashed.
  geom_vline(xintercept = 0, linetype = 'dashed', colour = 'grey')

# Save the figure (70 x 70 mm, 300 dpi) and the long table it was drawn from.
ggsave(snakemake@output[[3]], plot = p1, width = 70, height = 70, units = 'mm', dpi = 300)

fwrite(d, snakemake@output[[2]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/mediation_BW_GA_individual_level_data.R

library(MendelianRandomization)
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


# Colour-blind-friendly palette used for the plot colours below.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial (regular + bold) with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


d <- fread(snakemake@input[[1]])

# Two chrX variants are listed by position; replace them with rsIDs.
d$rsid <- with(
  d,
  ifelse(
    rsid == 'chrX:116013571', 'rs5991030',
    ifelse(rsid == 'chrX:132178061', 'rs5930554', rsid)
  )
)

# Collapse `class_name` into three effect-origin groups.
d$effect_origin <- with(
  d,
  ifelse(
    class_name == 'MF OD' | class_name == 'MF SD', 'Maternal and fetal',
    ifelse(
      class_name == 'Fetal MatT' | class_name == 'Fetal', 'Fetal',
      'Maternal'
    )
  )
)

#d= filter(d, MarkerName!= '6:32595083:G:T')

#top= fread(snakemake@input[[2]])
#ids= pull(top, ID)
#ids= c('3:156697097:A:G', '5:158058432:G:T', ids)

# Attach the CHR:POS:REF:EFF identifier for each rsID.
x <- fread(snakemake@input[[2]], select = c('ID', 'RSID'))

#x= filter(x, ID %in% ids)

d <- inner_join(d, x, by = c('rsid' = 'RSID'))

# Split the identifier and, where REF sorts after EFF, negate the three
# haplotype betas and rebuild the ID with the two alleles swapped.
d <- separate(d, ID, into = c('CHR', 'POS', 'REF', 'EFF'), sep = ':')
d$beta_MT <- with(d, ifelse(REF > EFF, -1 * beta_MT, beta_MT))
d$beta_MNT <- with(d, ifelse(REF > EFF, -1 * beta_MNT, beta_MNT))
d$beta_PT <- with(d, ifelse(REF > EFF, -1 * beta_PT, beta_PT))

d$ID <- with(
  d,
  ifelse(
    REF > EFF,
    paste(CHR, POS, EFF, REF, sep = ':'),
    paste(CHR, POS, REF, EFF, sep = ':')
  )
)

# The axis label prefix is chosen from the path of the third input.
outcome <- if (grepl('fetal', snakemake@input[[3]])) 'Fetal' else 'Maternal'

# Join the outcome summary statistics on the rebuilt ID.
x <- fread(snakemake@input[[3]], select = c('ID', 'BETA', 'SE', 'pvalue'))

d <- inner_join(d, x, by = 'ID')

# For each haplotype (MT, MNT, PT): orient rows so the exposure beta is
# non-negative (the outcome BETA is flipped first, using the original sign),
# then run all MendelianRandomization methods and keep the `Values` table.
result_cols <- c('method', 'estimate', 'se', 'lo95', 'up95', 'pvalue')

# Maternal transmitted alleles.
df_MT <- d %>% select(beta_MT, se_MT, BETA, SE, effect_origin)
df_MT$BETA <- with(df_MT, ifelse(beta_MT < 0, BETA * -1, BETA))
df_MT$beta_MT <- with(df_MT, ifelse(beta_MT < 0, beta_MT * -1, beta_MT))

inputMR_m <- mr_input(
  bx = df_MT$beta_MT, bxse = df_MT$se_MT,
  by = df_MT$BETA, byse = df_MT$SE
)
MT <- mr_allmethods(inputMR_m)$Values
names(MT) <- c('method', 'estimate', 'se', 'lo95', 'up95', 'pvalue')

# Maternal non-transmitted alleles.
df_MNT <- d %>% select(beta_MNT, se_MNT, BETA, SE, effect_origin)
df_MNT$BETA <- with(df_MNT, ifelse(beta_MNT < 0, BETA * -1, BETA))
df_MNT$beta_MNT <- with(df_MNT, ifelse(beta_MNT < 0, beta_MNT * -1, beta_MNT))

inputMR_m <- mr_input(
  bx = df_MNT$beta_MNT, bxse = df_MNT$se_MNT,
  by = df_MNT$BETA, byse = df_MNT$SE
)
MNT <- mr_allmethods(inputMR_m)$Values
names(MNT) <- c('method', 'estimate', 'se', 'lo95', 'up95', 'pvalue')

# Paternal transmitted alleles (the row count is printed to the log).
df_PT <- d %>% select(beta_PT, se_PT, BETA, SE, effect_origin)
print(nrow(df_PT))
df_PT$BETA <- with(df_PT, ifelse(beta_PT < 0, BETA * -1, BETA))
df_PT$beta_PT <- with(df_PT, ifelse(beta_PT < 0, beta_PT * -1, beta_PT))

inputMR_m <- mr_input(
  bx = df_PT$beta_PT, bxse = df_PT$se_PT,
  by = df_PT$BETA, byse = df_PT$SE
)
PT <- mr_allmethods(inputMR_m)$Values
names(PT) <- c('method', 'estimate', 'se', 'lo95', 'up95', 'pvalue')
rm(result_cols)

# Scatter plots of outcome effect vs. exposure effect, one per haplotype.
# Each point has +/- 1 SE bars in both directions; the solid grey line is
# the IVW slope through the origin and the dashed grey line is the MR-Egger
# fit (first '(intercept)' row as intercept).

# p1: maternal transmitted alleles.
p1 <- ggplot(
  df_MT,
  aes(beta_MT, BETA, colour = effect_origin, fill = effect_origin)
) +
  geom_errorbarh(
    aes(
      xmin = beta_MT - se_MT, xmax = beta_MT + se_MT,
      colour = effect_origin, fill = effect_origin
    ),
    size = 0.1, alpha = 0.7
  ) +
  geom_errorbar(
    aes(
      ymin = BETA - SE, ymax = BETA + SE,
      colour = effect_origin, fill = effect_origin
    ),
    size = 0.1, alpha = 0.7
  ) +
  geom_point(size = 2, shape = 21, stroke = 0.1, alpha = 0.7) +
  scale_colour_manual(values = colorBlindBlack8[c(4, 2, 1)], guide = 'none') +
  scale_fill_manual(values = colorBlindBlack8[c(4, 2, 1)], guide = 'none') +
  xlab('Effect of maternal transmitted\nalleles on gestational duration, days') +
  ylab(paste(outcome, 'only effect\non birth weight, z-score')) +
  theme_cowplot(font_size = 8) +
  geom_abline(
    intercept = 0,
    slope = MT %>% filter(method == 'IVW') %>% pull(estimate),
    colour = '#d9d9d9'
  ) +
  geom_abline(
    intercept = (MT %>% filter(method == '(intercept)') %>% pull(estimate))[1],
    slope = MT %>% filter(method == 'MR-Egger') %>% pull(estimate),
    colour = '#d9d9d9', linetype = 'dashed'
  ) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = 'grey', size = 0.05)
  )

# p2: maternal non-transmitted alleles (transparency set on the scales).
p2 <- ggplot(
  df_MNT,
  aes(beta_MNT, BETA, colour = effect_origin, fill = effect_origin)
) +
  geom_errorbarh(
    aes(
      xmin = beta_MNT - se_MNT, xmax = beta_MNT + se_MNT,
      colour = effect_origin, fill = effect_origin
    ),
    size = 0.1
  ) +
  geom_errorbar(
    aes(
      ymin = BETA - SE, ymax = BETA + SE,
      colour = effect_origin, fill = effect_origin
    ),
    size = 0.1
  ) +
  geom_point(size = 2, shape = 21, stroke = 0.1) +
  scale_colour_manual(values = alpha(colorBlindBlack8[c(4, 2, 1)], 0.7), guide = 'none') +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(4, 2, 1)], 0.7), guide = 'none') +
  xlab('Effect of maternal non-transmitted alleles\non gestational duration, days') +
  ylab(paste(outcome, 'only effect\non birth weight, z-score')) +
  theme_cowplot(font_size = 8) +
  geom_abline(
    intercept = 0,
    slope = MNT %>% filter(method == 'IVW') %>% pull(estimate),
    colour = '#d9d9d9'
  ) +
  geom_abline(
    intercept = (MNT %>% filter(method == '(intercept)') %>% pull(estimate))[1],
    slope = MNT %>% filter(method == 'MR-Egger') %>% pull(estimate),
    colour = '#d9d9d9', linetype = 'dashed'
  ) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = 'grey', size = 0.05)
  )

# p3: paternal transmitted alleles (vertical bars drawn at alpha 0.5).
p3 <- ggplot(
  df_PT,
  aes(beta_PT, BETA, colour = effect_origin, fill = effect_origin)
) +
  geom_errorbarh(
    aes(
      xmin = beta_PT - se_PT, xmax = beta_PT + se_PT,
      colour = effect_origin, fill = effect_origin
    ),
    size = 0.1
  ) +
  geom_errorbar(
    aes(
      ymin = BETA - SE, ymax = BETA + SE,
      colour = effect_origin, fill = effect_origin
    ),
    alpha = 0.5, size = 0.1
  ) +
  geom_point(size = 2, shape = 21, stroke = 0.1) +
  scale_colour_manual(values = alpha(colorBlindBlack8[c(4, 2, 1)], 0.7), guide = 'none') +
  scale_fill_manual(values = alpha(colorBlindBlack8[c(4, 2, 1)], 0.7), guide = 'none') +
  xlab('Effect of paternal transmitted alleles\non gestational duration, days') +
  ylab(paste(outcome, 'only effect\non birth weight, z-score')) +
  theme_cowplot(font_size = 8) +
  geom_abline(
    intercept = 0,
    slope = PT %>% filter(method == 'IVW') %>% pull(estimate),
    colour = '#d9d9d9'
  ) +
  geom_abline(
    intercept = (PT %>% filter(method == '(intercept)') %>% pull(estimate))[1],
    slope = PT %>% filter(method == 'MR-Egger') %>% pull(estimate),
    colour = '#d9d9d9', linetype = 'dashed'
  ) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = 'grey', size = 0.05)
  )

# One 70 x 70 mm, 300 dpi figure per haplotype (outputs 1-3).
ggsave(
  snakemake@output[[1]],
  plot = p1, width = 70, height = 70, units = 'mm', dpi = 300
)
ggsave(
  snakemake@output[[2]],
  plot = p2, width = 70, height = 70, units = 'mm', dpi = 300
)
ggsave(
  snakemake@output[[3]],
  plot = p3, width = 70, height = 70, units = 'mm', dpi = 300
)

# Label each MR result table with its haplotype and stack them.
MT$haplotype <- 'MT'
MNT$haplotype <- 'MNT'
PT$haplotype <- 'PT'

df <- bind_rows(MT, MNT, PT)

# Output 4: harmonised per-variant data; output 5: combined MR estimates.
fwrite(d, snakemake@output[[4]], sep = '\t')
fwrite(df, snakemake@output[[5]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/MR_GA_BW_haplotype.R

library(MendelianRandomization)
library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


# Colour-blind-friendly palette used for the plot colours below.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial (regular + bold) with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# Input 1: per-variant exposure data; input 2: outcome BETA/SE keyed by ID;
# input 3: MR estimates (filtered by `method` and `trait` when plotting).
d <- fread(snakemake@input[[1]])
x <- fread(snakemake@input[[2]], select = c('ID', 'BETA', 'SE'))

mr <- fread(snakemake@input[[3]])

# Keep variants present in both tables, first occurrence of each ID only.
d <- inner_join(d, x, by = 'ID')
d <- d %>% filter(!duplicated(ID))

# Orient rows so `beta` is non-negative; `BETA` is flipped first, while
# `beta` still carries its original sign.
d$BETA <- with(d, ifelse(beta < 0, -1 * BETA, BETA))
d$beta <- with(d, ifelse(beta < 0, -1 * beta, beta))


# One subset per exposure trait.
shbg <- d %>% filter(trait == 'SHBG_fem_cluster')
testo <- d %>% filter(trait == 'Testosterone_fem_cluster')

# Scatter plots of outcome effect vs. exposure effect with +/- 1 SE bars.
# The solid grey line is the IVW slope through the origin; the dashed grey
# line is the MR-Egger fit (first '(intercept)' row as intercept).

# p1: SHBG subset.
p1 <- ggplot(shbg, aes(beta, BETA), color = colorBlindBlack8[2]) +
  geom_errorbarh(
    aes(xmin = beta - se, xmax = beta + se),
    size = 0.1, alpha = 0.7, color = colorBlindBlack8[2]
  ) +
  geom_errorbar(
    aes(ymin = BETA - SE, ymax = BETA + SE),
    size = 0.1, alpha = 0.7, color = colorBlindBlack8[2]
  ) +
  geom_point(
    size = 2, shape = 21, stroke = 0.1, alpha = 0.7,
    fill = colorBlindBlack8[2]
  ) +
  xlab('Effect on SHBG (women), nmol/L') +
  ylab('Effect on gestational duration, days') +
  theme_cowplot(font_size = 8) +
  geom_abline(
    intercept = 0,
    slope = mr %>%
      filter(method == 'IVW', trait == 'SHBG_fem_cluster') %>%
      pull(estimate),
    colour = '#d9d9d9'
  ) +
  geom_abline(
    intercept = (mr %>%
      filter(method == '(intercept)', trait == 'SHBG_fem_cluster') %>%
      pull(estimate))[1],
    slope = mr %>%
      filter(method == 'MR-Egger', trait == 'SHBG_fem_cluster') %>%
      pull(estimate),
    colour = '#d9d9d9', linetype = 'dashed'
  ) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = 'grey', size = 0.05)
  )


# p2: testosterone subset.
p2 <- ggplot(testo, aes(beta, BETA), color = colorBlindBlack8[2]) +
  geom_errorbarh(
    aes(xmin = beta - se, xmax = beta + se),
    size = 0.1, alpha = 0.7, color = colorBlindBlack8[2]
  ) +
  geom_errorbar(
    aes(ymin = BETA - SE, ymax = BETA + SE),
    size = 0.1, alpha = 0.7, color = colorBlindBlack8[2]
  ) +
  geom_point(
    size = 2, shape = 21, stroke = 0.1, alpha = 0.7,
    fill = colorBlindBlack8[2]
  ) +
  xlab('Effect on testosterone (women), nmol/L') +
  ylab('Effect on gestational duration, days') +
  theme_cowplot(font_size = 8) +
  geom_abline(
    intercept = 0,
    slope = mr %>%
      filter(method == 'IVW', trait == 'Testosterone_fem_cluster') %>%
      pull(estimate),
    colour = '#d9d9d9'
  ) +
  geom_abline(
    intercept = (mr %>%
      filter(method == '(intercept)', trait == 'Testosterone_fem_cluster') %>%
      pull(estimate))[1],
    slope = mr %>%
      filter(method == 'MR-Egger', trait == 'Testosterone_fem_cluster') %>%
      pull(estimate),
    colour = '#d9d9d9', linetype = 'dashed'
  ) +
  geom_hline(yintercept = 0, size = 0.1) +
  geom_vline(xintercept = 0, size = 0.1) +
  theme(
    axis.line.x = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_line(colour = 'grey', size = 0.05)
  )

# Write both figures at 70 x 70 mm, 300 dpi.
ggsave(
  snakemake@output[[1]],
  plot = p1, width = 70, height = 70, units = 'mm', dpi = 300
)
ggsave(
  snakemake@output[[2]],
  plot = p2, width = 70, height = 70, units = 'mm', dpi = 300
)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/MR_sex_hormones_GA.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)

# Colour-blind-friendly palette used for the plot colours below.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial (regular + bold) with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])

# Row count before filtering; reused for the threshold line in the plot.
nr <- nrow(d)

# Keep categories passing a Bonferroni-style threshold of 0.05 / (rows - 1).
d <- d %>% filter(Enrichment_p < 0.05 / (nrow(d) - 1))

# Human-readable labels for the known categories; anything else becomes NA.
d$description <- with(
  d,
  ifelse(
    Category == 'H3K27ac_HniszL2_0', 'H3K27ac',
    ifelse(
      Category == 'SuperEnhancer_HniszL2_0', 'SuperEnhancer',
      ifelse(
        Category == 'Backgrd_Selection_StatL2_0', 'Background selection',
        ifelse(
          Category == 'CpG_Content_50kbL2_0', 'CpG content',
          ifelse(
            Category == 'BLUEPRINT_DNA_methylation_MaxCPPL2_0',
            'DNA Methylation',
            NA
          )
        )
      )
    )
  )
)

# Sort by descending p-value and freeze that order as the factor levels.
d <- d %>% arrange(desc(Enrichment_p))

d$description <- factor(d$description, levels = unique(d$description))

# Left panel: -log10 enrichment p-value per category (horizontal bars),
# with a dashed line at the 0.05 / (nr - 1) threshold.
p1 <- ggplot(data = d, aes(x = description, y = -log10(Enrichment_p))) +
  geom_col(fill = colorBlindBlack8[2], alpha = 0.6) +
  theme_cowplot(font_size = 10) +
  ylab('Enrichment -log10(pvalue)') +
  theme(axis.title.y = element_blank()) +
  geom_hline(
    yintercept = -log10(0.05 / (nr - 1)),
    linetype = 'dashed', colour = 'grey'
  ) +
  coord_flip()

# Right panel: enrichment value per category, category labels hidden,
# with a dashed reference line at 1.
p2 <- ggplot(data = d, aes(x = description, y = Enrichment)) +
  geom_col(fill = colorBlindBlack8[4], alpha = 0.6) +
  theme_cowplot(font_size = 10) +
  ylab('Enrichment (h2 / proportion of SNPs)') +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  geom_hline(yintercept = 1, linetype = 'dashed', colour = 'grey') +
  coord_flip()

# Place the two panels side by side.
x <- plot_grid(p1, p2)


# Save the combined figure (140 x 50 mm, 300 dpi) and the filtered table.
ggsave(
  snakemake@output[[1]],
  plot = x, height = 50, width = 140, units = 'mm', dpi = 300
)

fwrite(d, snakemake@output[[2]], sep = '\t')

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 2 of figures/partitioned_h2.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
options(warn=-1)


# QQ plot of observed vs. expected -log10(p-values).
#
# The input used to be read twice: once in full and once with only the
# needed columns, the first result being overwritten before any use. Only
# the column-restricted read is kept, which avoids loading the whole file.

colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# Read only the columns needed; `header = TRUE` is spelled out instead of
# relying on partial argument matching (`h`) and the reassignable `T`.
d <- fread(snakemake@input[[1]], header = TRUE, select = c('ID', 'pvalue', 'EAF'))
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

# Keep the most significant row for each ID.
d <- arrange(d, pvalue)
d <- d[!duplicated(d$ID), ]


# MAF tertile per variant (only used by the disabled labelling code below).
d <- mutate(d, maf_tertiles = ntile(MAF, 3))
#m1= round(max(d[d$maf_tertiles== 1, 'MAF']), 3)
#m2= round(max(d[d$maf_tertiles== 2, 'MAF']), 3)


#d$maf_tertiles= factor(d$maf_tertiles, levels=c("1", "2", "3"), labels=c(paste('MAF<', m1), paste(m1,'< MAF >', m2), paste('MAF>', m2)))

# Expected -log10(p) for the i-th smallest of n p-values is -log10(i / n).
# seq_along() is safe for an empty table, where 1:length() would yield c(1, 0).
df <- arrange(d, pvalue) %>% mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue)))

# Only points with p < 0.05 are drawn; the diagonal marks observed == expected.
p1 <- ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue))) +
  geom_point(size = 0.4, color = colorBlindBlack8[2]) +
  #scale_color_manual(values= colorBlindBlack8[c(2,4,8)])+
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour = "") +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))') +
  theme(legend.position = 'bottom')
#guides(colour = guide_legend(override.aes = list(size=3)))

ggsave(snakemake@output[[1]], plot = p1, width = 120, height = 120, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/QQ_plot.R

library(data.table)
library(dplyr)
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library('showtext')


# Colour-blind-friendly palette used for the plot colours below.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Register Arial (regular + bold) with showtext and render text at 300 dpi.
font_add("arial", "arial.ttf", bold = 'arial_bold.ttf')
showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

d <- fread(snakemake@input[[1]])


# Drop traits whose name contains 'BW', 'GA_fetal' or 'male'.
d <- d %>%
  filter(!grepl('BW', trait), !grepl('GA_fetal', trait), !grepl('male', trait))

# Map trait codes to display names; unmatched codes fall through to
# 'Endometriosis'.
d$trait <- with(
  d,
  ifelse(trait == 'miscarriage', 'Miscarriage',
  ifelse(trait == 'GA_fetal', 'GA fetal effect',
  ifelse(trait == 'BW_maternal', 'Maternal',
  ifelse(trait == 'AFB', 'Age at first birth',
  ifelse(trait == 'AMenarche', 'Age at menarche',
  ifelse(trait == 'AMenopause', 'Age at menopause',
  ifelse(trait == 'NLB', 'Number of live births',
  ifelse(trait == 'Testosterone_fem', 'Testosterone (women)',
  ifelse(trait == 'SHBG_fem', 'SHBG (women)',
  ifelse(trait == 'SHBG_male', 'SHBG (men)',
  ifelse(trait == 'CBAT_fem', 'CBAT (women)',
  ifelse(trait == 'CBAT_male', 'CBAT (men)',
  ifelse(trait == 'Oestradiol_fem', 'Oestradiol (women)',
  ifelse(trait == 'POP', 'Pelvic Organ Prolapse',
  ifelse(trait == 'Testosterone_male', 'Testosterone (men)',
  ifelse(trait == 'leiomyoma_uterus', 'Leiomyoma uterus',
  ifelse(trait == 'BW_fetal', 'Fetal',
  ifelse(trait == 'BW_fetal_effect', 'Fetal only',
  ifelse(trait == 'Preeclampsia', 'Pre-eclampsia',
  ifelse(trait == 'BW_maternal_effect', 'Maternal only',
  ifelse(trait == 'PCOS', 'Polycystic ovary syndrome', 'Endometriosis'
  )))))))))))))))))))))
)

# Trait clusters; anything not listed in the first three groups is treated
# as sex-hormone related.
pregnancy <- c('Miscarriage', 'Pre-eclampsia')
uterus <- c('Leiomyoma uterus', 'Pelvic Organ Prolapse', 'Endometriosis', 'Polycystic ovary syndrome')
fitness <- c('Age at first birth', 'Number of live births')
hormonal <- c('Age at menarche', 'Age at menopause', 'Testosterone (women)', 'SHBG (women)', 'CBAT (women)', 'Oestradiol (women)')

d$cluster <- with(
  d,
  ifelse(
    trait %in% pregnancy, 'Pregnancy',
    ifelse(
      trait %in% uterus, 'Reproductive organs',
      ifelse(trait %in% fitness, 'Fitness', 'Sex-hormone related')
    )
  )
)

# One palette colour per cluster.
d$colour <- with(
  d,
  ifelse(
    cluster == 'Pregnancy', colorBlindBlack8[3],
    ifelse(
      cluster == 'Reproductive organs', colorBlindBlack8[5],
      ifelse(cluster == 'Fitness', colorBlindBlack8[7], colorBlindBlack8[8])
    )
  )
)

# The gene label is the second '_'-separated field of `locus`.
d$GENE <- apply(d[, 'locus'], 1, function(loc) unlist(strsplit(loc, '_'))[2])

# Four loci are relabelled with a pair of gene names.
d$GENE <- with(
  d,
  ifelse(
    GENE == 'CDC42', 'CDC42/ WNT4',
    ifelse(
      GENE == 'HIVEP3', 'HIVEP3/ EDN2',
      ifelse(
        GENE == 'TET3', 'TET3/ DGUOK-AS1',
        ifelse(GENE == 'TCEA2', 'TCEA2/ OPRL1', GENE)
      )
    )
  )
)

# Flag rows whose PP.H4 exceeds 0.5.
d$sig <- ifelse(d$PP.H4.abf > 0.5, '*', '')

# Order rows by cluster and freeze the resulting trait order as factor levels.
d <- d %>% arrange(cluster)

d$trait <- factor(d$trait, levels = unique(d$trait))
traits <- unique(d$trait)
colors <- d %>% filter(!duplicated(trait)) %>% arrange(trait) %>% pull(colour)

# Signed score per trait/locus cell: PP.H4 when H4 exceeds H3, otherwise
# minus the sum of H3 and H4. PP2 is the larger of the two posteriors and
# drives the point size.
d$PP <- ifelse(d$PP.H4.abf > d$PP.H3.abf, d$PP.H4.abf, -d$PP.H3.abf - d$PP.H4.abf)
d$PP2 <- ifelse(d$PP.H4.abf > d$PP.H3.abf, d$PP.H4.abf, d$PP.H3.abf)

# Trait x locus grid of filled squares, coloured on a diverging scale
# centred on white. Legends are suppressed with guide = 'none'; the
# previous `guide = F` form is deprecated in ggplot2 and relied on the
# reassignable `F`.
p1 <- ggplot(d, aes(trait, GENE, value = PP, fill = PP, colour = PP, size = PP2, stroke = 1 - PP)) +
  theme_cowplot(font_size = 9) +
  geom_point(shape = 15) +
  scale_fill_gradient2(low = colorBlindBlack8[4], mid = 'white', high = colorBlindBlack8[2], guide = 'none') +
  scale_colour_gradient2(low = colorBlindBlack8[4], mid = 'white', high = colorBlindBlack8[2], guide = 'none') +
  scale_size_continuous(range = c(1, 2.5), guide = 'none') +
  scale_x_discrete(position = 'top') +
  theme(axis.ticks = element_blank(),
        axis.title = element_blank(),
        axis.text.x = element_blank()) +
  # Thin grey separators between every column and row of the grid.
  geom_vline(xintercept = 1:(length(unique(d$trait)) - 1) + 0.5, size = 0.4, colour = 'grey') +
  geom_hline(yintercept = 1:(length(unique(d$GENE)) - 1) + 0.5, size = 0.4, colour = 'grey') +
  # Thicker separators after the fitness, pregnancy and uterus groups.
  geom_vline(xintercept = cumsum(c(length(fitness), length(pregnancy), length(uterus))) + 0.5, size = 0.8) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA, size = 1),
        plot.margin = unit(c(0, 0.1, 0.1, 0), "cm"),
        axis.line = element_blank())

# Per-trait counts. 'Coloc' = number of loci with PP.H4 > 0.8.
# 'Locus-level' = loci with PP.H3 + PP.H4 > 0.8 that are not already counted
# as 'Coloc' (the difference of the two sums).
t_count_locus <- group_by(d, trait) %>% summarize(PP = sum(as.numeric(PP.H4.abf > 0.8)), PP_locus = sum(as.numeric(PP.H4.abf + PP.H3.abf > 0.8)))
t_count_locus$PP <- t_count_locus$PP_locus - t_count_locus$PP
t_count_locus$supp <- 'Locus-level'

t_count <- group_by(d, trait) %>% summarize(PP = sum(as.numeric(PP.H4.abf > 0.8)))
t_count$supp <- 'Coloc'

t_count <- bind_rows(t_count, t_count_locus)

# Same trait order as `d`; fixed level order for the support type.
t_count$trait <- factor(t_count$trait, levels = unique(d$trait))
t_count$supp <- factor(t_count$supp, levels = c('Locus-level', 'Coloc'))

# Downward stacked bars (counts are negated; axis labels show positive values).
# The legend is suppressed with guide = 'none'; the previous `guide = F` form
# is deprecated in ggplot2 and relied on the reassignable `F`.
p2 <- ggplot(t_count, aes(trait, -PP, fill = supp)) +
  theme_cowplot(font_size = 8) +
  geom_col(alpha = 0.7) +
  geom_hline(yintercept = 0) +
  scale_fill_manual(values = c(colorBlindBlack8[4], colorBlindBlack8[2]), guide = 'none') +
  theme(axis.line = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA, size = 1),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title = element_blank(),
        plot.margin = unit(c(0, 0, 0, 0.1), "cm")) +
  scale_y_continuous(limits = c(-10, 0), expand = c(0, 0), labels = seq(0, 10, 2), breaks = seq(0, -10, -2)) +
  # Thicker separators after the fitness, pregnancy and uterus groups.
  geom_vline(xintercept = cumsum(c(length(fitness), length(pregnancy), length(uterus))) + 0.5, size = 0.8) +
  geom_hline(yintercept = c(-4, -8), size = 0.3, linetype = 'dashed', colour = 'grey')

# Per-locus counts. 'Coloc' = number of traits with PP.H4 > 0.8.
# 'Locus-level' = traits with PP.H3 + PP.H4 > 0.8 that are not already
# counted as 'Coloc' (the difference of the two sums).
l_count_locus <- group_by(d, GENE) %>% summarize(PP = sum(as.numeric(PP.H4.abf > 0.8)), PP_locus = sum(as.numeric(PP.H4.abf + PP.H3.abf > 0.8)))
l_count_locus$PP <- l_count_locus$PP_locus - l_count_locus$PP
l_count_locus$supp <- 'Locus-level'

l_count <- group_by(d, GENE) %>% summarize(PP = sum(as.numeric(PP.H4.abf > 0.8)))
l_count$supp <- 'Coloc'

l_count <- bind_rows(l_count, l_count_locus)

# NOTE(review): this stores the GENE factor in a column named `trait`, which
# the plot below does not use (it maps the character `GENE` column). Left
# unchanged so the y-axis order keeps matching the heatmap's GENE axis.
l_count$trait <- factor(l_count$GENE, levels = unique(d$GENE))
l_count$supp <- factor(l_count$supp, levels = c('Locus-level', 'Coloc'))

print('done')
# Horizontal stacked bars, one per locus. The legend is suppressed with
# guide = 'none'; the previous `guide = F` form is deprecated in ggplot2 and
# relied on the reassignable `F`.
p3 <- ggplot(l_count, aes(PP, GENE, fill = supp)) +
  theme_cowplot(font_size = 8) +
  geom_col(alpha = 0.7) +
  geom_hline(yintercept = 0) +
  scale_fill_manual(values = c(colorBlindBlack8[4], colorBlindBlack8[2]), guide = 'none') +
  theme(axis.line = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA, size = 1),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title = element_blank(),
        plot.margin = unit(c(0, 0.1, 0, 0), "cm")) +
  scale_x_continuous(limits = c(0, 10), expand = c(0, 0), labels = seq(0, 10, 2), breaks = seq(0, 10, 2))

# x1: heatmap with the per-locus bars to its right.
# x2: heatmap with the per-trait bars underneath.
x1 <- plot_grid(p1, p3, nrow = 1, align = 'h', rel_widths = c(2, 0.5))
x2 <- plot_grid(p1, p2, nrow = 2, align = 'v', rel_heights = c(2, 0.3))

ggsave(snakemake@output[[1]], plot = x1, width = 127 - 1, height = 127 - 25 - 1, units = 'mm', dpi = 300)
ggsave(snakemake@output[[2]], plot = x2, width = 103 - 1, height = 127 - 25 - 1, units = 'mm', dpi = 300)

################## Genetic correlations

# Genetic correlations: rows whose `p1` mentions 'GAraw' are labelled
# 'Maternal'; rows whose `p1` mentions 'GA_fetal' are labelled 'Fetal'.
# Rows whose `p2` mentions 'BW' or 'male' are dropped in both cases.
d <- fread(snakemake@input[[2]])

d <- d %>% filter(grepl('GAraw', p1), !grepl('BW', p2), !grepl('male', p2))
#d$p1= 'Gestational duration (maternal)'
d$p1 <- 'Maternal'
x <- fread(snakemake@input[[2]])

x <- x %>% filter(grepl('GA_fetal', p1), !grepl('BW', p2), !grepl('male', p2))
#x$p1= 'Gestational duration (fetal)'
x$p1 <- 'Fetal'
d <- rbind(d, x)

# Reduce `p2` to the text after 'LDSC/', with '.txt.sumstats.gz' removed.
d$p2 <- gsub(
  '.txt.sumstats.gz', '',
  apply(d[, 'p2'], 1, function(path) unlist(strsplit(path, 'LDSC/'))[2])
)
d$trait <- d$p2

# Map trait codes to display names; unmatched codes fall through to
# 'Endometriosis'.
d$trait <- with(
  d,
  ifelse(trait == 'miscarriage', 'Miscarriage',
  ifelse(trait == 'GA_fetal', 'GA fetal effect',
  ifelse(trait == 'BW_maternal', 'Maternal',
  ifelse(trait == 'AFB', 'Age at first birth',
  ifelse(trait == 'AMenarche', 'Age at menarche',
  ifelse(trait == 'AMenopause', 'Age at menopause',
  ifelse(trait == 'NLB', 'Number of live births',
  ifelse(trait == 'Testosterone_fem', 'Testosterone (women)',
  ifelse(trait == 'SHBG_fem', 'SHBG (women)',
  ifelse(trait == 'SHBG_male', 'SHBG (men)',
  ifelse(trait == 'CBAT_fem', 'CBAT (women)',
  ifelse(trait == 'CBAT_male', 'CBAT (men)',
  ifelse(trait == 'Oestradiol_fem', 'Oestradiol (women)',
  ifelse(trait == 'POP', 'Pelvic Organ Prolapse',
  ifelse(trait == 'Testosterone_male', 'Testosterone (men)',
  ifelse(trait == 'leiomyoma_uterus', 'Leiomyoma uterus',
  ifelse(trait == 'BW_fetal', 'Fetal',
  ifelse(trait == 'BW_fetal_effect', 'Fetal only',
  ifelse(trait == 'Preeclampsia', 'Pre-eclampsia',
  ifelse(trait == 'BW_maternal_effect', 'Maternal only',
  ifelse(trait == 'PCOS', 'Polycystic ovary syndrome', 'Endometriosis'
  )))))))))))))))))))))
)


d <- d %>% filter(trait != 'GA fetal effect')

# Assign each trait to a cluster and each cluster to a palette colour.
d$cluster <- with(
  d,
  ifelse(
    trait %in% pregnancy, 'Pregnancy',
    ifelse(
      trait %in% uterus, 'Reproductive organs',
      ifelse(trait %in% fitness, 'Fitness', 'Sex-hormone related')
    )
  )
)

d$colour <- with(
  d,
  ifelse(
    cluster == 'Pregnancy', colorBlindBlack8[3],
    ifelse(
      cluster == 'Reproductive organs', colorBlindBlack8[1],
      ifelse(cluster == 'Fitness', colorBlindBlack8[7], colorBlindBlack8[8])
    )
  )
)

d <- d %>% arrange(cluster)

# Reuse the trait order established for the colocalisation heatmap.
d$trait <- factor(d$trait, levels = traits)

colors <- d %>% filter(!duplicated(trait)) %>% arrange(trait) %>% pull(colour)


# '**' below 0.05 divided by half the row count, '*' below 0.05.
d$sig <- ifelse(d$p < 0.05 / (nrow(d) / 2), '**', ifelse(d$p < 0.05, '*', ''))

# Only the maternal rows are plotted, under a single row label.
d <- d %>% filter(p1 == 'Maternal')
d$p1 <- 'Gestational duration'

# Single-row tile strip of genetic correlation (rg) per trait, with
# significance stars and x-axis labels coloured by trait cluster. The fill
# legend is suppressed with guide = 'none'; the previous `guide = F` form is
# deprecated in ggplot2 and relied on the reassignable `F`.
rg_plot <- ggplot(d, aes(trait, p1, fill = rg)) +
  geom_tile(colour = "white", size = 1) +
  theme_cowplot(font_size = 9) +
  scale_fill_gradient2(low = colorBlindBlack8[2], high = colorBlindBlack8[4], mid = 'white', guide = 'none') +
  theme(axis.text.x = element_text(angle = 45, hjust = 0),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()) +
  scale_x_discrete(position = "top") +
  geom_text(data = d, aes(trait, p1, label = sig), size = 6 / .pt) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA, size = 1),
        plot.margin = unit(c(0, 1, 0, 0), "cm"),
        axis.line = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 0, colour = colors))


# Stack the rg strip on top of the colocalisation heatmap (`p1`) and save.
x2 <- plot_grid(rg_plot, p1, nrow = 2, align = 'v', rel_heights = c(0.85, 2))

ggsave(snakemake@output[[3]], plot = x2, width = 113 - 2.5, height = 127 - 25 - 1, units = 'mm', dpi = 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/repr_pheno_coloc.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

x= fread(snakemake@input[[1]])

x$p1= gsub('.txt.sumstats.gz', '', apply(x[, 'p1'], 1, function(x) unlist(strsplit(x, 'LDscore/'))[2]))
x$p2= gsub('.txt.sumstats.gz', '', apply(x[, 'p2'], 1, function(x) unlist(strsplit(x, 'LDSC/'))[2]))

x1= fread(snakemake@input[[2]])

x1$p1= gsub('.txt.sumstats.gz', '', apply(x1[, 'p1'], 1, function(x) unlist(strsplit(x, 'LDscore/'))[2]))
x1$p2= gsub('.txt.sumstats.gz', '', apply(x1[, 'p2'], 1, function(x) unlist(strsplit(x, 'LDSC/'))[2]))
x1$rg= -1 * x1$rg
d= rbind(x, x1)

#traits= filter(d, p< 0.05/ 14, !grepl('BW', p2), !grepl('GA', p2)) %>% pull(p2)

# Display label for every known phenotype code in `p2`; any other non-missing
# code is labelled 'Endometriosis'.
trait_labels <- c(
  GAraw = 'Maternal gestational duration',
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal BW',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Estradiol (women)',
  POP = 'Pelvic organ prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal only',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal only',
  PCOS = 'Polycystic ovary syndrome'
)
pheno_code <- as.character(d$p2)
pretty_name <- unname(trait_labels[pheno_code])
# unknown codes get the fallback label; missing codes stay missing
pretty_name[is.na(pretty_name) & !is.na(pheno_code)] <- 'Endometriosis'
d$trait <- pretty_name

# Drop birth-weight, gestational-age and male-specific phenotypes.
d <- d %>%
  filter(!grepl('BW', p2), !grepl('GA', p2), !grepl('_male', p2))

# Order the axis by ascending p-value (first occurrence of each trait wins).
traits <- d %>% arrange(p) %>% pull(trait) %>% unique()
d$trait <- factor(d$trait, levels= rev(traits))

# Forest plot of genetic correlations (point = rg, range = rg +/- 1.96 * se),
# one row per trait and one colour per level of the `p1` column.
# `colour_scale` is the colour scale layer to add; the two figures below
# differ only in whether that scale draws a legend.
rg_forest_plot= function(data, colour_scale) {
  ggplot(data, aes(rg, trait, colour= p1)) +
    geom_pointrange(aes(xmax= rg + 1.96 * se, xmin= rg - 1.96 * se), position = position_dodge(width = 0.3), fatten= 1) +
    colour_scale +
    theme_cowplot(font_size= 9) +
    scale_x_continuous(limits= c(-1, 1), breaks= seq(-1, 1, 0.5)) +
    xlab('Genetic correlation') +
    geom_vline(xintercept= 0, size= 0.3) +
    geom_vline(xintercept= c(seq(-1, 1, 0.25)), colour= 'grey', linetype= 'dashed', alpha= 0.5, size= 0.2) +
    theme(axis.line.x = element_line(size = 0.3),
          axis.line.y = element_line(size = 0.3),
          axis.ticks= element_line(size= 0.3),
          axis.title.y= element_blank())
}

# Version without a legend; guide= 'none' replaces the deprecated guide= FALSE.
p1= rg_forest_plot(d, scale_colour_manual(values= colorBlindBlack8[c(8,3)], guide= 'none'))

ggsave(snakemake@output[[1]], plot= p1, width= 88, height= 120, units= 'mm', dpi= 300)

# Table behind the figure.
fwrite(d, snakemake@output[[2]], sep= '\t')

# Same figure with the colour legend shown.
p1= rg_forest_plot(d, scale_colour_manual(values= colorBlindBlack8[c(8,3)], name= 'Trait'))

ggsave(snakemake@output[[3]], plot= p1, width= 88, height= 120, units= 'mm', dpi= 300)

R ggplot2 tidyverse dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/repr_pheno_correlations.R

library(scales)
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
library(tidyverse)
library(fmsb)

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")


font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)


# Reduce a file path to the bare phenotype code: keep the piece that follows
# `marker` and strip the '.txt.sumstats.gz' suffix.
pheno_from_path <- function(paths, marker) {
  after_marker <- vapply(strsplit(paths, marker), function(pieces) pieces[2], character(1))
  gsub('.txt.sumstats.gz', '', after_marker)
}

x <- fread(snakemake@input[[1]])
x$p1 <- pheno_from_path(x[['p1']], 'LDscore/')
x$p2 <- pheno_from_path(x[['p2']], 'LDSC/')

x1 <- fread(snakemake@input[[2]])
x1$p1 <- pheno_from_path(x1[['p1']], 'LDscore/')
x1$p2 <- pheno_from_path(x1[['p2']], 'LDSC/')

x <- rbind(x, x1)

# Phenotypes passing p < 0.05 / 14, excluding birth-weight and gestational-age ones.
traits <- x %>%
  filter(p< 0.05/ 14, !grepl('BW', p2), !grepl('GA', p2)) %>%
  pull(p2) %>%
  unique()

# gcp.pm estimates whose two-tailed p-value misses this threshold are zeroed.
gcp_alpha <- 0.05 / length(traits)

d <- fread(snakemake@input[[3]])
df <- fread(snakemake@input[[4]])

# Supplementary table: both inputs stacked untouched, tagged with the outcome.
table_supp <- d
table_supp$pheno <- 'Gestational duration'
table_supp2 <- df
table_supp2$pheno <- 'Preterm delivery'
table_supp <- rbind(table_supp, table_supp2)

# First input: mask, keep the selected traits, sort by decreasing gcp.pm.
d$gcp.pm <- ifelse(d$pval.gcpzero.2tailed < gcp_alpha, d$gcp.pm, 0)
d <- d %>%
  filter(repr_pheno %in% traits) %>%
  arrange(desc(gcp.pm))

# Second input: same masking and trait selection.
df$gcp.pm <- ifelse(df$pval.gcpzero.2tailed < gcp_alpha, df$gcp.pm, 0)
df <- filter(df, repr_pheno %in% traits)

# Columns shared by both tables receive the .x (first input) / .y (second input) suffixes.
d <- inner_join(d, df, by= 'repr_pheno')
# Display label for every known phenotype code; any other non-missing code is
# labelled 'Endometriosis'.
trait_labels= c(
  GAraw = 'Maternal gestational duration',
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal BW',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal only',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal only',
  PCOS = 'Polycystic ovary syndrome'
)
pheno_code= as.character(d$repr_pheno)
pretty_name= unname(trait_labels[pheno_code])
# unknown codes get the fallback label; missing codes stay missing
pretty_name[is.na(pretty_name) & !is.na(pheno_code)]= 'Endometriosis'
d$trait= pretty_name

d$repr_pheno= d$trait

# One row per outcome, one column per trait. gcp.pm.x comes from the table
# tagged 'Gestational duration' and gcp.pm.y from the one tagged
# 'Preterm delivery', so the row names follow that order.
x= as.data.frame(rbind(d$gcp.pm.x, d$gcp.pm.y))

names(x)= d$repr_pheno
rownames(x)= c('Gestational duration', 'Preterm delivery')
# radarchart() reads the first two rows as the axis maximum and minimum.
x= rbind(rep(1,nrow(d)) , rep(0,nrow(d)) , x)

# Millimetres per inch; pdf() takes its size in inches.
inches= 25.4

# Line colours of the two plotted series; the fill reuses them at 40% opacity.
series_cols <- colorBlindBlack8[c(3, 8)]

# Axis labels, in column order of `x`.
# NOTE(review): these are hard-coded, while the columns of `x` follow the
# data-driven trait order; confirm the two still agree when inputs change.
axis_labels <- c('Testosterone\n(women)', 'Age at\nfirst birth', 'Age at\nmenopause', 'Number of\nlive births', 'SHBG\n(women)', 'CBAT\n(women)')

pdf(snakemake@output[[1]], width= 88 / inches, height= 88 / inches)
par(mar= c(0, 0, 0, 0))

radarchart(
    abs(x),
    axistype= 0,
    # polygons
    pcol= series_cols,
    pfcol= alpha(series_cols, 0.4),
    plwd= 1, pty= 16, plty= 1,
    # axis labels
    vlcex= 0.8,
    vlabels= axis_labels,
    # grid
    cglcol= "grey", cglty= 1, axislabcol= "#525252",
    caxislabels= seq(0, 1, 0.25), cglwd= 0.8, calcex= 0.4
)

dev.off()

# Display label for every known phenotype code; any other non-missing code is
# labelled 'Endometriosis'.
supp_labels <- c(
  GAraw = 'Maternal gestational duration',
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Maternal BW',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Fetal',
  BW_fetal_effect = 'Fetal only',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Maternal only',
  PCOS = 'Polycystic ovary syndrome'
)
supp_code <- as.character(table_supp$repr_pheno)
supp_name <- unname(supp_labels[supp_code])
# unknown codes get the fallback label; missing codes stay missing
supp_name[is.na(supp_name) & !is.na(supp_code)] <- 'Endometriosis'
table_supp$trait <- supp_name

# Supplementary table with both outcomes and readable trait names.
fwrite(table_supp, snakemake@output[[2]], sep= '\t')

R tidyverse dplyr data.table tidyr cowplot ggrepel knitr scales showtext From line 1 of figures/repr_pheno_LCV.R

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')
# NOTE(review): this silences every warning for the remainder of the script.
options(warn=-1)


# Per-tissue enrichment table. The argument is spelled out in full instead of
# relying on partial matching (`h`) and the reassignable shorthand `T`.
d= fread(snakemake@input[[1]], header= TRUE)


colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

font_add("arial", "arial.ttf", bold= 'arial_bold.ttf')

showtext_opts(dpi = 300)
showtext_auto(enable = TRUE)

# Tissue names that make up each organ group; everything else is 'Others'.
female_repr= c('breast', "cervix, uterine", 'endometrium', 'ovary', 'placenta', 'vagina', 'fallopian tube')
male_repr= c('ductus deferens', 'testis', 'seminal vesicle', 'prostate', 'epididymis')
muscle= c('smooth muscle', 'heart muscle', 'skeletal muscle')

d$organ= with(d,
	ifelse(tissue %in% female_repr, 'Female reproductive',
	ifelse(tissue %in% male_repr, 'Male reproductive',
	ifelse(tissue %in% muscle, 'Muscle', 'Others'))))

# Colour per organ group. Naming the values ties each colour to its group
# even when a group is absent from the data.
organ_colours= c('Female reproductive'= colorBlindBlack8[3],
                 'Male reproductive'= colorBlindBlack8[2],
                 'Muscle'= colorBlindBlack8[8],
                 'Others'= 'grey')

# Scatter of enrichment (i_listmedian / base_list_median) against
# -log10(MannW_pvalue), labelling tissues with MannW_pvalue < 0.05.
# Dashed lines mark p = 0.05 and p = 0.05 / number of rows in `d`.
#   font_size    base font size handed to theme_cowplot()
#   colour_guide guide for the colour scale: 'none' or 'legend'
enrichment_plot= function(font_size, colour_guide) {
  ggplot(d, aes(-log10(MannW_pvalue), I(i_listmedian/ base_list_median), colour= organ)) +
    geom_point(size= 1.5) +
    theme_cowplot(font_size= font_size) +
    scale_colour_manual('Legend', values= organ_colours, guide= colour_guide) +
    # show.legend replaces the deprecated show_guide argument
    geom_text_repel(data= filter(d, MannW_pvalue< 0.05), aes(label= tissue), fontface = 'bold', show.legend = FALSE) +
    geom_vline(xintercept= -log10(0.05), colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
    geom_vline(xintercept= -log10(0.05/nrow(d)), colour= colorBlindBlack8[8], linetype= 'dashed', size= 0.2, alpha= 0.6) +
    ylab('Enrichment') +
    xlab('-log10(pvalue)')
}

# Figure without legend.
p1= enrichment_plot(8, 'none')

ggsave(snakemake@output[[1]], plot= p1, width= 120, height= 90, units= 'mm', dpi= 300)

# Same figure with the colour legend and a larger font.
p1= enrichment_plot(10, 'legend')

ggsave(snakemake@output[[2]], plot= p1, width= 120, height= 90, units= 'mm', dpi= 300)

R ggplot2 dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of figures/RNA_enrichment.R

script:
	'manhattan_plot.R'

SnakeMake From line 11 of figures/Snakefile

script:
	'lm_effect_origin.R'

SnakeMake From line 25 of figures/Snakefile

script:
	'effect_origin_dendrogram.R'

SnakeMake From line 37 of figures/Snakefile

script:
	'effect_origin_ternary.R'

SnakeMake From line 48 of figures/Snakefile

script:
	'gene_based_vs_coloc_iPSC.R'

SnakeMake From line 64 of figures/Snakefile

script:
	'BW_coloc_spider.R'

SnakeMake From line 74 of figures/Snakefile

script:
        'KCNAB1_pheWAS.R'

SnakeMake From line 87 of figures/Snakefile

script:
	'ADCY5_pheWAS.R'

SnakeMake From line 101 of figures/Snakefile

script:
	'ADCY5_FST_AFR_EUR.R'

SnakeMake From line 118 of figures/Snakefile

script:
	'BW_genetic_correlations.R'

SnakeMake From line 129 of figures/Snakefile

script:
        'repr_pheno_correlations.R'

SnakeMake From line 143 of figures/Snakefile

script:
	'partitioned_h2.R'

SnakeMake From line 153 of figures/Snakefile

script:
	'MacArthurlab_enrichment.R'

SnakeMake From line 164 of figures/Snakefile

script:
	'ADCY5_effect_direction.R'

SnakeMake From line 180 of figures/Snakefile

script:
	'BW_conditioning.R'

SnakeMake From line 193 of figures/Snakefile

script:
        'BW_conditioning_top.R'

SnakeMake From line 207 of figures/Snakefile

script:
        'mediation_BW_GA_individual_level_data.R'

SnakeMake From line 218 of figures/Snakefile

script:
        'mediation_BW_GA_individual_level_data_decode.R'

SnakeMake From line 230 of figures/Snakefile

script:
	'MR_GA_BW_haplotype.R'

SnakeMake From line 245 of figures/Snakefile

script:
	'repr_pheno_coloc.R'

SnakeMake From line 257 of figures/Snakefile

script:
	'repr_pheno_LCV.R'

SnakeMake From line 270 of figures/Snakefile

script:
	'repr_pheno_correlations.R'

SnakeMake From line 282 of figures/Snakefile

script:
	'RNA_enrichment.R'

SnakeMake From line 292 of figures/Snakefile

script:
	'QQ_plot.R'

SnakeMake From line 305 of figures/Snakefile

script:
	'h2_allphenos.R'

SnakeMake From line 316 of figures/Snakefile

script:
	'h2_cohorts.R'

SnakeMake From line 326 of figures/Snakefile

script:
        'genet_correlations_meta.R'

SnakeMake From line 335 of figures/Snakefile

script:
	'manhattan_plot_postTerm.R'

SnakeMake From line 346 of figures/Snakefile

script:
        'manhattan_plot_postTerm.R'

SnakeMake From line 357 of figures/Snakefile

script:
	'forest_plot_EEFSEC.R'

SnakeMake From line 368 of figures/Snakefile

script:
	'MR_sex_hormones_GA.R'

SnakeMake From line 380 of figures/Snakefile

script:
	'cell_type_enrichment.R'

SnakeMake From line 391 of figures/Snakefile

script:
	'labor_deg.R'

SnakeMake From line 402 of figures/Snakefile

script:
	'coloc_sex_hormones.R'

SnakeMake From line 413 of figures/Snakefile

script:
	'evo.R'

SnakeMake From line 423 of figures/Snakefile

script:
	'GA_BW_PGS_correlations.R'

SnakeMake From line 432 of figures/Snakefile

script:
	'GAraw_vs_allPTD.R'

SnakeMake From line 445 of figures/Snakefile

run:
	# Headerless gene table -> tab-separated gene list with a header row.
	cols= ['CHR', 'start', 'end', 'geneSymbol', 'Ensembl_gene']
	genes= pd.read_csv(input[0], sep= '\t', header= None, names= cols)
	# discard entries whose symbol contains a space, then drop the Ensembl column
	has_space= genes['geneSymbol'].str.contains(' ')
	genes= genes.loc[~has_space, cols[:4]]
	genes.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 11 of gene_based/Snakefile

run:
	# Write the individual IDs of samples from the listed populations, one per line.
	d= pd.read_csv(input[0], sep='\t', header= 0)
	pop= ['CEU', 'TSI', 'FIN', 'GBR', 'IBS']
	# Select rows and the ID column in one step; assigning a new column on a
	# filtered slice triggers pandas' chained-assignment warning.
	ids= d.loc[d.Population.isin(pop), ['Individual ID']]
	ids.to_csv(output[0], sep= '\t', header= False, index= False)

SnakeMake From line 24 of gene_based/Snakefile

run:
	# Concatenate only the inputs whose path contains '1000g'; the filtered
	# list is what gets passed to bcftools.
	vcfs= [x for x in input if '1000g' in x]
	shell('/home/pol/software/bcftools-1.9/bcftools concat {vcfs} -o {output[0]} -Oz')

SnakeMake BCFtools From line 37 of gene_based/Snakefile

shell:
	'/home/pol/software/plink2 --vcf {input[0]} --max-alleles 2 --keep {input[1]} --make-bed --out {params[0]}'

SnakeMake plink2 From line 52 of gene_based/Snakefile

run:
	# Rewrite the variant IDs of a headerless six-column table (presumably a PLINK .bim file -- confirm)
	# as CHR:POS:allele:allele, with the two allele codes in sorted order.
	d= pd.read_csv(input[0], sep= '\t', header= None, names= ['CHR', 'RSID', 'cm', 'POS', 'A1', 'A2'])
	# The longer allele of a pair is coded 'I' ...
	d['REF']= np.where(d.A1.str.len() > d.A2.str.len(), 'I', d.A1)
	d['EFF']= np.where(d.A2.str.len() > d.A1.str.len(), 'I', d.A2)
	# ... and the allele opposite an 'I' is coded 'D'. These four steps depend on their order.
	d['REF']= np.where(d.EFF== 'I', 'D', d.REF)
	d['EFF']= np.where(d.REF== 'I', 'D', d.EFF)
	# Build the ID with the smaller allele code first.
	d['RSID']= np.where(d.REF > d.EFF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF + ':' + d.REF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF + ':' + d.EFF)
	# Only the ID column changes; A1/A2 are written back untouched.
	d= d[['CHR', 'RSID', 'cm', 'POS', 'A1', 'A2']]
	d.to_csv(output[0], sep= '\t', header= False, index= False)
	# The two companion inputs are moved (not copied) to their output names.
	shell('mv {input[1]} {output[1]}')
	shell('mv {input[2]} {output[2]}')

SnakeMake From line 65 of gene_based/Snakefile

run:
	# Two-column SNP / p table built from the gzipped summary statistics.
	d= pd.read_csv(input[0], header= 0, sep= '\t', compression= 'gzip', usecols= ['ID', 'pvalue'])
	d.dropna(subset= ['ID'], inplace= True)
	d= d.loc[d.ID != '-', :]
	d= d[['ID', 'pvalue']]
	d.columns= ['SNP', 'p']
	# '^23:' is a regular expression (anchored prefix). regex= True is required
	# because pandas >= 2.0 treats the pattern as a literal string by default.
	d['SNP']= d.SNP.str.replace('^23:', 'X:', regex= True)
	d.to_csv(output[0], sep= '\t', header= True, index= None, columns= ['SNP', 'p'])

SnakeMake From line 83 of gene_based/Snakefile

shell:
	'/home/pol/software/gcta_1.93.2beta/gcta64 --bfile {params[0]} --maf 0.01 --fastBAT {input[1]} --fastBAT-gene-list {input[2]} --out {params[1]} --thread-num {threads}'

SnakeMake From line 105 of gene_based/Snakefile

run:
	# Input column -> output column, listed in output order.
	wanted= {'RSID': 'SNP', 'CHR': 'CHR', 'POS': 'POS', 'pvalue': 'P'}
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= list(wanted))
	d= d[list(wanted)].rename(columns= wanted)
	d.to_csv(output[0], header= True, index= False, sep= '\t')

SnakeMake From line 12 of independent/Snakefile

run:
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS'])
        d.sort_values(['CHR', 'POS'], inplace= True)
        d['pos2']= d.POS

SnakeMake From line 23 of independent/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d= d.loc[d.Relationship== 'unrel', :]
	pop= ['CEU', 'TSI', 'FIN', 'GBR', 'IBS']

SnakeMake From line 38 of independent/Snakefile

run:
	# Concatenate every input whose path contains 'vcf', restricted to the
	# regions file given as the first input (-R); -a allows overlapping files
	# and -O v writes uncompressed VCF.
	vcfs= [infile for infile in input if 'vcf' in infile]
	shell('/home/pol/software/bcftools-1.9/bcftools concat -a -O v -R {input[0]} {vcfs} -o {output[0]}')

SnakeMake BCFtools From line 54 of independent/Snakefile

shell:
	'/home/pol/software/plink --vcf {input[0]} --keep {input[1]} --make-bed -out {params[0]}'

SnakeMake pLink From line 68 of independent/Snakefile

run:
	# List every SNP ID that occurs more than once in the variant table.
	d= pd.read_csv(input[0], sep= '\t', header= None, names= ['chr', 'snp', 'x1', 'pos', 'a1', 'a2'])
	d= d[d.duplicated(['snp'], keep=False)]
	# drop_duplicates returns a new frame; keep it so each duplicated ID is written once
	d= d.drop_duplicates(subset= ['snp'], keep= 'first')
	d.to_csv(output[0], sep= '\t', header= False, index= False)

SnakeMake From line 77 of independent/Snakefile

shell:
	'~/software/plink --bfile {params[0]} --clump {input[0]} --exclude {input[1]} --clump-r2 0.05 --clump-kb 1000 --clump-p1 5e-8 --clump-p2 1e-5 --out {params[1]}'

SnakeMake pLink From line 94 of independent/Snakefile

library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# Output files: posterior summaries and per-SNP results.
pph_outfile <- snakemake@output[[1]]
results_outfile <- snakemake@output[[2]]

# GWAS summary statistics with minor allele frequency derived from EAF.
df <- fread(snakemake@input[[1]], select= c('ID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF'))
df$MAF <- ifelse(df$EAF>0.5, 1 - df$EAF, df$EAF)

# Second dataset, joined on variant ID.
# NOTE(review): 716 is used as its sample size for every row -- confirm the source.
z <- fread(snakemake@input[[2]])
z$n <- 716
df <- data.frame(inner_join(df, z, by= 'ID'))

rm(z)

# Header lines; outputs whose name contains 'sQTL' get an extra 'gene' column.
if (grepl('sQTL', pph_outfile)) {

cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\tgene\n', file = pph_outfile)

cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\tgene\n', file= results_outfile)

} else {

cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = pph_outfile)

cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file= results_outfile)

}

# Prior probabilities passed to coloc.abf() as p1, p2 and p12.
prior1 <- 1 * 10**-4
prior2 <- 1 * 10**-4
prior12 <- 5 * 10**-6

# Append the all-zero placeholder rows written when a gene cannot be tested.
write_null_coloc= function(protein) {
	PPH= data.frame(nsnps= 0, PP.H0.abf= 0, PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, protein= protein)
	fwrite(PPH, pph_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
	res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, protein= protein)
	fwrite(res, results_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
	print('next')
}

# Run coloc.abf() for the rows of one gene and append the outcome to
# `pph_outfile` (posterior summary) and `results_outfile` (per-SNP table).
#   temp_df  rows of the joined table for a single value of `gene`
# Placeholder rows are written when no usable rows remain or coloc.abf() fails.
# Reads the globals pph_outfile, results_outfile, prior1, prior2, prior12 and
# the snakemake object.
# NOTE(review): this is called through mclapply(), so several processes append
# to the same two files; confirm that interleaved writes are acceptable.
colocalization_eqtl= function(temp_df){
	protein= unique(temp_df$gene)
	# an empty group has no gene name to report
	if (length(protein) == 0) return(invisible(NULL))

	# both standard errors must be positive to form variances
	temp_df= filter(temp_df, SE> 0, se> 0)
	if (nrow(temp_df) == 0) return(write_null_coloc(protein))

	# Dataset 1: case-control with a fixed case proportion for the allPTD and
	# postTerm inputs, quantitative otherwise.
	gwas_input= snakemake@input[[1]]
	data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N= temp_df$TOTALSAMPLESIZE, type= 'quant', snp= temp_df$ID)
	if (grepl('allPTD', gwas_input)) {
		data1$type= 'cc'
		data1$s= 0.067
	} else if (grepl('postTerm', gwas_input)) {
		data1$type= 'cc'
		data1$s= 0.122
	}

	# Dataset 2: always treated as quantitative.
	data2= list(beta= temp_df$beta, varbeta= temp_df$se**2, N= temp_df$n, type= 'quant', snp= temp_df$ID)

	# a failed run is reported as NULL and replaced by placeholder rows
	myres= tryCatch(
		suppressWarnings(coloc.abf(data1, data2, MAF= temp_df$MAF, p1= prior1, p2= prior2, p12= prior12)),
		error= function(e) NULL)
	if (is.null(myres)) return(write_null_coloc(protein))

	# first element: posterior summary; second element: per-SNP results
	PPH= data.frame(t(myres[[1]]))
	PPH$protein= protein
	fwrite(PPH, pph_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
	res= myres[[2]]
	res$protein= protein
	fwrite(res, results_outfile, sep= '\t', row.names= FALSE, col.names= FALSE, quote= FALSE, append= TRUE)
}



mclapply(split(df, df$gene), colocalization_eqtl, mc.cores= 3)

R dplyr data.table Quant coloc From line 1 of iPSC/coloc_iPSC.R

script:
	'coloc_iPSC.R'

SnakeMake From line 15 of iPSC/Snakefile

library(dplyr)
library(data.table)

# Z-score tables of the two traits (rows without a Z are dropped) and the LD scores.
d= fread(snakemake@input[[1]])
d= filter(d, !is.na(Z))

x= fread(snakemake@input[[2]])
x= filter(x, !is.na(Z))

ld= fread(snakemake@input[[3]])

# After the joins, .x columns come from the first input and .y from the second.
d= inner_join(d, x, by= 'SNP')
d= inner_join(d, ld, by= 'SNP')


source(snakemake@params[[1]])
# RunLCV() is run from the directory given in params[[2]]; the previous
# working directory is restored afterwards so a relative output path still
# resolves where Snakemake expects it.
old_wd= setwd(snakemake@params[[2]])

LCV= RunLCV(d$L2, d$Z.y, d$Z.x, ldsc.intercept= 1, n.1= (d$N.y), n.2= (d$N.x))

setwd(old_wd)

# One-row result table; fwrite() writes the header from the column names, so
# no separate header line is needed.
z= data.frame(zscore= LCV$zscore, pval.gcpzero.2tailed= LCV$pval.gcpzero.2tailed, gcp.pm= LCV$gcp.pm, gcp.pse= LCV$gcp.pse, rho.est= LCV$rho.est, rho.err= LCV$rho.err, pval.fullycausal1= LCV$pval.fullycausal[1],pval.fullycausal2= LCV$pval.fullycausal[2], h2.zscore1= LCV$h2.zscore[1], h2.zscore2= LCV$h2.zscore[2], pheno= snakemake@wildcards[['pheno']], repr_pheno= snakemake@wildcards[['repr_pheno']])

fwrite(z, snakemake@output[[1]], sep= '\t')

R dplyr data.table From line 1 of LCV/LCV.R

script:
	'LCV.R'

SnakeMake From line 13 of LCV/Snakefile

	shell:
		'''
		head -1 {input[0]} > {output[0]}
                tail -n +2 -q {input} >> {output[0]}
		'''

SnakeMake From line 22 of LCV/Snakefile

import pandas as pd
import numpy as np
from scipy.special import chdtri
import gzip
import csv

def not_number(s):
	"""Return True when `s` cannot be converted with float().

	None counts as "not a number", as does any value float() rejects,
	whether by ValueError (e.g. 'NA') or TypeError (e.g. a list).
	"""
	if s is None:
		return True
	try:
		float(s)
	except (TypeError, ValueError):
		return True
	return False


def select_format(repr_pheno, row):
	"""Parse `row` with the formatter that belongs to `repr_pheno`.

	Returns the formatter's list
	[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].
	Raises ValueError when `repr_pheno` has no formatter, instead of
	failing on unassigned local variables at the return statement.
	"""
	if repr_pheno == 'Preeclampsia':
		return preeclampsia(row)
	if repr_pheno == 'POP':
		return POP(row)
	if repr_pheno == 'miscarriage':
		return miscarriage(row)
	if repr_pheno == 'GA_fetal':
		return fet_GA(row)
	if repr_pheno == 'BW_maternal':
		return BW_maternal(row)
	if repr_pheno == 'BW_fetal':
		return BW_fetal(row)
	if repr_pheno == 'BW_maternal_effect':
		return BW_maternal_adjusted_effect(row)
	if repr_pheno == 'BW_fetal_effect':
		return BW_fetal_adjusted_effect(row)
	if repr_pheno == 'leiomyoma_uterus':
		return leiomyoma_uterus(row)
	if repr_pheno == 'AMenopause':
		return AMenopause(row)
	if repr_pheno in ['Oestradiol_fem', 'NLB', 'AFB', 'AMenarche', 'endometriosis']:
		return UKBB_traits(row)
	if repr_pheno in ['SHBG_fem', 'Testosterone_fem', 'Testosterone_male', 'SHBG_male', 'CBAT_fem', 'CBAT_male']:
		return pritchard(row)
	if repr_pheno == 'PCOS':
		return PCOS(row)
	if repr_pheno in ['Ruth_CBAT_female', 'Ruth_CBAT_male', 'Ruth_SHBG_female', 'Ruth_SHBG_male', 'Ruth_Testosterone_female', 'Ruth_Testosterone_male', 'Ruth_oestradiol']:
		return Ruth(row, repr_pheno)
	raise ValueError('No summary-statistics formatter for phenotype: ' + str(repr_pheno))


def AMenopause(row):
	"""Map one REPROGEN age-at-menopause record onto the shared field order.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]; rsid is
	left empty, chromosome 'X' becomes 23 and alleles are upper-cased.
	"""
	freq = float(row['EAF'])
	chrom = int(23 if row['CHR'] == 'X' else row['CHR'])
	position = int(row['POS'])
	other_allele = row['Other_Allele'].upper()
	effect_allele = row['Effect_Allele'].upper()
	beta = float(row['Effect'])
	pval = float(row['Pval'])
	stderr = float(row['SE'])
	n = int(row['N'])
	return ['', chrom, position, freq, n, other_allele, effect_allele, beta, stderr, pval]

def Ruth(row, repr_pheno):
	"""Parse one record of the 'Ruth_*' sex-hormone summary statistics.

	`repr_pheno` selects the sample size, which is not in the file.
	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue] with
	chromosome 'X' recoded as 23.
	"""
	# Sample size per phenotype; any other name (i.e. 'Ruth_oestradiol')
	# gets the default below. Each male trait has its own entry: a misspelt
	# or repeated key would silently hand it another trait's N.
	# NOTE(review): confirm these counts against the source publication.
	sample_sizes = {
		'Ruth_SHBG_female': 189473,
		'Ruth_SHBG_male': 180726,
		'Ruth_Testosterone_female': 230454,
		'Ruth_Testosterone_male': 194453,
		'Ruth_CBAT_female': 188507,
		'Ruth_CBAT_male': 178782,
	}
	EAF= float(row['effect_allele_frequency'])
	CHR= row['chromosome']
	if CHR== 'X': CHR= 23
	CHR= int(CHR)
	POS= int(row['base_pair_location'])
	REF= row['other_allele']
	EFF= row['effect_allele']
	BETA= float(row['beta'])
	pvalue= float(row['p_value'])
	SE= float(row['standard_error'])
	N= sample_sizes.get(repr_pheno, 206927)
	rsid= row['variant_id']
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]

def pritchard(row):
	"""Parse one record with #CHROM/POS/REF/ALT/A1_FREQ/OBS_CT/BETA/SE/P columns.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or a list
	of ten zeros when the chromosome, BETA, SE or P is not numeric.
	"""
	skipped = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	freq = float(row['A1_FREQ'])
	chrom = 23 if row['#CHROM'] == 'X' else row['#CHROM']
	if not_number(chrom):
		return skipped
	position = int(row['POS'])
	chrom = int(chrom)
	ref_allele = row['REF']
	alt_allele = row['ALT']
	n = int(row['OBS_CT'])
	# effect size, standard error and p-value must all parse as numbers
	for key in ('BETA', 'SE', 'P'):
		if not_number(row[key]):
			return skipped
	beta = float(row['BETA'])
	stderr = float(row['SE'])
	pval = float(row['P'])
	return [row['ID'], chrom, position, freq, n, ref_allele, alt_allele, beta, stderr, pval]

def leiomyoma_uterus(row):
	"""Parse one leiomyoma-of-uterus record.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue] with an
	empty rsid, or ten zeros when the chromosome is not numeric. N is
	passed through exactly as read from TOTALSAMPLESIZE.
	"""
	freq = float(row['EAF'])
	chrom = 23 if row['CHR'] == 'X' else row['CHR']
	if not_number(chrom):
		return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	position = int(row['POS'])
	chrom = int(chrom)
	ref_allele = row['REF']
	eff_allele = row['EFF']
	n = row['TOTALSAMPLESIZE']
	beta = float(row['beta'])
	stderr = float(row['se'])
	pval = float(row['pvalue'])
	return ['', chrom, position, freq, n, ref_allele, eff_allele, beta, stderr, pval]

def preeclampsia(row):
	"""Parse one pre-eclampsia record.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or ten
	zeros when the chromosome is not numeric. Alleles are upper-cased and
	N is the fixed sum 4630 + 373345.
	"""
	chrom = 23 if row['CHR'] == 'X' else row['CHR']
	if not_number(chrom):
		return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	position = int(row['POS'])
	chrom = int(chrom)
	ref_allele = row['REF'].upper()
	eff_allele = row['EFF'].upper()
	n = 4630 + 373345
	snp_id = row['rsid']
	beta = float(row['beta'])
	stderr = float(row['se'])
	freq = float(row['EAF'])
	pval = float(row['pvalue'])
	return [snp_id, chrom, position, freq, n, ref_allele, eff_allele, beta, stderr, pval]

def BW_fetal_adjusted_effect(row):
	"""Parse one birth-weight record whose sample size is in 'n_ownBW'.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]; alleles
	are upper-cased and an allele coded 'R' is rewritten as 'D'.
	"""
	recode = {'R': 'D'}
	freq = float(row['eaf'])
	chrom = int(23 if row['chr'] == 'X' else row['chr'])
	position = int(row['pos'])
	non_effect = row['nea'].upper()
	non_effect = recode.get(non_effect, non_effect)
	effect = row['ea'].upper()
	effect = recode.get(effect, effect)
	beta = float(row['beta'])
	pval = float(row['p'])
	stderr = float(row['se'])
	n = int(row['n_ownBW'])
	return [row['RSID'], chrom, position, freq, n, non_effect, effect, beta, stderr, pval]

def BW_maternal_adjusted_effect(row):
	"""Parse one birth-weight record whose sample size is in 'n_offBW'.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]; alleles
	are upper-cased and an allele coded 'R' is rewritten as 'D'.
	"""
	recode = {'R': 'D'}
	freq = float(row['eaf'])
	chrom = int(23 if row['chr'] == 'X' else row['chr'])
	position = int(row['pos'])
	non_effect = row['nea'].upper()
	non_effect = recode.get(non_effect, non_effect)
	effect = row['ea'].upper()
	effect = recode.get(effect, effect)
	beta = float(row['beta'])
	pval = float(row['p'])
	stderr = float(row['se'])
	n = int(row['n_offBW'])
	return [row['RSID'], chrom, position, freq, n, non_effect, effect, beta, stderr, pval]


def BW_maternal(row):
	"""Parse one birth-weight (maternal) record; the variant ID is in 'SNP'.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]. Alleles
	keep their case; an allele equal to 'R' is rewritten as 'D'.
	"""
	freq = float(row['eaf'])
	chrom = int(23 if row['chr'] == 'X' else row['chr'])
	position = int(row['pos'])
	non_effect, effect = row['nea'], row['ea']
	non_effect = 'D' if non_effect == 'R' else non_effect
	effect = 'D' if effect == 'R' else effect
	beta = float(row['beta'])
	pval = float(row['p'])
	stderr = float(row['se'])
	n = int(row['n'])
	return [row['SNP'], chrom, position, freq, n, non_effect, effect, beta, stderr, pval]

def BW_fetal(row):
	"""Parse one birth-weight (fetal) record; the variant ID is in 'rsid'.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]. Alleles
	keep their case; an allele equal to 'R' is rewritten as 'D'.
	"""
	freq = float(row['eaf'])
	chrom = int(23 if row['chr'] == 'X' else row['chr'])
	position = int(row['pos'])
	non_effect, effect = row['nea'], row['ea']
	non_effect = 'D' if non_effect == 'R' else non_effect
	effect = 'D' if effect == 'R' else effect
	beta = float(row['beta'])
	pval = float(row['p'])
	stderr = float(row['se'])
	n = int(row['n'])
	return [row['rsid'], chrom, position, freq, n, non_effect, effect, beta, stderr, pval]


def PCOS(row):
	"""Parse one PCOS (excluding 23andMe) record.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue] with an
	empty rsid; TOTALSAMPLESIZE is rounded to the nearest integer.
	"""
	freq = float(row['EAF'])
	chrom = int(23 if row['CHR'] == 'X' else row['CHR'])
	position = int(row['POS'])
	ref_allele = row['REF']
	eff_allele = row['EFF']
	beta = float(row['beta'])
	pval = float(row['pvalue'])
	stderr = float(row['se'])
	total_n = float(row['TOTALSAMPLESIZE'])
	n = int(round(total_n))
	return ['', chrom, position, freq, n, ref_allele, eff_allele, beta, stderr, pval]

def UKBB_traits(row):
	"""Parse one UKBB record whose 'variant' field is CHR:POS:REF:ALT.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue] with an
	empty rsid, or ten zeros for low-confidence variants and for rows with
	a non-numeric field. EAF is minor_AF when the minor allele is the
	effect allele and 1 - minor_AF otherwise.
	"""
	skipped = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	if row['low_confidence_variant'] == 'true':
		return skipped
	pieces = row['variant'].split(':')
	chrom = 23 if pieces[0] == 'X' else pieces[0]
	position = pieces[1]
	must_be_numeric = (row['minor_AF'], chrom, position, row['beta'], row['pval'], row['se'], row['n_complete_samples'])
	if any(not_number(value) for value in must_be_numeric):
		return skipped
	chrom = int(chrom)
	position = int(position)
	ref_allele = pieces[2]
	eff_allele = pieces[3]
	beta = float(row['beta'])
	pval = float(row['pval'])
	stderr = float(row['se'])
	n = int(row['n_complete_samples'])
	minor_freq = float(row['minor_AF'])
	freq = minor_freq if row['minor_allele'] == eff_allele else 1 - minor_freq
	return ['', chrom, position, freq, n, ref_allele, eff_allele, beta, stderr, pval]

def AP_repr(row):
	"""Parse one BOLT-LMM style record (A1 = effect allele, A2 = other allele).

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]; N is
	passed through exactly as read.
	"""
	freq = float(row['EAF'])
	chrom = int(23 if row['CHR'] == 'X' else row['CHR'])
	position = int(row['POS'])
	other_allele = row['A2']
	effect_allele = row['A1']
	beta = float(row['Beta'])
	pval = float(row['P'])
	stderr = float(row['se'])
	n = row['N']
	return [row['SNP'], chrom, position, freq, n, other_allele, effect_allele, beta, stderr, pval]


def POP(row):
	"""Parse one pelvic-organ-prolapse record.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue] with an
	empty rsid, or ten zeros when the chromosome is neither 'X' nor all
	digits, or when the minor allele frequency is below 0.005.
	"""
	skipped = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	# Recode 'X' before the digit check: checked afterwards it can never
	# match, because 'X' has already been rejected as non-numeric. A local
	# variable is used so the caller's row is left unmodified.
	chrom = row['CHR']
	if chrom == 'X':
		chrom = '23'
	if not chrom.isdigit():
		return skipped
	EAF= float(row['EAF'])
	# minor allele frequency is the smaller of EAF and 1 - EAF
	if min(EAF, 1 - EAF) < 0.005:
		return skipped
	CHR= int(chrom)
	POS= int(row['POS'])
	REF= row['REF']
	EFF= row['EFF']
	BETA= float(row['BETA'])
	pvalue= float(row['pvalue'])
	SE= float(row['SE'])
	N= float(row['N'])
	rsid= ''
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]

def fet_GA(row):
	"""Parse one fetal gestational-duration record.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]. The
	allele frequency is returned as an empty string and alleles are
	upper-cased. A chromosome of 'X' is overwritten with 23 in `row`.
	"""
	if row['Chr'] == 'X':
		row['Chr'] = 23
	chrom = int(row['Chr'])
	position = int(row['Pos'])
	other_allele = row['Non_effect_allele'].upper()
	effect_allele = row['Effect_allele'].upper()
	beta = float(row['Effect'])
	pval = float(row['P'])
	stderr = float(row['StdErr'])
	n = int(row['N'])
	return [row['Rsid'], chrom, position, '', n, other_allele, effect_allele, beta, stderr, pval]

def miscarriage(row):
	"""Define each header for Miscarriage.

	Reads Freq1, MarkerName ('CHR:POS:...'), Allele2, Allele1, Effect,
	P-value and StdErr from `row`.

	Returns [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue] with an
	empty rsid. EAF is passed through as read, alleles are upper-cased,
	chromosome 'X' is recoded as 23 and N is the fixed sum 49996 + 174109.
	"""
	freq= row['Freq1']
	chrom= row['MarkerName'].split(':')[0]
	if chrom== 'X':
		chrom= 23
	chrom= int(chrom)
	position= int(row['MarkerName'].split(':')[1])
	other_allele= row['Allele2'].upper()
	effect_allele= row['Allele1'].upper()
	effect= float(row['Effect'])
	p= float(row['P-value'])
	std_err= float(row['StdErr'])
	# Same sample size for every row.
	n_total= 49996 + 174109
	return ['', chrom, position, freq, n_total, other_allele, effect_allele, effect, std_err, p]


def format_list(input, output):
	"""Rewrite a gzipped summary-statistics file into a common tab-separated layout.

	input: path to a gzip-compressed, space- or tab-delimited text file with a
		header row (the delimiter is sniffed from the first line).
	output: path of the tab-separated file to write. Its columns are
		ID, rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue.

	Each row is translated by `select_format` for the phenotype named in
	`snakemake.wildcards.repr_pheno`; rows that come back with CHR == 0 are
	skipped. Multi-character alleles are recoded to 'I'/'D', and alleles are
	put in alphabetical order (flipping BETA and EAF when they are swapped) so
	that ID is always CHR:POS:<first allele>:<second allele>.
	"""
	with gzip.open(input, 'rt', newline='') as f:
		print(input)
		dialect = csv.Sniffer().sniff(f.readline(), delimiters= ' \t')
		f.seek(0)
		input_file= csv.DictReader(f, dialect= dialect)
		# Open the output once (newline='' as the csv module requires) and
		# stream rows into it instead of re-opening the file for every batch.
		with open(output, 'w', newline= '') as file_handler:
			writer= csv.writer(file_handler, delimiter= '\t')
			writer.writerow(['ID', 'rsid', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])
			for row in input_file:
				rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue= select_format(snakemake.wildcards.repr_pheno, row)
				if CHR== 0: continue
				# Collapse indels to I (insertion) / D (deletion) codes.
				if len(REF) >1: REF= 'I'
				if len(EFF) >1: EFF= 'I'
				if REF== 'I': EFF= 'D'
				if EFF== 'I': REF= 'D'
				if REF> EFF:
					# Alleles are swapped to alphabetical order, so the effect
					# direction and the effect allele frequency flip with them.
					ID= str(CHR) + ':' + str(POS) + ':' + EFF + ':' + REF
					BETA= -1 * float(BETA)
					ref= EFF
					eff= REF
					# Some formats (e.g. fet_GA) report no frequency and return
					# EAF as ''; leave it empty instead of failing on float('').
					if EAF!= '':
						EAF= 1 - float(EAF)
				else:
					ID= str(CHR) + ':' + str(POS) + ':' + REF + ':' + EFF
					BETA= float(BETA)
					eff= EFF
					ref= REF
				writer.writerow([ID, rsid, CHR, POS, EAF, N, ref, eff, BETA, SE, pvalue])


# Script entry point: convert the first Snakemake input into the first output.
format_list(snakemake.input[0], snakemake.output[0])

Python Pandas numpy scipy preeclampsia From line 1 of LDscore_reprpheno/format_sumstats.py

script:
	'format_sumstats.py'

SnakeMake From line 8 of LDscore_reprpheno/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d.columns= ['ID', 'SNP', 'CHR', 'POS', 'EAF', 'N', 'A2', 'A1', 'BETA', 'SE', 'pvalue']
	d.dropna(subset= ['pvalue'], axis= 0, inplace= True)

SnakeMake From line 21 of LDscore_reprpheno/Snakefile

        shell:
                """
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
		conda activate ldsc
                python2 /home/pol/software/ldsc/munge_sumstats.py \
                --out {params[0]} \
		--merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
                --sumstats {input[0]} \
                --chunksize 500000
                conda deactivate
                set -eu
                """

SnakeMake From line 53 of LDscore_reprpheno/Snakefile

	run:
                # Genetic correlation of the first input against the other inputs.
                # `allfiles`: comma-separated inputs whose path does not contain
                # wildcards.pheno (plain substring test).
                allfiles= [infile for infile in input if wildcards.pheno not in infile]
                allfiles= ','.join(allfiles)
                # Output prefix handed to --out.
                outfile= params[0] + wildcards.pheno + '_rg'
                infile= input[0]
                # The template below interpolates the local names infile, allfiles
                # and outfile; `set +eu` / `set -eu` bracket the conda activation.
                shell("""
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --rg {infile},{allfiles} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {outfile}
                conda deactivate
                set -eu
                """)

SnakeMake From line 77 of LDscore_reprpheno/Snakefile

run:
        # Keep only the results table of the log: the lines after
        # 'Summary of Genetic Correlation Results', minus the last three lines.
        header_line= 'Summary of Genetic Correlation Results\n'
        with open(input[0], 'r') as log:
                log_lines= log.readlines()
        table= log_lines[log_lines.index(header_line) + 1:-3]
        with open(output[0], 'w') as out:
                out.writelines(table)

SnakeMake From line 101 of LDscore_reprpheno/Snakefile

run:
	# Genetic correlation of the first input against the other inputs.
	# `allfiles`: comma-separated inputs whose path does not contain
	# wildcards.repr_pheno (plain substring test).
	allfiles= [infile for infile in input if wildcards.repr_pheno not in infile]
	allfiles= ','.join(allfiles)
	# Output prefix handed to --out.
	outfile= params[0] + wildcards.repr_pheno + '_rg'
	infile= input[0]
	# The template below interpolates the local names infile, allfiles and
	# outfile; `set +eu` / `set -eu` bracket the conda activation.
	shell("""
	set +eu
	source /home/pol/miniconda3/etc/profile.d/conda.sh
	conda activate ldsc
	python2 /home/pol/software/ldsc/ldsc.py \
	--rg {infile},{allfiles} \
	--ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
	--w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
	--out {outfile}
	conda deactivate
	set -eu
	""")

SnakeMake From line 117 of LDscore_reprpheno/Snakefile

run:
        # Keep only the results table of the log: the lines after
        # 'Summary of Genetic Correlation Results', minus the last three lines.
        header_line= 'Summary of Genetic Correlation Results\n'
        with open(input[0], 'r') as log:
                log_lines= log.readlines()
        table= log_lines[log_lines.index(header_line) + 1:-3]
        with open(output[0], 'w') as out:
                out.writelines(table)

SnakeMake From line 141 of LDscore_reprpheno/Snakefile

run:
	# Stack all whitespace-delimited input tables and write them as one TSV.
	tables= [pd.read_csv(path, delim_whitespace= True, header= 0) for path in input]
	d= pd.concat(tables)
	d.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 154 of LDscore_reprpheno/Snakefile

run:
        d= pd.read_csv(input[0], sep= '\t', header= 0)

SnakeMake From line 172 of LDscore_reprpheno/Snakefile

shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --out {params[0]} \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """

SnakeMake From line 211 of LDscore_reprpheno/Snakefile

run:
        # Genetic correlation of the first input against the other inputs.
        # `allfiles`: comma-separated inputs whose path does not contain
        # 'BW_maternal_effect' (plain substring test).
        allfiles= [infile for infile in input if 'BW_maternal_effect' not in infile]
        allfiles= ','.join(allfiles)
        # Output prefix handed to --out.
        outfile= params[0] + 'BW_maternal_effect_rg'
        infile= input[0]
        # The template below interpolates the local names infile, allfiles and
        # outfile; `set +eu` / `set -eu` bracket the conda activation.
        shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --rg {infile},{allfiles} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {outfile}
        conda deactivate
        set -eu
        """)

SnakeMake From line 234 of LDscore_reprpheno/Snakefile

	shell:
		"""
		set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {params[0]}
                conda deactivate
                set -eu
		"""

SnakeMake From line 261 of LDscore_reprpheno/Snakefile

run:
        # Keep only the results table of the log: the lines after
        # 'Summary of Genetic Correlation Results', minus the last three lines.
        header_line= 'Summary of Genetic Correlation Results\n'
        with open(input[0], 'r') as log:
                log_lines= log.readlines()
        table= log_lines[log_lines.index(header_line) + 1:-3]
        with open(output[0], 'w') as out:
                out.writelines(table)

SnakeMake From line 281 of LDscore_reprpheno/Snakefile

	shell:
		"""
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {params[0]}
                conda deactivate
                set -eu
                """

SnakeMake From line 297 of LDscore_reprpheno/Snakefile

	run:
                # Pull the 'Total Observed ...' line out of every input log and
                # tabulate its estimate and standard error per cohort.
                frames= list()
                for log_path in input:
                        with open(log_path, 'r') as log:
                                hits= [ln.strip() for ln in log if ln.startswith('Total Observed')]
                        summary= hits[0]
                        # Estimate is the fifth space-separated token; the
                        # standard error is the value inside the parentheses.
                        h2_est= float(summary.split(' ')[4])
                        h2_se= float(summary.split('(')[1].replace(')', ''))
                        # Cohort label comes from a fixed path component.
                        label= log_path.split('/')[10].replace('_h2.log', '')
                        frames.append(pd.DataFrame({'cohort': label, 'h2': h2_est, 'se': h2_se}, index= [0]))
                d= pd.concat(frames)
                d.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 317 of LDscore_reprpheno/Snakefile

run:
        d= pd.read_csv(input[0], sep= '\t', header= 0, compression= 'gzip', usecols= ['RSID', 'CHR', 'POS', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])
        d.columns= ['CHR', 'POS', 'A1', 'A2', 'N', 'BETA', 'SE', 'pvalue', 'SNP']
        d.dropna(axis= 0, inplace= True)

SnakeMake From line 17 of LDscore/Snakefile

        shell:
                """
		set +eu
		source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/munge_sumstats.py \
		--merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
                --out {params[0]} \
                --sumstats {input[0]} \
		--chunksize 500000
                conda deactivate
		set -eu
                """

SnakeMake From line 40 of LDscore/Snakefile

run:
	allfiles= [infile for infile in input if wildcards.pheno not in infile]
	allfiles= ','.join(allfiles)

SnakeMake From line 63 of LDscore/Snakefile

run:
	# Keep only the results table of the log: the lines after
	# 'Summary of Genetic Correlation Results', minus the last three lines.
	header_line= 'Summary of Genetic Correlation Results\n'
	with open(input[0], 'r') as log:
		log_lines= log.readlines()
	table= log_lines[log_lines.index(header_line) + 1:-3]
	with open(output[0], 'w') as out:
		out.writelines(table)

SnakeMake From line 88 of LDscore/Snakefile

	shell:
		'''
		set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
		python2 /home/pol/software/ldsc/ldsc.py \
		--h2 {input[0]}\
		--ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD. \
		--w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
		--overlap-annot\
		--frqfile-chr /home/pol/software/ldsc/baseline/1000G_Phase3_frq/1000G.EUR.QC.\
		--out {params[0]}
		conda deactivate
                set -eu
		'''

SnakeMake From line 104 of LDscore/Snakefile

        shell:
                '''
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
		cd /home/pol/software/ldsc/cts/
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2-cts {input[0]}\
                --ref-ld-chr-cts {params[1]} \
                --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
		--ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD.\
                --out {params[0]}
                conda deactivate
                set -eu
                '''

SnakeMake From line 130 of LDscore/Snakefile

	run:
                d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])[['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']]

SnakeMake From line 156 of LDscore/Snakefile

shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --out {params[0]} \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """

SnakeMake From line 185 of LDscore/Snakefile

	shell:
                """
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {params[0]}
                conda deactivate
                set -eu
                """

SnakeMake From line 209 of LDscore/Snakefile

run:
        # Pull the 'Total Observed ...' line out of every input log and
        # tabulate its estimate and standard error per cohort.
        frames= list()
        for log_path in input:
                with open(log_path, 'r') as log:
                        hits= [ln.strip() for ln in log if ln.startswith('Total Observed')]
                summary= hits[0]
                # Estimate is the fifth space-separated token; the standard
                # error is the value inside the parentheses.
                h2_est= float(summary.split(' ')[4])
                h2_se= float(summary.split('(')[1].replace(')', ''))
                # Cohort label comes from a fixed path component.
                label= log_path.split('/')[9].replace('_h2.log', '')
                frames.append(pd.DataFrame({'cohort': label, 'h2': h2_est, 'se': h2_se}, index= [0]))
        d= pd.concat(frames)
        d.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 229 of LDscore/Snakefile

	run:
                d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue'])[['SNP', 'CHR', 'POS', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']]
                d['SNP']= d.SNP.str.replace(':SNP', '')
                d['SNP']= d.SNP.str.replace(':INDEL', '')
                d['CHR']= d.CHR.apply(str)
                d.columns= ['ID', 'CHR', 'POS', 'N', 'A2', 'A1', 'BETA', 'SE', 'pvalue']
                d.dropna(axis= 0, inplace= True)
                d['CHR']= d.CHR.apply(str)

SnakeMake From line 252 of LDscore/Snakefile

shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --out {params[0]} \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """

SnakeMake From line 281 of LDscore/Snakefile

shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --h2 {input[0]} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {params[0]}
        conda deactivate
        set -eu
        """

SnakeMake From line 304 of LDscore/Snakefile

run:
	# Pull the 'Total Observed ...' line out of every input log and
	# tabulate its estimate and standard error per cohort.
	frames= list()
	for log_path in input:
		with open(log_path, 'r') as log:
			hits= [ln.strip() for ln in log if ln.startswith('Total Observed')]
		summary= hits[0]
		# Estimate is the fifth space-separated token; the standard
		# error is the value inside the parentheses.
		h2_est= float(summary.split(' ')[4])
		h2_se= float(summary.split('(')[1].replace(')', ''))
		# Cohort label comes from a fixed path component.
		label= log_path.split('/')[10].replace('_allPTD.log', '')
		frames.append(pd.DataFrame({'cohort': label, 'h2': h2_est, 'se': h2_se}, index= [0]))
	d= pd.concat(frames)
	d.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 324 of LDscore/Snakefile

	shell:
                """
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]} \
                --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
                --out {params[0]}
                conda deactivate
                set -eu
                """

SnakeMake From line 345 of LDscore/Snakefile

run:
	df_list= list()

SnakeMake From line 365 of LDscore/Snakefile

run:
	x= pd.read_csv(input[0], sep= '\t', header= 0)
	d= pd.read_csv(input[1], sep= '\t', header= 0)
	d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
	d[['CHR', 'POS', 'REF', 'EFF', 'INDELS']]= d['MarkerName'].str.split(':', expand= True)

SnakeMake From line 387 of LDscore/Snakefile

shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --out {params[0]} \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """

SnakeMake From line 416 of LDscore/Snakefile

run:
        # Genetic correlation of the first input against the other inputs.
        # `allfiles`: comma-separated inputs whose path does not contain
        # wildcards.PTD_metas (plain substring test).
        allfiles= [infile for infile in input if wildcards.PTD_metas not in infile]
        allfiles= ','.join(allfiles)
        # Output prefix handed to --out.
        outfile= params[0] + wildcards.PTD_metas + '_rg'
        infile= input[0]
        # The template below interpolates the local names infile, allfiles and
        # outfile; `set +eu` / `set -eu` bracket the conda activation.
        shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --rg {infile},{allfiles} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {outfile}
        conda deactivate
        set -eu
        """)

SnakeMake From line 440 of LDscore/Snakefile

run:
	# Merge the results tables of two logs into one file. The first log keeps
	# the line right after the 'Summary ...' marker; the second skips one more
	# line after the marker. The last three lines of each log are dropped.
	header_line= 'Summary of Genetic Correlation Results\n'
	for log_path, mode, skip in ((input[0], 'w', 1), (input[1], 'a', 2)):
		with open(log_path, 'r') as log:
			log_lines= log.readlines()
		table= log_lines[log_lines.index(header_line) + skip:-3]
		with open(output[0], mode) as out:
			out.writelines(table)

SnakeMake From line 464 of LDscore/Snakefile

run:
	# Genetic correlation of the first input against the other inputs.
	# `allfiles`: comma-separated inputs whose path does not contain
	# 'individual_cohorts/' + wildcards.big5 (plain substring test).
	allfiles= [infile for infile in input if 'individual_cohorts/' + wildcards.big5 not in infile]
	allfiles= ','.join(allfiles)
	print(allfiles)
	# NOTE(review): `outfile` is computed here but the template below writes to
	# {params[0]}, not {outfile} -- confirm which prefix is intended.
	outfile= input[0].replace('.txt.sumstats.gz', '_rg')
	infile= input[0]
	# The template interpolates the local names infile and allfiles;
	# `set +eu` / `set -eu` bracket the conda activation.
	shell("""
	set +eu
	source /home/pol/miniconda3/etc/profile.d/conda.sh
	conda activate ldsc
	python2 /home/pol/software/ldsc/ldsc.py \
	--rg {infile},{allfiles} \
	--ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
	--w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
	--out {params[0]}
	conda deactivate
	set -eu
	""")

SnakeMake From line 488 of LDscore/Snakefile

run:
	# Merge the results tables of every input log into one file. The first
	# log keeps the line right after the 'Summary ...' marker and creates the
	# output; later logs skip one more line and are appended. The last three
	# lines of each log are dropped.
	header_line= 'Summary of Genetic Correlation Results\n'
	for idx, log_path in enumerate(input):
		with open(log_path, 'r') as log:
			log_lines= log.readlines()
		skip, mode= (1, 'w') if idx== 0 else (2, 'a')
		table= log_lines[log_lines.index(header_line) + skip:-3]
		with open(output[0], mode) as out:
			out.writelines(table)

SnakeMake From line 513 of LDscore/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	d['Allele1']= d['Allele1'].str.upper()
	d['Allele2']= d['Allele2'].str.upper()
	d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
	d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
	d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
	d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
	d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d['pvalue']= d['pvalue'].astype(str).astype(float)
	d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
	d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
	d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
	rs= pd.read_csv(input[1], sep= '\t', header=0)
	rs.columns= ['ID', 'RSID']

SnakeMake From line 533 of LDscore/Snakefile

run:
        # input[0]: table providing CHR, SNP and BP; input[1]: summary statistics.
        x= pd.read_csv(input[0], sep= '\t', header= 0)
        d= pd.read_csv(input[1], sep= '\t', header= 0)
        # Chromosome X is recoded as 23 before the integer casts.
        d['CHR']= np.where(d['CHR']== 'X', '23', d['CHR'])
        d['POS']= d['POS'].astype(str).astype(int)
        d['CHR']= d['CHR'].astype(str).astype(int)
        d.dropna(axis= 0, inplace= True)
        # Keep only variants whose chromosome/position is present in x.
        positions= x[['CHR', 'SNP', 'BP']]
        d= pd.merge(d, positions, left_on= ['CHR', 'POS'], right_on= ['CHR', 'BP'])
        # Drop chromosome 6 positions strictly between 28477797 and 33448354.
        keep= (d.CHR!= 6) | (d.POS<= 28477797) | (d.POS>= 33448354)
        d= d.loc[keep, :]
        selected= ['CHR', 'POS', 'RSID', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
        d= d[selected]
        d.columns= ['CHR', 'POS', 'SNP', 'A1', 'A2', 'N', 'EAF', 'BETA', 'SE', 'pvalue']
        d= d.drop_duplicates(['CHR', 'POS', 'A1', 'A2'], keep= 'first')
        d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['SNP', 'CHR', 'POS', 'N', 'A2', 'A1', 'BETA', 'SE', 'pvalue'])

SnakeMake From line 564 of LDscore/Snakefile

shell:
        """
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/munge_sumstats.py \
        --merge-alleles /home/pol/software/ldsc/w_hm3.snplist \
        --out {params[0]} \
        --sumstats {input[0]} \
        --chunksize 500000
        conda deactivate
        set -eu
        """

SnakeMake From line 588 of LDscore/Snakefile

run:
        shell("""
        set +eu
        source /home/pol/miniconda3/etc/profile.d/conda.sh
        conda activate ldsc
        python2 /home/pol/software/ldsc/ldsc.py \
        --rg {input[0]},{input[1]} \
        --ref-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --w-ld-chr /home/pol/software/ldsc/eur_w_ld_chr/ \
        --out {params[0]}
        conda deactivate
        set -eu
        """)

SnakeMake From line 611 of LDscore/Snakefile

run:
	# Write one kbid list per cell type, then a de-duplicated list of all kbids.
	d= pd.read_table(input[0], sep= '\t', header= 0)
	# Keep the part of kbid before the first '.', and make cell-type names
	# usable in file names by replacing spaces with '-'.
	d['kbid']= d.kbid.str.split('.', expand= True)[0]
	d['Cell_type']= d.Cell_type.str.replace(' ', '-')
	cell_types= set(d.Cell_type)
	by_type= d[d['Cell_type'].isin(cell_types)].groupby('Cell_type')
	for cell_type, genes in by_type:
		genes.to_csv(params[0] + cell_type + '.txt', header= False, sep= '\t', columns= ['kbid'], index= False)
	d= d.drop_duplicates('kbid', keep= 'first')
	d.to_csv(output[-1], sep= '\t', header= False, index= False, columns= ['kbid'])

SnakeMake From line 637 of LDscore/Snakefile

run:
	shell("""
	set +eu

SnakeMake From line 655 of LDscore/Snakefile

run:
	shell("""
	set +eu

SnakeMake From line 681 of LDscore/Snakefile

	shell:
                '''
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2 {input[0]}\
                --ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD.,{params[1]}. \
                --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
                --overlap-annot \
                --frqfile-chr /home/pol/software/ldsc/baseline/1000G_Phase3_frq/1000G.EUR.QC. \
                --out {params[0]} \
		--thin-annot
                conda deactivate
                set -eu
                '''

SnakeMake From line 712 of LDscore/Snakefile

run:
	# Keep the 'L2_1' category row(s) of the results table, record how many
	# genes the cell-type gene list contains and label the row with the cell type.
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	# .copy() so the column assignments below act on an independent frame
	# rather than on a filtered view of the original table.
	d= d.loc[d.Category== 'L2_1', :].copy()
	x= pd.read_csv(input[1], sep= '\t', header= None, names= ['Gene'])
	d['n_genes']= x.shape[0]
	d['Category']= wildcards.cell_types
	d.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 736 of LDscore/Snakefile

shell:
	'''
	head -1 {input[0]} > {output[0]}
	tail -n +2 -q {input} >> {output[0]}
	'''

SnakeMake From line 750 of LDscore/Snakefile

run:
	shell("""

SnakeMake From line 766 of LDscore/Snakefile

	run:

                shell("""
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --l2 \
                --bfile {params[0]} \
                --ld-wind-cm 1 \
                --annot {input[0]} \
                --out {params[1]} \
                --print-snps {input[1]} \
                --thin-annot
                """)

SnakeMake From line 794 of LDscore/Snakefile

run:
        # 'overall' gets an empty (touched) output; every other cell type gets a
        # single headerless line: cell type, then the comma-joined params.
        if wildcards.cell_types== 'overall':
                open(output[0], 'a').close()
        else:
                entry= pd.DataFrame({'V1': [wildcards.cell_types], 'V2': ','.join(params)})
                entry.to_csv(output[0], sep= '\t', header= False, index= False)

SnakeMake From line 823 of LDscore/Snakefile

shell:
        'cat {input} > {output[0]}'

SnakeMake From line 836 of LDscore/Snakefile

        shell:
                '''
                set +eu
                source /home/pol/miniconda3/etc/profile.d/conda.sh
                conda activate ldsc
                python2 /home/pol/software/ldsc/ldsc.py \
                --h2-cts {input[0]}\
                --ref-ld-chr /home/pol/software/ldsc/baseline/baseline/baselineLD. \
		--ref-ld-chr-cts {input[1]} \
                --w-ld-chr /home/pol/software/ldsc/baseline/weights_hm3_no_hla/weights.\
                --overlap-annot \
                --frqfile-chr /home/pol/software/ldsc/baseline/1000G_Phase3_frq/1000G.EUR.QC. \
                --out {params[0]} \
                --thin-annot
                conda deactivate
                set -eu
                '''

SnakeMake From line 855 of LDscore/Snakefile

shell:
	'grep -v {wildcards.allPTD_coh} {input[0]} | sed -e "s/to_replace/{params[0]}/g" > {output[0]}'

SnakeMake From line 10 of LOCO_meta_allPTD/Snakefile

run:
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1', 'P-value'])

SnakeMake From line 38 of LOCO_meta_allPTD/Snakefile

shell:
        'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'

SnakeMake BEDTools From line 59 of LOCO_meta_allPTD/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header=0)
	d['Allele1']= d['Allele1'].str.upper()
	d['Allele2']= d['Allele2'].str.upper()
	d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
	d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal']]
	d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue', 'HetISq', 'HetPVal']
	d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
	d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d['pvalue']= d['pvalue'].astype(str).astype(float)
	d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
	d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
	d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]

SnakeMake From line 69 of LOCO_meta_allPTD/Snakefile

run:
        # Select independent top loci: genome-wide significant variants
        # (p < 5e-8), taken in order of increasing p-value, each one removing
        # all other candidates within 1.5 Mb on the same chromosome.
        d= pd.read_csv(input[0], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene', 'ID'])
        # .copy() so the in-place sort/de-duplication act on an independent
        # frame rather than on a filtered slice of d.
        df= d.loc[d.pvalue< 5*10**-8, :].copy()
        df.sort_values(by= 'pvalue', ascending= True, inplace= True)
        df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
        df_list= list()
        for chrom in set(df.CHR):
                d_temp= df.loc[df.CHR== chrom, :]
                positions= d_temp.POS.values
                for pos in positions:
                        # Positions already swallowed by a stronger neighbour are
                        # no longer in d_temp and are skipped.
                        if pos in d_temp.POS.values:
                                df_list.append(d_temp.loc[d_temp.POS== pos, :])
                                d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
        # With no significant variant df_list is empty and pd.concat would raise;
        # fall back to the (empty) frame so a header-only table is written.
        x= pd.concat(df_list) if df_list else df.copy()
        x['pos1']= x.POS - 1.5*10**6
        x['pos2']= x.POS + 1.5*10**6
        x['CHR']= x.CHR.astype(str)
        x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
        x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene', 'ID', 'pvalue'])

SnakeMake From line 98 of LOCO_meta_allPTD/Snakefile

shell:
	'grep -v {wildcards.GAraw_coh} {input[0]} | sed -e "s/to_replace/{params[0]}/g" > {output[0]}'

SnakeMake From line 10 of LOCO_meta/Snakefile

run:
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1', 'P-value'])

SnakeMake From line 38 of LOCO_meta/Snakefile

shell:
        'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'

SnakeMake BEDTools From line 59 of LOCO_meta/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header=0)
	d['Allele1']= d['Allele1'].str.upper()
	d['Allele2']= d['Allele2'].str.upper()
	d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
	d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value', 'HetISq', 'HetPVal']]
	d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue', 'HetISq', 'HetPVal']
	d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
	d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
	d['CHR']= d['CHR'].astype(str).astype(int)
	d['POS']= d['POS'].astype(str).astype(int)
	d['pvalue']= d['pvalue'].astype(str).astype(float)
	d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
	d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
	d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]

SnakeMake From line 69 of LOCO_meta/Snakefile

run:
        # Select independent top loci: genome-wide significant variants
        # (p < 5e-8), taken in order of increasing p-value, each one removing
        # all other candidates within 1.5 Mb on the same chromosome.
        d= pd.read_csv(input[0], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene', 'ID'])
        # .copy() so the in-place sort/de-duplication act on an independent
        # frame rather than on a filtered slice of d.
        df= d.loc[d.pvalue< 5*10**-8, :].copy()
        df.sort_values(by= 'pvalue', ascending= True, inplace= True)
        df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
        df_list= list()
        for chrom in set(df.CHR):
                d_temp= df.loc[df.CHR== chrom, :]
                positions= d_temp.POS.values
                for pos in positions:
                        # Positions already swallowed by a stronger neighbour are
                        # no longer in d_temp and are skipped.
                        if pos in d_temp.POS.values:
                                df_list.append(d_temp.loc[d_temp.POS== pos, :])
                                d_temp= d_temp.loc[(d_temp.POS < pos - (1.5*10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
        # With no significant variant df_list is empty and pd.concat would raise;
        # fall back to the (empty) frame so a header-only table is written.
        x= pd.concat(df_list) if df_list else df.copy()
        x['pos1']= x.POS - 1.5*10**6
        x['pos2']= x.POS + 1.5*10**6
        x['CHR']= x.CHR.astype(str)
        x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
        x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene', 'ID', 'pvalue'])

SnakeMake From line 98 of LOCO_meta/Snakefile

run:
	# Build a two-column (MarkerName, P-value) table with one row per
	# chromosome/position, keeping the smallest p-value at each position.
	d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS', 'pvalue', 'nearestGene'], compression= 'gzip')
	d.sort_values(['pvalue'], ascending=True, inplace= True)
	d.drop_duplicates(['CHR', 'POS'], inplace= True, keep= 'first')
	# Marker names have the form chr<CHR>:<POS>.
	d['MarkerName']= 'chr' + d.CHR.apply(str) + ':' + d.POS.apply(str)
	# Rename by column name: read_csv returns usecols in file order, so a
	# positional `d.columns= [...]` could mislabel pvalue/nearestGene.
	d.rename(columns= {'pvalue': 'P-value'}, inplace= True)
	d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['MarkerName', 'P-value'])

SnakeMake From line 11 of LocusZoom/Snakefile

run:
	df= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS', 'pvalue', 'nearestGene'], compression= 'gzip')
	df= df.loc[df.pvalue< 5*10**-8, :]
	df.sort_values(by= 'pvalue', ascending= True, inplace= True)
	df.drop_duplicates(subset= ['CHR', 'POS'], keep= 'first', inplace= True)
	df_list= list()
	for chrom in set(df.CHR):
		d_temp= df.loc[df.CHR== chrom, :]
		positions= d_temp.POS.values
		for pos in positions:
			if pos in d_temp.POS.values:
				df_list.append(d_temp.loc[d_temp.POS== pos, :])
				d_temp= d_temp.loc[(d_temp.POS < pos - (1.5 * 10**6)) | (d_temp.POS> pos + (1.5 * 10**6)), :]
			else:
				continue
	df= pd.concat(df_list)
	df['CHR']= df.CHR.astype(str)

SnakeMake From line 25 of LocusZoom/Snakefile

run:
	# One LocusZoom plot per row of the top-loci table (columns snp,
	# nearestGene and chr are read); the first page of each PDF is kept
	# under params[1].
	# exist_ok avoids the check-then-create race when jobs run in parallel.
	os.makedirs(params[1], exist_ok= True)
	df= pd.read_csv(input[0], sep= '\t', header= 0)
	for index, row in df.iterrows():
		# snp, title, infile and outfile are interpolated by the shell templates.
		snp= row['snp']
		title= '"' + row['nearestGene'] + '"'
		shell('python2 /home/pol/software/locuszoom/bin/locuszoom --metal {input[1]} --refsnp {snp} --flank 250kb --plotonly --no-date --build hg19 --pop EUR --source 1000G_March2012 --prefix {params[0]} title={title} theme=publication')
		outfile= params[1] + 'chr' + str(row['chr']) + '_' + row['nearestGene'] + '.pdf'
		# PDF path built from the prefix and the SNP name with ':' replaced by '_'.
		infile= params[0] + '_' + snp.replace(':', '_') + '.pdf'
		shell('qpdf --empty --pages {infile} 1 -- {outfile}; rm {infile}')

SnakeMake From line 57 of LocusZoom/Snakefile

run:
	# When the rule receives exactly one input file, copy it to the output unchanged.
	# No action for other input counts is visible in this excerpt.
	if len(input)== 1:
		shell('cp {input[0]} {output[0]}')

SnakeMake From line 80 of LocusZoom/Snakefile

run:
        # Load chromosome, position and p-value from the gzipped summary statistics.
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS', 'pvalue'], compression= 'gzip')

SnakeMake From line 96 of LocusZoom/Snakefile

run:
	# Draw a single LocusZoom regional plot around the hard-coded reference SNP
	# rs9823520 (titled "WNT4-GA"), after making sure the directory params[1] exists.
	if not os.path.exists(params[1]):
		os.makedirs(params[1])
	shell('python2 /home/pol/software/locuszoom/bin/locuszoom --metal {input[0]} --refsnp rs9823520 --flank 250kb --plotonly --no-date --build hg19 --pop EUR --source 1000G_March2012 --prefix {params[0]} title="WNT4-GA" theme=publication')

SnakeMake From line 126 of LocusZoom/Snakefile

# Run METAL with params[0] as its argument (presumably a METAL command script)
# and append the program's standard output to output[1].
shell:
	'/home/pol/software/generic-metal/metal {params[0]} >> {output[1]}'

SnakeMake From line 20 of meta/Snakefile

    # Run METAL three times, once per input (presumably METAL command scripts),
    # appending the standard output of every run to the same file, output[3].
    shell:
        '''
        /home/pol/software/generic-metal/metal {input[0]} >> {output[3]}
        /home/pol/software/generic-metal/metal {input[1]} >> {output[3]}
	/home/pol/software/generic-metal/metal {input[2]} >> {output[3]}
        '''

SnakeMake From line 37 of meta/Snakefile

# Run METAL twice, once per input (presumably METAL command scripts),
# appending the standard output of both runs to output[2].
shell:
    '''
    /home/pol/software/generic-metal/metal {input[0]} >> {output[2]}
    /home/pol/software/generic-metal/metal {input[1]} >> {output[2]}
    '''

SnakeMake From line 55 of meta/Snakefile

# Run METAL on input[0] (presumably a METAL command script) and append its standard output to output[1].
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'

SnakeMake From line 70 of meta/Snakefile

# Run METAL on input[0] (presumably a METAL command script) and append its standard output to output[1].
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'

SnakeMake From line 81 of meta/Snakefile

# Run METAL on input[0] (presumably a METAL command script) and append its standard output to output[1].
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'

SnakeMake From line 92 of meta/Snakefile

# Run METAL on input[0] (presumably a METAL command script) and append its standard output to output[1].
shell:
        '/home/pol/software/generic-metal/metal {input[0]} >> {output[1]}'

SnakeMake From line 104 of meta/Snakefile

# Run METAL on input[0] (presumably a METAL command script) and append its standard output to output[1].
shell:
    '''
    /home/pol/software/generic-metal/metal {input[0]} >> {output[1]}
    '''

SnakeMake From line 115 of meta/Snakefile

library(MendelianRandomization)
library(data.table)
library(dplyr)

# Load the exposure instruments. When the output path does not contain
# 'cluster', the five columns are (re)named here; otherwise the names read
# from the file are kept as they are.
d= fread(snakemake@input[[1]])
if (!grepl('cluster', snakemake@output[[1]])) {
  names(d)= c('ID', 'beta', 'se', 'pvalue', 'trait')
}

# Outcome summary statistics: keep the first record per ID, then keep only
# the variants present in both tables.
x= fread(snakemake@input[[2]]) %>% filter(!duplicated(ID))
d= inner_join(d, x, by= 'ID')



# Mendelian randomization for the instruments of a single trait.
#   temp_df: rows of one trait, with exposure columns `beta`/`se`, outcome
#            columns `BETA`/`SE`, and a `trait` column.
# Returns a data frame with columns method, estimate, se, lo95, up95, pvalue
# and trait: all methods of mr_allmethods() when there are more than three
# instruments, otherwise a single IVW row.
funk= function(temp_df){
  mr_data= mr_input(bx = temp_df$beta, bxse= temp_df$se, by = temp_df$BETA, byse = temp_df$SE)
  trait_name= unique(temp_df$trait)

  if (nrow(temp_df) <= 3) {
    ivw= mr_ivw(mr_data)
    return(data.frame(method= 'IVW', estimate= ivw$Estimate, se= ivw$StdError, lo95= ivw$CILower, up95= ivw$CIUpper, pvalue= ivw$Pvalue, trait= trait_name))
  }

  estimates= mr_allmethods(mr_data)$Values
  names(estimates)= c('method', 'estimate', 'se', 'lo95', 'up95', 'pvalue')
  estimates$trait= trait_name
  estimates
}


# Run the analysis separately for every trait, stack the per-trait tables
# and write the combined result as a tab-separated file.
mr= do.call('rbind', lapply(split(d, d$trait), funk))

fwrite(mr, snakemake@output[[1]], sep= '\t')

R dplyr data.table From line 1 of MR/MR_reproductive_traits.R

run:
	# Select genome-wide significant variants (p < 5e-8) whose allele frequency
	# in the second table lies within [0.01, 0.99], one row per ID.
	d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['CHR', 'POS', 'pvalue', 'ID'])
	x= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['ID', 'EAF'])
	x= x.loc[((x.EAF>=0.01) & (x.EAF<= 0.99)), :]
	d= d.loc[d.pvalue< 5e-8, :]
	d= d.loc[d.ID.isin(x.ID.values), :]
	d.drop_duplicates('ID', inplace= True)
	# Nothing left: just make sure the output file exists (opened in append mode and closed again).
	if d.shape[0] == 0: 
		open(output[0], 'a').close()
	else:

SnakeMake From line 16 of MR/Snakefile

run:
	# Rewrite a six-column variant table (read without header as CHR, SNP, x1,
	# POS, A1, A2) with position-based IDs, and list the IDs that occur more than once.
	d= pd.read_csv(input[0], sep= '\t', header= None, names= ['CHR', 'SNP', 'x1', 'POS', 'A1', 'A2'])
	# Recode alleles of unequal length to 'I'/'D'. The four statements are
	# order-dependent: each one reads the column values written by the previous ones.
	d['A1']= np.where(d.A1.str.len() > d.A2.str.len(), 'I', d.A1)
	d['A2']= np.where(d.A1.str.len() < d.A2.str.len(), 'I', d.A2)
	d['A1']= np.where(d.A2== 'I', 'D', d.A1)
	d['A2']= np.where(d.A1== 'I', 'D', d.A2)
	# ID is CHR:POS:allele:allele with the two alleles in alphabetical order.
	d['SNP']= np.where(d.A1>d.A2, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.A2 + ':' + d.A1, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.A1 + ':' + d.A2)
	d.to_csv(output[0], sep= '\t', header= False, index= False)
	# keep=False flags every member of a duplicated ID; each such ID is then written once.
	d= d[d.duplicated(['SNP'], keep= False)]
	d.drop_duplicates('SNP', inplace= True, keep= 'first')
	d.to_csv(output[1], sep='\t', columns= ['SNP'], index= False, header= False)

SnakeMake From line 40 of MR/Snakefile

run:
        # LD-clump the association results in input[0] with plink (r2 0.001, 1000 kb,
        # p1 5e-8, p2 1e-5), excluding the variants listed in input[1].
        # The trailing `|| true` makes the command succeed even when plink exits with an error.
        shell('~/software/plink --bim {input[2]} --bed {input[3]} --fam {input[4]} --clump {input[0]} --exclude {input[1]} --clump-r2 0.001 --clump-kb 1000 --clump-p1 5e-8 --clump-p2 1e-5 --out {params[1]} || true')

SnakeMake pLink From line 64 of MR/Snakefile

run:
	# Write the instruments (ID, BETA, SE, pvalue, trait) for one phenotype:
	# the rows of input[0] whose ID appears in the SNP column of input[1].
	# The output has no header row.
	if os.stat(input[1]).st_size == 0:
		# input[1] is empty: emit an empty output file.
		# The original wrote `.close` without parentheses, so the handle was never closed explicitly.
		open(output[0], "w").close()
	else:
		d= pd.read_csv(input[0], sep='\t', header= 0, usecols= ['ID', 'BETA', 'SE', 'pvalue'])
		x= pd.read_csv(input[1], delim_whitespace= True, header= 0)
		d= d.loc[d.ID.isin(x.SNP.values), :]
		# First row per ID only.
		d= d.groupby('ID').head(1)
		# Work on a copy so that adding `trait` does not write into a slice of the original frame.
		d= d[['ID', 'BETA', 'SE', 'pvalue']].copy()
		d['trait']= wildcards.repr_pheno
		d.to_csv(output[0], sep= '\t', header= False, index= False)

SnakeMake From line 75 of MR/Snakefile

# Concatenate all input files into output[0].
# NOTE(review): `cat` is given file operands and no `-`, so it does not read
# the header line piped in from `echo`; the output therefore presumably has no
# header row. Confirm this is intended before changing it.
shell:
	'echo -e "ID\tbeta\tse\tpvalue_exp\ttrait" | cat {input} > {output[0]}'

SnakeMake From line 93 of MR/Snakefile

# Delegate this rule to the R script MR_reproductive_traits.R.
script:
        'MR_reproductive_traits.R'

SnakeMake From line 104 of MR/Snakefile

run:
        # Build one instrument table (ID, beta, se, trait) from a signal table
        # (input[0]) that carries, per signal, its own weights plus weights for an
        # 'OKG' proxy and an 'HM' proxy, and attach genomic coordinates from input[1].
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        x= pd.read_csv(input[1], sep= '\t', header= 0, usecols= ['#CHROM', 'POS', 'ID', 'REF', 'ALT'])
        x.columns= ['CHR', 'POS', 'ID', 'REF', 'ALT']
        # Rows for which OKG_proxy names an actual proxy variant.
        df= d.loc[(d.OKG_proxy != 'Signal in 1KG') & (d.OKG_proxy != 'No 1KG proxy'), :]
        # Split those rows by whether both OKG alleles are single bases. Both masks
        # are built from `df`; the original mixed `df` and `d` in one expression and
        # only worked through implicit index alignment.
        okg= df.loc[(df.OKG_Other_allele.str.len() == 1) & (df.OKG_Trait_raising.str.len() == 1), :].copy()
        hm= df.loc[(df.OKG_Other_allele.str.len() != 1) | (df.OKG_Trait_raising.str.len() != 1), :].copy()
        # Rows without single-base OKG alleles take weights and alleles from the HM proxy.
        hm['beta']= hm.HM_Weight
        hm['ref']= hm.HM_Other_allele
        hm['eff']= hm.HM_Trait_raising
        hm['RSID']= hm.HM_proxy
        hm['se']= hm.HM_SE_weight
        hm= hm[['RSID', 'beta', 'se', 'ref', 'eff', 'Cluster']]
        # Rows with single-base OKG alleles take them from the OKG proxy.
        okg['beta']= okg.OKG_Weight
        okg['ref']= okg.OKG_Other_allele
        okg['eff']= okg.OKG_Trait_raising
        okg['RSID']= okg.OKG_proxy
        okg['se']= okg.OKG_SE_weight
        okg= okg[['RSID', 'beta', 'se', 'ref', 'eff', 'Cluster']]
        # Remaining rows use the signal variant itself. The copy avoids writing
        # new columns into a slice of the original frame.
        d= d.loc[(d.OKG_proxy == 'Signal in 1KG') | (d.OKG_proxy == 'No 1KG proxy'), :].copy()
        d['beta']= d.Weight
        d['ref']= d.Other_allele
        d['eff']= d.Trait_raising
        d['RSID']= d.Signal
        d['se']= d.SE_weight
        d= d[['RSID', 'beta', 'se', 'ref', 'eff', 'Cluster']]
        d= pd.concat([d, hm, okg])
        d= pd.merge(d, x, left_on= ['RSID'], right_on= 'ID')
        # Keep variants whose two alleles both occur among REF/ALT of the coordinate table.
        d= d.loc[(d.ALT== d.ref) | (d.REF== d.ref), :]
        d= d.loc[(d.ALT== d.eff) | (d.REF== d.eff), :]
        # Orient the effect to the alphabetically larger allele and build the
        # CHR:POS:allele:allele ID with the alleles in alphabetical order.
        d['beta']= np.where(d.ref > d.eff, -1 * d.beta, d.beta)
        d['ID']= np.where(d.ref > d.eff, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.eff + ':' + d.ref, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.ref + ':' + d.eff)
        d['trait']= np.where(d.Cluster== 'Female SHBG cluster', 'SHBG_fem_cluster', 'Testosterone_fem_cluster')
        d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['ID', 'beta', 'se', 'trait'])

SnakeMake From line 115 of MR/Snakefile

# Delegate this rule to the R script MR_reproductive_traits.R.
script:
        'MR_reproductive_traits.R'

SnakeMake From line 158 of MR/Snakefile

run:
	# Write a header-less four-column range file (CHR, POS, POS, ID) for the
	# instruments of the three female hormone traits, one row per ID, with
	# chromosome X coded as 23.
	hormone_traits= ['SHBG_fem', 'Testosterone_fem', 'CBAT_fem']
	x= pd.read_csv(input[0], sep= '\t', header= None, names= ['ID', 'beta', 'se', 'pvalue', 'trait'])
	x= x.loc[x.trait.isin(hormone_traits), :]
	x= x.drop_duplicates(subset= 'ID')
	# IDs have the form CHR:POS:REF:EFF.
	x[['CHR', 'POS', 'REF', 'EFF']]= x.ID.str.split(':', expand= True)
	x['CHR']= np.where(x.CHR== 'X', '23', x.CHR)
	x.to_csv(output[0], sep= '\t', header= False, index= False, columns= ['CHR', 'POS', 'POS', 'ID'])

SnakeMake From line 168 of MR/Snakefile

# Subset the plink fileset params[0] to the ranges in input[0] (passed via
# `--extract bed1`) and write a new binary fileset with prefix params[1].
shell:
	'~/software/plink2 --bfile {params[0]} --extract bed1 {input[0]} --memory 5000 --threads {threads} --make-bed --out {params[1]}'

SnakeMake plink2 From line 186 of MR/Snakefile

run:
        # Rewrite a six-column variant table (read without header as CHR, SNP, x1,
        # POS, A1, A2) with position-based IDs, list the IDs that occur more than
        # once, and move the two companion input files to their output names.
        d= pd.read_csv(input[0], sep= '\t', header= None, names= ['CHR', 'SNP', 'x1', 'POS', 'A1', 'A2'])
        # Recode alleles of unequal length to 'I'/'D'. The four statements are
        # order-dependent: each one reads the column values written by the previous ones.
        d['A1']= np.where(d.A1.str.len() > d.A2.str.len(), 'I', d.A1)
        d['A2']= np.where(d.A1.str.len() < d.A2.str.len(), 'I', d.A2)
        d['A1']= np.where(d.A2== 'I', 'D', d.A1)
        d['A2']= np.where(d.A1== 'I', 'D', d.A2)
        d['CHR']= d.CHR.apply(str)
        d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
        # ID is CHR:POS:allele:allele with the two alleles in alphabetical order.
        d['SNP']= np.where(d.A1>d.A2, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.A2 + ':' + d.A1, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.A1 + ':' + d.A2)
        d.to_csv(output[0], sep= '\t', header= False, index= False)
        d= d[d.duplicated(['SNP'], keep= False)]
        d.drop_duplicates('SNP', inplace= True, keep= 'first')
        # Write a plain one-ID-per-line list, matching the sibling rule that
        # produces the same kind of duplicate list; the original call also
        # emitted the DataFrame index and a 'SNP' header line.
        d.to_csv(output[3], sep='\t', columns= ['SNP'], index= False, header= False)
        shell('mv {input[1]} {output[1]}')
        shell('mv {input[2]} {output[2]}')

SnakeMake From line 200 of MR/Snakefile

# Compute the square matrix of pairwise r values (`--r square`) for the plink
# fileset params[0], writing results with prefix params[1].
shell:
        '~/software/plink --bfile {params[0]} --r square --out {params[1]}'

SnakeMake pLink From line 225 of MR/Snakefile

# Delegate this rule to the R script MVMR.R.
script:
	'MVMR.R'

SnakeMake From line 240 of MR/Snakefile

library(data.table)
library(dplyr)

# Reference table; it is joined to the cohort file by `ID` below and is
# expected to supply the reference allele frequency column `eaf`.
hrc= fread(snakemake@input[[1]])

# Filter and harmonise one cohort summary-statistics file and write the result
# to snakemake@output[[1]].
#   infile: path of the summary-statistics file to read.
funk= function(infile){
d= fread(infile)

print(paste('Filtering file: ', infile))

d= arrange(d, CHR, POS, EFF, REF)

# Keep p-values strictly between 0 and 1.
d= filter(d, pvalue< 1, pvalue>0)

# Two-sided normal p-value recomputed from BETA / SE.
d$pval= pnorm(-abs(d$BETA / d$SE)) * 2

# Keep rows where reported and recomputed -log10(p) differ by at most 10 %
# (relative to the recomputed value).
d= filter(d, (abs(-log10(pvalue) - -log10(pval)) / -log10(pval)) * 100 <= 10)

# ID is CHR:POS:allele:allele with the two alleles in alphabetical order.
d$ID= with(d, ifelse(REF> EFF, paste(CHR, POS, EFF, REF, sep= ':'), paste(CHR, POS, REF, EFF, sep= ':')))

# Label the variant as INDEL when its ID contains the letter 'I', otherwise as SNP.
d$SNP= with(d, ifelse(grepl('I', ID), paste(ID, 'INDEL', sep= ':'), paste(ID, 'SNP', sep= ':')))

# str() prints the structure itself and returns NULL, so this also prints "NULL".
print(str(d))

d= inner_join(d, hrc, by= 'ID')
# Fall back to the reference frequency when the cohort frequency is missing.
d$EAF= ifelse(is.na(d$EAF), d$eaf, d$EAF)

# Where REF sorts after EFF, flip the effect sign and the allele frequency ...
d$BETA= ifelse(d$REF> d$EFF, -1 * d$BETA, d$BETA)
d$EAF= ifelse(d$REF> d$EFF, 1 - d$EAF, d$EAF)

# ... and swap the two allele columns so that REF <= EFF alphabetically.
d[d$REF>d$EFF, c("REF", "EFF")]= d[d$REF > d$EFF, c("EFF", "REF")]

d$MAF= ifelse(d$EAF>0.5, 1- d$EAF, d$EAF)
d= filter(d, MAF>= 0.005)

# Repeats the p-value and MAF conditions applied above and additionally requires SE > 0.
d= filter(d, pvalue>0, pvalue<1, MAF>=0.005, SE>0)

# Require MAF * 2 * N above 6.
d= filter(d, (MAF * 2 * N) > 6)

# Minor allele frequency according to the reference table.
d$maf= ifelse(d$eaf> 0.5, 1 - d$eaf, d$eaf)

# Drop variants whose cohort MAF differs from the reference MAF by 0.2 or more.
d= filter(d, abs(maf - MAF) < 0.2)

# For three specific phenotype/cohort files (matched on the path), variants
# whose EAF differs from the reference eaf by more than 0.2 get their EAF and
# BETA flipped. The EAF line runs first, so the BETA condition is evaluated
# against the already updated EAF.
if (grepl('GAraw/Viva', infile)){

d$EAF= with(d, ifelse(abs(eaf - EAF)> 0.2, 1 - EAF, EAF))
d$BETA= with(d, ifelse(abs(eaf - EAF)> 0.2, -1 * BETA, BETA))
}

if (grepl('GAnrm/Viva', infile)){

d$EAF= with(d, ifelse(abs(eaf - EAF)> 0.2, 1 - EAF, EAF))
d$BETA= with(d, ifelse(abs(eaf - EAF)> 0.2, -1 * BETA, BETA))
}

if (grepl('postTerm/HUNT', infile)){

d$EAF= with(d, ifelse(abs(eaf - EAF)> 0.2, 1 - EAF, EAF))
d$BETA= with(d, ifelse(abs(eaf - EAF)> 0.2, -1 * BETA, BETA))
}

# One row per ID, keeping the one with the smallest p-value.
d= arrange(d, pvalue)
d= filter(d, !duplicated(ID))

d= select(d, -c(MAF, ID, eaf, pval))

# Row count after filtering; not used further inside this function.
x2= nrow(d)

d$STRAND= '+'

#outfile= paste0(snakemake@params[[1]], gsub('_temp.txt', '', unlist(strsplit(infile, '/'))[9]), '.txt')

fwrite(d, snakemake@output[[1]], sep= '\t')
}

#input_files= snakemake@input[grepl('sumstats', snakemake@input)]

# Apply the filter to the file(s) in the second input. Every call writes to
# the same output path, so this presumably receives a single file -- confirm.
lapply(snakemake@input[[2]], funk)

R dplyr data.table From line 1 of munge_stats/filter_SNPs.R

run:
    # Pass the first input and first output path to format_list(), a helper not shown in this excerpt.
    format_list(input[0], output[0])

SnakeMake From line 96 of munge_stats/Snakefile

run:
	# Move every input file to params[0] + <name> + '_temp.txt', where <name> is
	# the last '/'-separated component of the path text before its first '-'.
	for infile in input:
		outfile= params[0] + infile.split('-')[0].split('/')[-1] + '_temp.txt'
		shell('mv {infile} {outfile}')

SnakeMake From line 107 of munge_stats/Snakefile

run:
    # Pass the first input and first output path to format_list(), a helper not shown in this excerpt.
    format_list(input[0], output[0])

SnakeMake From line 118 of munge_stats/Snakefile

run:
        # Move every input file to params[0] + <name> + '_temp.txt', where <name> is
        # the last '/'-separated component of the path text before its first '-'.
        for infile in input:
                outfile= params[0] + infile.split('-')[0].split('/')[-1] + '_temp.txt'
                shell('mv {infile} {outfile}')

SnakeMake From line 129 of munge_stats/Snakefile

run:
    # Pass the first input and first output path to format_list(), a helper not shown in this excerpt.
    format_list(input[0], output[0])

SnakeMake From line 140 of munge_stats/Snakefile

run:
        # Move every input file to params[0] + <name> + '_temp.txt', where <name> is
        # the last '/'-separated component of the path text before its first '-'.
        for infile in input:
                outfile= params[0] + infile.split('-')[0].split('/')[-1] + '_temp.txt'
                shell('mv {infile} {outfile}')

SnakeMake From line 151 of munge_stats/Snakefile

run:
    # Pass the first input and first output path to format_list(), a helper not shown in this excerpt.
    format_list(input[0], output[0])

SnakeMake From line 163 of munge_stats/Snakefile

run:
        # Move every input file to params[0] + <name> + '_temp.txt', where <name> is
        # the last '/'-separated component of the path text before its first '-'.
        for infile in input:
                outfile= params[0] + infile.split('-')[0].split('/')[-1] + '_temp.txt'
                shell('mv {infile} {outfile}')

SnakeMake From line 174 of munge_stats/Snakefile

run:
	# Harmonise two allele-frequency tables: `d` (input[0], frequency column
	# AF_EXCLUDING_1000G) and `KG` (gzipped input[1]).
	d= pd.read_csv(input[0], header= 0, sep= '\t', usecols= ['#CHROM', 'POS', 'REF', 'ALT', 'AF_EXCLUDING_1000G'])
	d.columns= ['CHR', 'POS', 'REF', 'ALT', 'eaf']
	d['CHR']= np.where(d.CHR=='X', '23', d.CHR)
	# header=0 together with names= replaces the file's own header row with these names.
	KG= pd.read_csv(input[1], header= 0, sep='\t', compression= 'gzip', names= ['ID', 'ALT', 'REF', 'eaf'])
	KG['ID']= KG['ID'].str.replace(':ID', '')
	KG['ID']= KG['ID'].str.replace('X', '23')
	# Flip the frequency where REF sorts after ALT; this is done on the original
	# allele strings, before they are recoded to I/D below.
	d['eaf']= np.where(d['REF']> d['ALT'], 1 - d.eaf, d.eaf)
	KG['eaf']= np.where(KG['REF']> KG['ALT'], 1- KG.eaf, KG.eaf)
	# Recode alleles of unequal length to 'I'/'D'. The statements are
	# order-dependent: each one reads the column values written by the previous ones.
	d['REF']= np.where(d.REF.str.len() > d.ALT.str.len(), 'I', d.REF)
	d['ALT']= np.where(d.REF.str.len()< d.ALT.str.len(), 'I', d.ALT)
	d['REF']= np.where(d.ALT== 'I', 'D', d.REF)
	d['ALT']= np.where(d.REF== 'I', 'D', d.ALT)
	KG['REF']= np.where(KG.REF.str.len() > KG.ALT.str.len(), 'I', KG.REF)

SnakeMake From line 186 of munge_stats/Snakefile

# Delegate this rule to the R script filter_SNPs.R.
script:
        'filter_SNPs.R'

SnakeMake From line 229 of munge_stats/Snakefile

library(data.table)
library(dplyr)

# Filter one cohort file of the non-additive analysis against the reference
# table and write the surviving rows to the first output.
d= fread(snakemake@input[[1]])

# Row count before any filtering.
x1= nrow(d)

hrc= fread(snakemake@input[[2]], header=T)

d= d %>%
  arrange(CHR, POS, EFF, REF) %>%
  inner_join(hrc, by= 'ID')
rm(hrc)

# Fall back to the reference frequency when the cohort frequency is missing.
d$EAF= ifelse(is.na(d$EAF), d$eaf, d$EAF)

# Swap the allele columns where REF sorts after EFF.
d[d$REF>d$EFF, c("REF", "EFF")]= d[d$REF > d$EFF, c("EFF", "REF")]

# Cohort MAF, reference maf and a numeric p-value; all are row-wise, so they
# can be computed up front and the filters applied in a single call.
d$MAF= ifelse(d$EAF>0.5, 1- d$EAF, d$EAF)
d$maf= ifelse(d$eaf> 0.5, 1 - d$eaf, d$eaf)
d$P= as.numeric(d$P)

d= filter(
  d,
  MAF>0.005,
  (MAF * 2 * N) > 6,
  P<1, P>0,
  abs(MAF - maf) < 0.2
)

d= select(d, -c(maf, MAF, eaf))

# Row count after filtering.
x2= nrow(d)

write.table(d, snakemake@output[[1]], col.names= T, row.names=F, sep= '\t', quote= F)

# Cohort label: second '_'-separated piece of the 10th '/'-separated component
# of the input path (this depends on the directory depth of the input).
cohort= unlist(strsplit(unlist(strsplit(snakemake@input[[1]], '/'))[[10]], '_'))[2]
# One tab-separated log line: cohort, rows before filtering, rows after filtering.
# cat() joins its arguments with a space by default, so the original
# cat(c(cohort, '\t', x1, ...)) surrounded every tab with stray spaces;
# build the line with an explicit tab separator instead.
cat(paste(cohort, x1, x2, sep= '\t'), '\n', sep= '', file= snakemake@output[[2]])

R dplyr data.table From line 1 of nonadditive/filter_SNPs.R

import pandas as pd
import numpy as np
import re

#d= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)

#d['Allele1']= d['Allele1'].str.upper()
#d['Allele2']= d['Allele2'].str.upper()
#d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
#d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
#d['CHR']= d['CHR'].astype(str).astype(int)
#d['POS']= d['POS'].astype(str).astype(int)
#d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
#d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
#d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
#d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)

#d['CHR']= d['CHR'].astype(str).astype(int)
#d['POS']= d['POS'].astype(str).astype(int)

#d['pvalue']= d['pvalue'].astype(str).astype(float)

#d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
#d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF

#d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]

col_list= ['IMPACT', 'DISTANCE', 'SYMBOL', 'SYMBOL_SOURCE', 'BIOTYPE']
df_list= list()


def _parse_extra(extra):
	"""Parse a VEP `Extra` string ('KEY=value;KEY=value;...') into a dict.

	Entries are separated at every ';' that is followed by a word character;
	pieces without '=' are ignored.
	"""
	return dict(x.split('=', 1) for x in re.split(r';(?=\w)', extra) if '=' in x)


# Read the VEP output in chunks, keep one annotation per variant in every
# chunk (protein-coding first, pseudogenes last), and collect the chunks.
for vep in pd.read_csv(snakemake.input[1], sep= '\t', header= None, names= ['Variation', 'Location', 'Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'Extra'], comment= '#', chunksize= 100000):
	# Parse `Extra` once per row and look keys up in the resulting dict. The
	# original re-parsed the string for every column and decided on a substring
	# test (`i in y`), which raises KeyError when the name only occurs inside
	# another key or a value.
	extra= vep['Extra'].apply(_parse_extra)
	for i in col_list:
		vep[i]= extra.apply(lambda kv: kv.get(i, ''))
	vep= vep[['Variation', 'Location', 'Existing_variation', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']].copy()
	vep.columns= ['ID', 'Location', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']
	# Ranking used when de-duplicating: 0 protein coding, 2 pseudogene, 1 anything else.
	vep['BIOTYPE1']= np.where(vep.BIOTYPE== 'protein_coding', 0, np.where(vep.BIOTYPE.str.contains('pseudo'), 2, 1))
	vep['DISTANCE']= np.where(vep.DISTANCE== '', 0, vep.DISTANCE)
	# The variant name has the form chr_pos_allele/allele.
	vep[['chr', 'pos', 'All']]= vep.ID.str.split('_', expand= True)
	vep[['EFF', 'REF']]= vep.All.str.split('/', expand= True)
	# Put the two alleles in alphabetical order (REF <= EFF).
	vep.loc[vep.REF > vep.EFF, ['REF', 'EFF']] = vep.loc[vep.REF > vep.EFF, ['EFF', 'REF']].values
	vep[['CHR', 'POS']]= vep['Location'].str.split(':', expand= True)
	vep['CHR']= np.where(vep['CHR']== 'X', '23', vep['CHR'])
	vep['ID']= vep.CHR.astype(int).astype(str) + ':' + vep.POS.astype(int).astype(str) + ':' + vep.REF + ':' + vep.EFF
	vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE', 'BIOTYPE1']].copy()
	vep.sort_values(by= ['BIOTYPE1'], ascending= True, inplace= True)
	vep.drop_duplicates(subset= ['ID'], keep= 'first', inplace= True)
	df_list.append(vep)

vep= pd.concat(df_list)

# A variant can appear in more than one chunk, so de-duplicate again across chunks.
vep.sort_values(by= ['BIOTYPE1'], ascending= True, inplace= True)
vep.drop_duplicates(subset= ['ID'], keep= 'first', inplace= True)
vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE']]


# Load the meta-analysis table, harmonise alleles and IDs, keep common
# variants (MAF >= 0.1) and attach the VEP annotation.
d= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)
for allele_col in ('Allele1', 'Allele2'):
	d[allele_col]= d[allele_col].str.upper()

# Keep variants seen in more than half of the maximum sample size and in more than 66106 samples.
n_total= d['TOTALSAMPLESIZE']
d= d.loc[(n_total > n_total.max() / 2) & (n_total > 66106), :]

# MarkerName has the form CHR:POS:REF:EFF; only CHR and POS are kept from it.
d[['CHR', 'POS', 'REF','EFF']]= d['MarkerName'].str.split(':', expand= True)
for coord in ('CHR', 'POS'):
	d[coord]= d[coord].astype(str).astype(int)

d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'P-value']]
d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'pvalue']

# Where REF sorts after EFF, flip the frequency and swap the allele columns.
d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)
d['pvalue']= d['pvalue'].astype(str).astype(float)
d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF

d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
d['MAF']= np.where(d.EAF>0.5, 1 - d.EAF, d.EAF)
d= d.loc[d.MAF>= 0.1, :]

d= pd.merge(d, vep, on= ['ID'], how= 'left')
d.to_csv(snakemake.output[0], header=True, index= False, sep= '\t')

Python Pandas numpy Variant Effect Predictor (VEP) From line 1 of nonadditive/format_VEP.py

run:
	# Split one cohort's non-additive results into a dominant-model table
	# (output[0]) and a recessive-model table (output[1]).
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	# Drop the data column labelled 'INFO' and shift the remaining header names
	# one position to the left (presumably the input header is offset by one
	# column relative to the data -- confirm).
	dcols= d.columns.values[1:]
	# `axis` is given by keyword: the positional form was removed in pandas 2.0.
	d.drop('INFO', axis= 1, inplace= True)
	d.columns= dcols
	d= d.loc[d.INFO>= 0.4, :].copy()
	d['MAF']=  np.where(d.EAF_CONTR> 0.5, 1- d.EAF_CONTR, d.EAF_CONTR)
	d= d.loc[d.MAF * 2 * d.N >6, :].copy()
	d.drop('MAF', axis= 1, inplace= True)
	# Recode alleles of unequal length to 'I'/'D'. The four statements are
	# order-dependent: each one reads the column values written by the previous ones.
	d['REF_ALLELE']= np.where(d.REF_ALLELE.str.len()> d.EFF_ALLELE.str.len(), 'I', d.REF_ALLELE)
	d['EFF_ALLELE']= np.where(d.REF_ALLELE.str.len()< d.EFF_ALLELE.str.len(), 'I', d.EFF_ALLELE)
	d['REF_ALLELE']= np.where(d.EFF_ALLELE== 'I', 'D', d.REF_ALLELE)
	d['EFF_ALLELE']= np.where(d.REF_ALLELE== 'I', 'D', d.EFF_ALLELE)
	d['CHR']= d.CHR.apply(str)
	d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
	# ID is CHR:POS:allele:allele with the two alleles in alphabetical order.
	d['ID']= np.where(d.REF_ALLELE> d.EFF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF_ALLELE + ':' + d.REF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF_ALLELE + ':' + d.EFF_ALLELE)
	d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO']]
	df= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_REC', 'INFO']]
	d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM','INFO']]
	# '.' marks a missing p-value. Copies are taken so the numeric conversion
	# below does not write into slices of the same parent frame.
	d= d.loc[(d.P_VAL_DOM!= '.' ), :].copy()
	df= df.loc[(df.P_VAL_REC!= '.'), :].copy()
	d[['BETA_ADD', 'P_VAL_DOM']]= d[['BETA_ADD', 'P_VAL_DOM']].apply(pd.to_numeric, errors= 'coerce')
	df[['BETA_ADD', 'P_VAL_REC']]= df[['BETA_ADD', 'P_VAL_REC']].apply(pd.to_numeric, errors= 'coerce')
	d.dropna(axis= 0, inplace= True)
	df.dropna(axis= 0, inplace= True)
	d.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO']
	d.to_csv(output[0], sep= '\t', header= True, index= False)
	df.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO']
	df.to_csv(output[1], sep= '\t', header= True, index= False)

SnakeMake From line 13 of nonadditive/Snakefile

run:
	# For every input table: keep rows with INFO >= 0.4 and compute the minor
	# allele frequency from EAF. The excerpt ends here.
	for i in range(len(input)):
		d= pd.read_csv(input[i], sep= '\t', header= 0)
		d= d.loc[d.INFO>= 0.4, :]
		d['MAF']=  np.where(d.EAF> 0.5, 1- d.EAF, d.EAF)

SnakeMake From line 56 of nonadditive/Snakefile

run:
	# For every input file, write a dominant-model table to output[i] and a
	# recessive-model table to the same path with 'dom' replaced by 'rec'.
	for i in range(len(input)):
		print(input[i])
		d= pd.read_csv(input[i], header= 0, delim_whitespace= True)
		d[['BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO', 'EAF_CONTR']]= d[['BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO', 'EAF_CONTR']].apply(pd.to_numeric, errors= 'coerce')
		d= d.loc[d.INFO>= 0.4, :].copy()
		d['MAF']=  np.where(d.EAF_CONTR> 0.5, 1- d.EAF_CONTR, d.EAF_CONTR)
		d= d.loc[d.MAF * 2 * d.N >6, :].copy()
		# `axis` is given by keyword: the positional form was removed in pandas 2.0.
		d.drop('MAF', axis= 1, inplace= True)
		# Recode alleles of unequal length to 'I'/'D'. The comparison must be on the
		# per-row string lengths (`.str.len()`): the original used `len(series)`,
		# i.e. the number of rows, which is equal on both sides, so no allele was
		# ever recoded. The four statements are order-dependent.
		d['REF_ALLELE']= np.where(d.REF_ALLELE.str.len()> d.EFF_ALLELE.str.len(), 'I', d.REF_ALLELE)
		d['EFF_ALLELE']= np.where(d.REF_ALLELE.str.len()< d.EFF_ALLELE.str.len(), 'I', d.EFF_ALLELE)
		d['REF_ALLELE']= np.where(d.EFF_ALLELE== 'I', 'D', d.REF_ALLELE)
		d['EFF_ALLELE']= np.where(d.REF_ALLELE== 'I', 'D', d.EFF_ALLELE)
		d['CHR']= d.CHR.apply(str)
		d['CHR']= np.where(d.CHR== '0X', 'X', d.CHR)
		d['CHR']= np.where(d.CHR== 'X', '23', d.CHR)
		# ID is CHR:POS:allele:allele with the two alleles in alphabetical order.
		d['ID']= np.where(d.REF_ALLELE> d.EFF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF_ALLELE + ':' + d.REF_ALLELE, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF_ALLELE + ':' + d.EFF_ALLELE)
		d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM', 'P_VAL_REC', 'INFO']]
		df= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_REC', 'INFO']]
		d= d[['ID', 'CHR', 'POS', 'EFF_ALLELE', 'REF_ALLELE', 'N', 'EAF_CONTR', 'BETA_ADD', 'P_VAL_DOM','INFO']]
		# Copies are taken so the in-place dropna below does not act on slices of one parent frame.
		d= d.loc[(d.P_VAL_DOM!= '.' ), :].copy()
		df= df.loc[(df.P_VAL_REC!= '.'), :].copy()
		d.dropna(axis= 0, inplace= True)
		df.dropna(axis= 0, inplace= True)
		d.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO']
		d.to_csv(output[i], sep= '\t', header= True, index= False)
		df.columns= ['ID', 'CHR', 'POS', 'EFF', 'REF', 'N', 'EAF', 'BETA', 'P', 'INFO']
		out= output[i].replace('dom', 'rec')
		df.to_csv(out, sep= '\t', header= True, index= False)

SnakeMake From line 105 of nonadditive/Snakefile

# Delegate this rule to the R script filter_SNPs.R.
script:
	'filter_SNPs.R'

SnakeMake From line 145 of nonadditive/Snakefile

# Concatenate all input files, in the order given, into output[0].
shell:
	'cat {input} > {output[0]}'

SnakeMake From line 154 of nonadditive/Snakefile

# Run METAL on input[0] (presumably a METAL command script) and append its standard output to output[1].
shell:
    '''
    /home/pol/software/generic-metal/metal {input[0]} >> {output[1]}
    '''

SnakeMake From line 165 of nonadditive/Snakefile

run:
        # Split MarkerName (CHR:POS:REF:EFF) into four separate columns. The excerpt ends here.
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        d[['CHR', 'POS', 'REF', 'EFF']]= d['MarkerName'].str.split(':', expand= True)

SnakeMake From line 176 of nonadditive/Snakefile

# Annotate the variants in input[0] with Ensembl VEP, run from the local cache
# in offline mode, requesting gene symbols, biotypes and existing-variant
# lookups; output[0] is overwritten if it already exists.
shell:
        '/home/pol/software/ensembl-vep/vep -i {input[0]} --check_existing --symbol --biotype --cache -O {output[0]} --offline --force_overwrite'

SnakeMake Variant Effect Predictor (VEP) From line 194 of nonadditive/Snakefile

# Delegate this rule to the Python script format_VEP.py.
script:
	'format_VEP.py'

SnakeMake From line 205 of nonadditive/Snakefile

run:
        # Write a header-less four-column range file (CHR, start, end, MarkerName),
        # sorted by chromosome and start, with one single-base interval
        # (start = position - 1, end = position) per marker.
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1'])
        # MarkerName starts with CHR:POS.
        marker_parts= d.MarkerName.str.split(':')
        d['CHR']= marker_parts.str[0].astype('str').astype('int')
        d['end']= marker_parts.str[1].astype('str').astype('int')
        d['start']= d.end - 1
        d= d.sort_values(by= ['CHR', 'start'])
        d= d[['CHR', 'start', 'end', 'MarkerName']]
        d.to_csv(output[0], sep= '\t', header= False, index= False)

SnakeMake From line 214 of nonadditive/Snakefile

# For every interval in input[0], report the closest feature(s) of input[1];
# `-t all` keeps every feature when several are tied.
shell:
        'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'

SnakeMake BEDTools From line 232 of nonadditive/Snakefile

run:
        # Add rs identifiers and the nearest gene to the summary statistics and
        # write the result as a gzipped tab-separated file.
        d= pd.read_csv(input[0], sep= '\t', header=0)
        rs= pd.read_csv(input[1], sep= '\t', header=0)
        d= pd.merge(d, rs, on= 'ID', how= 'left')
        # Use the merged `name` column wherever RSID is missing, empty or '-'.
        d['RSID']= np.where(pd.isnull(d.RSID), d.name, d.RSID)
        d['RSID']= np.where(d.RSID== '', d.name, d.RSID)
        d['RSID']= np.where(d.RSID== '-', d.name, d.RSID)
        # `axis` is given by keyword: the positional form was removed in pandas 2.0.
        d.drop('name', axis= 1, inplace= True)
        ne= pd.read_csv(input[2], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene', 'Ensembl_gene'])
        ne= ne[['ID', 'nearestGene']]
        d= pd.merge(d, ne, on= 'ID', how= 'left')
        d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip')

SnakeMake From line 244 of nonadditive/Snakefile

run:
	# Reformat every METAL result table (input[k] -> output[k]): filter on
	# sample size, orient effects to alphabetically ordered alleles, build
	# CHR:POS:REF:EFF IDs and keep p-values strictly between 0 and 1.
	for nfile in range(len(input)):
		d= pd.read_csv(input[nfile], sep= '\t', header= 0)
		for allele_col in ('Allele1', 'Allele2'):
			d[allele_col]= d[allele_col].str.upper()
		# Keep variants present in more than half of the maximum total sample size.
		d= d.loc[d.TOTALSAMPLESIZE > d['TOTALSAMPLESIZE'].max() / 2, :]
		d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
		for coord in ('CHR', 'POS'):
			d[coord]= d[coord].astype(str).astype(int)
		d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
		d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
		# Rows whose REF sorts after EFF get effect and frequency flipped and the alleles swapped.
		flip= d.REF > d.EFF
		d['BETA']= np.where(flip, -1* d.BETA, d.BETA)
		d['EAF']= np.where(flip, 1 - d.EAF, d.EAF)
		d['pvalue']= d['pvalue'].astype(str).astype(float)
		d.loc[flip, ['REF', 'EFF']] = d.loc[flip, ['EFF', 'REF']].values
		d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
		d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
		d.to_csv(output[nfile], header=True, index= False, sep= '\t')

SnakeMake From line 14 of other_metas/Snakefile

run:
	# Iterates over the input files by index; the loop body is not part of this excerpt.
	for fnumber in range(len(input)):

SnakeMake From line 48 of other_metas/Snakefile

run:
	# Inputs whose path contains 'other_meta' are paired by position with the
	# outputs; for each, bedtools reports the closest feature(s) of input[0]
	# (`-t all` keeps every feature when several are tied).
	meta_files= [x for x in input if 'other_meta' in x]
	for nfile in range(len(meta_files)):
		meta= meta_files[nfile]
		out= output[nfile]
		shell('bedtools closest -t all -a {meta} -b {input[0]} > {out}')

SnakeMake BEDTools From line 72 of other_metas/Snakefile

run:
	# Load the first input as a tab-separated table with a header row. The excerpt ends here.
	rs= pd.read_csv(input[0], sep= '\t', header=0)

SnakeMake From line 96 of other_metas/Snakefile

run:
        # Reformat every METAL result table (input[k] -> output[k]): filter on
        # sample size, orient effects to alphabetically ordered alleles, build
        # CHR:POS:REF:EFF IDs and keep p-values strictly between 0 and 1.
        for nfile in range(len(input)):
                d= pd.read_csv(input[nfile], sep= '\t', header= 0)
                for allele_col in ('Allele1', 'Allele2'):
                        d[allele_col]= d[allele_col].str.upper()
                # Keep variants present in more than half of the maximum total sample size.
                d= d.loc[d.TOTALSAMPLESIZE > d['TOTALSAMPLESIZE'].max() / 2, :]
                d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
                for coord in ('CHR', 'POS'):
                        d[coord]= d[coord].astype(str).astype(int)
                d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
                d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
                # Rows whose REF sorts after EFF get effect and frequency flipped and the alleles swapped.
                flip= d.REF > d.EFF
                d['BETA']= np.where(flip, -1* d.BETA, d.BETA)
                d['EAF']= np.where(flip, 1 - d.EAF, d.EAF)
                d['pvalue']= d['pvalue'].astype(str).astype(float)
                d.loc[flip, ['REF', 'EFF']] = d.loc[flip, ['EFF', 'REF']].values
                d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
                d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
                d.to_csv(output[nfile], header=True, index= False, sep= '\t')

SnakeMake From line 116 of other_metas/Snakefile

run:
        # Attach identifiers to the summary statistics: left-join the table in
        # input[0] onto input[1] by ID, expose its `name` column as RSID, and
        # write the result as a gzipped tab-separated file.
        rs= pd.read_csv(input[0], sep= '\t', header=0)
        d= pd.read_csv(input[1], sep= '\t', header=0)
        d= pd.merge(d, rs, on= 'ID', how= 'left')
        d['RSID']= d.name
        # `axis` is given by keyword: the positional form was removed in pandas 2.0.
        d.drop('name', axis= 1, inplace= True)
        d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip')

SnakeMake From line 144 of other_metas/Snakefile

library(data.table)
library(dplyr)
library(coloc)
library(parallel)

# GWAS summary statistics: one row per variant ID, with minor allele frequency.
df= fread(snakemake@input[[1]], select= c('CHR', 'POS', 'ID', 'BETA', 'SE', 'TOTALSAMPLESIZE', 'EAF')) %>%
  filter(!duplicated(ID)) %>%
  mutate(MAF= ifelse(EAF>0.5, 1 - EAF, EAF))

# Index positions and their nearest gene; POS is renamed to pos2 so it does
# not clash with the variant position after the join.
x= fread(snakemake@input[[2]], select= c('CHR', 'POS', 'nearestGene')) %>%
  select(CHR, pos2= POS, nearestGene)

# Pair every variant with every index position on the same chromosome, then
# keep the pairs lying within 1.5 Mb of the index position.
df= inner_join(df, x, by= 'CHR') %>%
  filter(POS>= pos2 - 1.5*10**6, POS< pos2 + 1.5*10**6)

# Second data set: upper-case alleles, CHR:POS:allele:allele IDs with the
# alleles in alphabetical order, and minor allele frequency.
z= fread(snakemake@input[[3]], select= c('chr', 'pos', 'Allele1', 'Allele2', 'Freq1', 'Effect', 'StdErr', 'TotalSampleSize')) %>%
  mutate(
    Allele1= toupper(Allele1),
    Allele2= toupper(Allele2),
    ID= ifelse(Allele1 > Allele2, paste(chr, pos, Allele2, Allele1, sep= ':'), paste(chr, pos, Allele1, Allele2, sep= ':')),
    maf= ifelse(Freq1> 0.5, 1 - Freq1, Freq1)
  ) %>%
  select(ID, maf, Effect, StdErr, TotalSampleSize)

df= inner_join(df, z, by= 'ID')

rm(z)

pph_outfile= snakemake@output[[1]]
results_outfile= snakemake@output[[2]]

# Start both result files with their header line; rows are appended later.
cat('nsnps\tPP.H0.abf\tPP.H1.abf\tPP.H2.abf\tPP.H3.abf\tPP.H4.abf\tprotein\n', file = snakemake@output[[1]])
cat('snp\tV.df\tz.df1\tr.df1\tlABF.df1\tV.df2\tz.df2\tr.df2\tlABF.df2\tinternal.sum.lABF\tSNP.PP.H4\tprotein\n', file= snakemake@output[[2]])

# Prior probabilities handed to coloc.abf (p1, p2, p12).
prior1= 1 * 10**-4
prior2= 1 * 10**-4
prior12= 5 * 10**-6

df= data.frame(df)


# Colocalization (coloc.abf) of the GWAS and the second data set for one
# nearest-gene window.
#   temp_df: the joined rows belonging to one value of `nearestGene`.
# Appends one row to `pph_outfile` (posterior probabilities) and the
# per-variant table to `results_outfile`; all-zero placeholder rows are
# written when there is no data or coloc.abf fails.
colocalization_eqtl= function(temp_df){
	protein= unique(temp_df$nearestGene)

	# Append the all-zero placeholder rows to both output files.
	write_placeholder= function() {
		PPH= data.frame(nsnps= 0, PP.H0.abf= 0, PP.H1.abf= 0, PP.H2.abf= 0, PP.H3.abf= 0, PP.H4.abf= 0, protein= protein)
		fwrite(PPH, pph_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
		res= data.frame(snp= 'none', V.df1= 0, z.df1= 0, r.df1= 0, lABF.df1= 0, V.df2= 0, z.df2= 0, r.df2= 0, lABF.df2= 0, internal.sum.lABF= 0, SNP.PP.H4= 0, protein= protein)
		fwrite(res, results_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
		print('next')
	}

	if (nrow(temp_df)== 0) {
		return(write_placeholder())
	}

	temp_df = filter(temp_df, SE>0, StdErr> 0)

	# The GWAS is treated as case-control (with a fixed case fraction `s`) when
	# the input path names allPTD or postTerm, and as quantitative otherwise.
	gwas_path= snakemake@input[[1]]
	case_fraction= if (grepl('allPTD', gwas_path)) 0.067 else if (grepl('postTerm', gwas_path)) 0.122 else NULL
	if (is.null(case_fraction)) {
		data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N= temp_df$TOTALSAMPLESIZE, type= 'quant', snp= temp_df$ID, MAF= temp_df$MAF)
	} else {
		data1= list(beta= temp_df$BETA, varbeta= temp_df$SE**2, N=temp_df$TOTALSAMPLESIZE, type= 'cc', snp= temp_df$ID, s= case_fraction, MAF= temp_df$MAF)
	}
	data2= list(beta= temp_df$Effect, varbeta= temp_df$StdErr**2, N=temp_df$TotalSampleSize, type= 'quant', snp= temp_df$ID, MAF= temp_df$maf)

	# An error inside coloc.abf is turned into the sentinel 0 (length 1).
	myres= tryCatch(
		suppressWarnings(coloc.abf(data1, data2, p1= prior1, p2= prior2, p12= prior12)),
		error= function(e) 0
	)
	if (length(myres)==1 ) {
		return(write_placeholder())
	}

	# First element: summary vector, written as a single row.
	PPH= data.frame(t(myres[[1]]))
	PPH$protein= protein
	fwrite(PPH, pph_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
	# Second element: per-variant results.
	res= myres[[2]]
	res$protein= protein
	fwrite(res, results_outfile, sep= '\t', row.names=F, col.names= F, quote=F, append= T)
}



# Run the colocalization once per nearest gene, on three worker processes.
# NOTE(review): all workers append to the same two output files, so rows from
# different genes may interleave -- confirm this is acceptable.
mclapply(split(df, df$nearestGene), colocalization_eqtl, mc.cores= 3)

R dplyr data.table Quant coloc From line 1 of pQTLs/coloc_pQTL.R

# Delegate this rule to the R script coloc_pQTL.R.
script:
	'coloc_pQTL.R'

SnakeMake From line 12 of pQTLs/Snakefile

/* Fix the report at 1280px: the main container gets a fixed width and
   maximum width, the page body a maximum width. */
body .main-container {
  max-width: 1280px !important;
  width: 1280px !important;
}
body {
  max-width: 1280px !important;
}

R Markdown From line 18 of reports/all_files_QC.Rmd

# Phenotype label: the 8th '/'-separated component of the second input path
# (this depends on the directory depth of the input).
pheno= unlist(strsplit(snakemake@input[[2]], '/'))[8]

R Markdown From line 28 of reports/all_files_QC.Rmd

library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library("data.table")
library(moments)
options(warn=-1)
opts_chunk$set(fig.width = 12)

R Markdown ggplot2 dplyr data.table tidyr cowplot knitr kableExtra From line 40 of reports/all_files_QC.Rmd

# Per-cohort QC summaries and per-chromosome samples, collected in lists keyed
# by cohort name and stacked after the loop.
p1_list= list()
p2_list= list()
p3_list= list()
df_list= list()
df2_list= list()
fl= length(snakemake@input)

# Effect sizes from the input whose path contains 'DECODE', used as the
# comparison for every cohort. The MAF column computed here is dropped again
# by the select() on the next line.
dec= fread(snakemake@input[[grep('DECODE', snakemake@input)]])
dec$MAF= ifelse(dec$EAF>0.5, 1 - dec$EAF, dec$EAF)
dec= select(dec, c(SNP, BETA))
names(dec)= c('SNP', 'BETA_dec')
# Inputs are processed from the second one onwards.
for (i in 2:fl){

df= fread(snakemake@input[[i]])
df= select(df, -c(STRAND))

# Cohort name: the 9th '/'-separated component of the input path.
cohort= unlist(strsplit(snakemake@input[[i]], '/'))[9]

df$cohort= cohort
df= filter(df, !is.na(EAF))

# p1: median sample size and mean standard error.
p1= summarize(df, n_m= median(N, na.rm=T), se_m= mean(SE, na.rm=T))
p1$cohort= cohort
# p2: sqrt of the maximum N, and median(1/sqrt(2*EAF*(1-EAF))) divided by median(SE).
p2= summarize(df, N_max= sqrt(max(N)), EAF_m= median(1/sqrt(2*EAF*(1-EAF)), na.rm=T) / median(SE))
p2$cohort= cohort

# p3: skewness and kurtosis of BETA/SE among variants with p-value above the median.
d= filter(df, pvalue> median(pvalue, na.rm=T))

p3= summarize(d,SK= skewness(BETA/SE), KU= kurtosis(BETA/SE))
p3$cohort= cohort

p1_list[[cohort]]= p1
p2_list[[cohort]]= p2
p3_list[[cohort]]= p3

# Random sample of 5000 rows per chromosome, drawn with replacement.
df_list[[cohort]]= group_by(df, CHR) %>% sample_n(5000, replace=T)

# Difference between the cohort effect and the DECODE effect for shared variants.
df= inner_join(df, dec, by= 'SNP')
df$beta_diff= df$BETA - df$BETA_dec

df2_list[[cohort]]= group_by(df, CHR) %>% sample_n(5000, replace=T)

}


# Stack the per-cohort pieces into one table each.
p1= do.call("rbind", p1_list)
p2= do.call("rbind", p2_list)
p3= do.call("rbind", p3_list)
d= do.call("rbind", df_list)
d2= do.call('rbind', df2_list)

R Markdown From line 57 of reports/all_files_QC.Rmd

# Mean standard error against median sample size, one labelled point per cohort.
ggplot(data= p1, mapping= aes(x= n_m, y= se_m)) +
  geom_point() +
  geom_text(mapping= aes(label= cohort), hjust= 0, vjust= 0) +
  theme_cowplot() +
  labs(x= 'Median(N)', y= 'Mean(SE)')

R Markdown ggplot2 From line 113 of reports/all_files_QC.Rmd

# Frequency-to-standard-error ratio against sqrt of the maximum sample size,
# one labelled point per cohort.
ggplot(data= p2, mapping= aes(x= N_max, y= EAF_m)) +
  geom_point() +
  geom_text(mapping= aes(label= cohort), hjust= 0, vjust= 0) +
  theme_cowplot() +
  labs(x= 'SQRT(Max(N))', y= 'median(1/sqrt(2*EAF*(1-EAF)), na.rm=T) / median(SE)')

R Markdown ggplot2 From line 127 of reports/all_files_QC.Rmd

# Kurtosis against skewness of the Z-scores, one labelled point per cohort.
ggplot(data= p3, mapping= aes(x= SK, y= KU)) +
  geom_point() +
  geom_text(mapping= aes(label= cohort), hjust= 0, vjust= 0) +
  theme_cowplot() +
  labs(x= 'Skewness (Z-score)', y= 'Kurtosis (Z-score)')

R Markdown ggplot2 From line 145 of reports/all_files_QC.Rmd

# Effect size against minor allele frequency for the sampled variants, one
# panel per cohort with its own y scale.
d$MAF= with(d, ifelse(EAF>0.5, 1 - EAF, EAF))

ggplot(data= d, mapping= aes(x= MAF, y= BETA)) +
  geom_point() +
  facet_wrap(vars(cohort), scales= 'free_y', ncol= 3) +
  theme_cowplot() +
  labs(x= 'MAF', y= 'BETA')

R Markdown ggplot2 From line 160 of reports/all_files_QC.Rmd

# Difference between cohort and DECODE effect sizes against minor allele
# frequency, one panel per cohort with its own y scale.
d2$MAF= with(d2, ifelse(EAF>0.5, 1 - EAF, EAF))

ggplot(data= d2, mapping= aes(x= MAF, y= beta_diff)) +
  geom_point() +
  facet_wrap(vars(cohort), scales= 'free_y', ncol= 3) +
  theme_cowplot() +
  labs(x= 'MAF', y= 'BETA cohort - BETA DECODE')

R Markdown ggplot2 From line 177 of reports/all_files_QC.Rmd

library(tint)

# Chunk and device options for the report: PNG figures at 600 dpi, with
# warnings and messages hidden.
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(dev = 'png', warning= FALSE, message= FALSE, dpi= 600)

# Display label for the phenotype wildcard; any value other than the three
# named ones is labelled as the normalized gestational duration.
pheno= switch(snakemake@wildcards[['pheno']],
  allPTD= 'Preterm Delivery',
  postTerm= 'Post Term',
  GAraw= 'Gestational duration',
  'Normalized Gestational Duration'
)

R Markdown From line 10 of reports/coloc.Rmd

library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)

# Eight-colour palette used by the figures of this report.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Table whose p1 and p2 columns hold sumstats file paths.
d <- fread(snakemake@input[[1]])

# Reduce each path to a bare trait code: take the piece that follows the
# directory marker (NA when the marker is absent) and strip the file suffix.
# `fixed = TRUE` makes the suffix a literal string; as a regex its dots would
# match any character.
piece_after <- function(paths, marker) {
  pieces <- strsplit(as.character(paths), marker, fixed = TRUE)
  vapply(pieces, function(parts) parts[2], character(1))
}
d$p1 <- gsub('.txt.sumstats.gz', '', piece_after(d$p1, 'LDscore/'), fixed = TRUE)
d$p2 <- gsub('.txt.sumstats.gz', '', piece_after(d$p2, 'LDSC/'), fixed = TRUE)

# Trait code -> display label. Codes missing from the table are labelled
# 'Endometriosis' and a missing code stays NA.
trait_labels <- c(
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Birth weight maternal effect',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Birth weight fetal effect',
  BW_fetal_effect = 'Birth weight fetal effect (adjusted MG)',
  Preeclampsia = 'Pre-eclampsia',
  BW_maternal_effect = 'Birth weight maternal effect (adjusted FG)',
  PCOS = 'Polycistic ovary syndrome'
)
trait_code <- as.character(d$p2)
trait_label <- unname(trait_labels[trait_code])
trait_label[is.na(trait_label) & !is.na(trait_code)] <- 'Endometriosis'
d$trait <- trait_label

R Markdown ggplot2 dplyr data.table tidyr cowplot ggrepel knitr kableExtra From line 30 of reports/coloc.Rmd

# Write one markdown bullet per row of d, carrying its trait label.
trait_bullets <- paste0('\n- ', as.character(factor(d$trait)))
cat(trait_bullets, sep = "\n")

R Markdown From line 78 of reports/coloc.Rmd

Testosterone in males was further included as a negative control, and only after a first round of genetic correlations.  

R Markdown From line 82 of reports/coloc.Rmd

# Birth-weight traits get their own figure; all other traits go to `df`.
bw <- filter(d, grepl('Birth weight ', d$trait))
df <- filter(d, !grepl('Birth weight ', d$trait))

# Flag correlations that pass the multiple-testing threshold.
# NOTE(review): the denominator hard-codes "rows minus 7"; the report text
# quotes 0.05 / 13 -- confirm the two agree for the current input.
df$significant <- ifelse(df$p < 0.05 / (nrow(df) - 7), '1', '0')

# Short axis labels for the four birth-weight analyses; any other label
# becomes '' and a missing label stays NA. (The former
# gsub('Birth weight', '', ...) is dropped: none of these labels contains it.)
bw_labels <- c(
  'Birth weight fetal effect' = 'Fetal effect',
  'Birth weight fetal effect (adjusted MG)' = 'Fetal effect\n (adjusted MG)',
  'Birth weight maternal effect (adjusted FG)' = 'Maternal effect \n(adjusted FG)',
  'Birth weight maternal effect' = 'Maternal effect'
)
bw_trait <- as.character(bw$trait)
bw_short <- unname(bw_labels[bw_trait])
bw_short[is.na(bw_short) & !is.na(bw_trait)] <- ''
bw$trait <- bw_short

# Genetic correlation with 95% CI per birth-weight analysis.
# - I() is removed from the error-bar mapping: it did nothing in older ggplot2
#   and from ggplot2 3.5.0 it marks a value as bypassing the scale.
# - The original ylim(-1, 1) was replaced by the scale_y_continuous() added
#   after it, so the limits never applied; coord_cartesian() zooms to [-1, 1]
#   without discarding intervals that reach past the limits.
# - guide = 'none' replaces the deprecated guide = F.
ggplot(bw, aes(trait, rg, colour = trait)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = rg - 1.96 * se, ymax = rg + 1.96 * se),
    width = .1, position = position_dodge(.9)
  ) +
  theme_cowplot() +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(1, 2, 4, 8)]) +
  xlab('Birth weight') +
  ylab('Genetic correlation [95% CI]') +
  geom_hline(yintercept = 0) +
  scale_y_continuous(breaks = seq(-1, 1, 0.2)) +
  coord_cartesian(ylim = c(-1, 1)) +
  geom_hline(yintercept = seq(-1, 1, 0.2), colour = 'grey', size = 0.3, linetype = 'dashed')

R Markdown ggplot2 From line 86 of reports/coloc.Rmd

# Genetic correlation with 95% CI for each trait in df, coloured by whether the
# correlation passed the significance flag ('1') or not ('0').
# - I() is removed from the error-bar mapping: it did nothing in older ggplot2
#   and from ggplot2 3.5.0 it marks a value as bypassing the scale.
# - The original ylim(-1, 1) was replaced by the scale_y_continuous() added
#   after it, so the limits never applied; coord_cartesian() zooms to [-1, 1]
#   without discarding intervals that reach past the limits.
# - Colours are named so that '0' stays grey even when no trait is flagged '1'
#   (or the reverse); guide = 'none' replaces the deprecated guide = F.
ggplot(df, aes(trait, rg, colour = significant)) +
  geom_point() +
  geom_errorbar(
    aes(ymin = rg - 1.96 * se, ymax = rg + 1.96 * se),
    width = .2, position = position_dodge(.9)
  ) +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  scale_colour_manual(
    guide = 'none',
    values = c('0' = '#737373', '1' = colorBlindBlack8[2])
  ) +
  xlab('Reproductive traits') +
  ylab('Genetic correlation [95% CI]') +
  geom_hline(yintercept = 0) +
  scale_y_continuous(breaks = seq(-1, 1, 0.2)) +
  coord_cartesian(ylim = c(-1, 1)) +
  geom_hline(yintercept = seq(-1, 1, 0.2), colour = 'grey', size = 0.3, linetype = 'dashed')

R Markdown ggplot2 From line 115 of reports/coloc.Rmd

We used a Bonferroni-corrected threshold for significance (0.05 / 13). We excluded testosterone in males, as this test was performed a posteriori as a negative control for testosterone in women.   

R Markdown From line 132 of reports/coloc.Rmd

While coloc naively assumes one causal variant, it does not require an LD matrix that represents the summary statistics used. This is almost impossible to obtain without an LD matrix from each of the studies used in the meta-analysis.  

R Markdown From line 157 of reports/coloc.Rmd

# Posterior-probability tables: the workflow inputs whose path contains 'pph'.
inputs <- snakemake@input[grep('pph', snakemake@input)]

# Read every table, keyed by its path, then stack them.
df_list <- list()
for (infile in inputs) {
  d_temp <- fread(infile)
  #d_temp$trait= gsub('.txt', '', unlist(strsplit(infile, '_'))[2])
  df_list[[infile]] <- d_temp
}
d <- do.call(rbind, df_list)

# Trait code -> display label. Codes missing from the table are labelled
# 'Endometriosis' and a missing code stays NA.
trait_labels <- c(
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Birth weight maternal effect',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Preeclampsia = 'Pre-eclampsia',
  Testosterone_male = 'Testosterone (men)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_fetal = 'Birth weight fetal effect',
  BW_fetal_effect = 'Birth weight fetal effect (adjusted MG)',
  BW_maternal_effect = 'Birth weight maternal effect (adjusted FG)',
  PCOS = 'Polycistic ovary syndrome'
)
trait_code <- as.character(d$trait)
trait_label <- unname(trait_labels[trait_code])
trait_label[is.na(trait_label) & !is.na(trait_code)] <- 'Endometriosis'
d$trait <- trait_label

# Tidy the locus identifier for display: underscores become spaces, 'chr'
# becomes 'Chr' and '23' becomes 'X'.
# NOTE(review): the last substitution rewrites every '23' in the string, not
# only a chromosome number -- confirm no locus name contains '23' elsewhere.
d$locus <- gsub('23', 'X', gsub('chr', 'Chr', gsub('_', ' ', d$locus)))

R Markdown From line 162 of reports/coloc.Rmd

We identify the different loci as the chromosome where the locus is located and the nearest protein coding gene to the top associated genetic variant (e.g., Chr5 EBF1).  

R Markdown From line 203 of reports/coloc.Rmd

# Long table of the H3 and H4 posteriors per locus and trait. It is not used
# by the figure below.
x1 <- select(d, locus, PP.H3.abf, trait)
x1$PP <- 'Shared locus'

x2 <- select(d, locus, PP.H4.abf, trait)
x2$PP <- 'Shared genetic variant'

names(x1) <- c('locus', 'coloc', 'trait', 'PP')
names(x2) <- c('locus', 'coloc', 'trait', 'PP')

x <- bind_rows(x1, x2)

# Evidence class from the H4 posterior: below 0.5, from 0.5 up to (not
# including) 0.75, and 0.75 or above.
d$coloc <- with(d, ifelse(PP.H4.abf < 0.5, 'No evidence',
  ifelse(PP.H4.abf < 0.75, 'Suggestive evidence', 'Strong evidence')))

# Scale values are named per evidence class. Unnamed values are handed out in
# alphabetical order of the classes present, so a missing class would shift the
# colours of the others. The pairing below is the one the unnamed vectors
# produce when all three classes occur.
# NOTE(review): 'No evidence' carries the highest alpha (1) -- confirm that is
# intended.
coloc_colours <- c(
  'No evidence' = '#737373',
  'Strong evidence' = colorBlindBlack8[2],
  'Suggestive evidence' = colorBlindBlack8[4]
)
coloc_alpha <- c(
  'No evidence' = 1,
  'Strong evidence' = 0.55,
  'Suggestive evidence' = 0.55
)

# One symbol per trait/locus pair: size follows the H4 posterior, colour and
# transparency the evidence class, shape the `direction` column.
# guide = 'none' replaces the deprecated guide = FALSE.
ggplot(d, aes(trait, locus, size = PP.H4.abf, fill = coloc, color = coloc, shape = direction, alpha = coloc)) +
  geom_point() +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  scale_size_binned('Posterior probability of colocalization', guide = 'none') +
  scale_alpha_manual('Colocalization', values = coloc_alpha) +
  scale_shape_manual('Effect direction', values = c(25, 21, 24)) +
  scale_fill_manual('Colocalization', values = coloc_colours) +
  scale_colour_manual('Colocalization', values = coloc_colours) +
  xlab('') +
  ylab('')

R Markdown ggplot2 From line 208 of reports/coloc.Rmd

# One slice per hypothesis (H0-H4): locus, posterior probability, trait and the
# hypothesis tag, all with the same column names so they can be stacked.
x3 <- select(d, locus, PP.H3.abf, trait)
x3$PP <- 'H3'

x4 <- select(d, locus, PP.H4.abf, trait)
x4$PP <- 'H4'

x0 <- select(d, locus, PP.H0.abf, trait)
x0$PP <- 'H0'

x1 <- select(d, locus, PP.H1.abf, trait)
x1$PP <- 'H1'

x2 <- select(d, locus, PP.H2.abf, trait)
x2$PP <- 'H2'

long_names <- c('locus', 'coloc', 'trait', 'PP')
names(x0) <- long_names
names(x1) <- long_names
names(x2) <- long_names
names(x3) <- long_names
names(x4) <- long_names

x <- bind_rows(x0, x1, x2, x3, x4)

# Rows ordered by hypothesis tag, H4 first.
x <- x[order(x$PP, decreasing = TRUE), ]

# '1' marks posteriors of at least 0.75.
x$evidence <- ifelse(x$coloc >= 0.75, '1', '0')

# Stacked H3/H4 posteriors per locus, one facet per trait. Scale values are
# named so each hypothesis and evidence flag keeps its colour/alpha even if the
# other level is absent; guide = 'none' replaces the deprecated guide = FALSE.
ggplot(filter(x, PP == 'H3' | PP == 'H4'), aes(fill = factor(PP), y = coloc, x = locus, alpha = evidence)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(
    'Posterior probability',
    values = c('H3' = colorBlindBlack8[2], 'H4' = colorBlindBlack8[4])
  ) +
  scale_alpha_manual(
    'Posterior probability',
    values = c('0' = 0.55, '1' = 0.8),
    guide = 'none'
  ) +
  facet_wrap(vars(trait), ncol = 3) +
  theme_cowplot() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    strip.background = element_blank(),
    legend.position = 'bottom'
  ) +
  xlab('') +
  ylab('')

R Markdown ggplot2 From line 248 of reports/coloc.Rmd

# Trait/locus pairs with an H4 posterior of at least 0.75.
z <- filter(d, PP.H4.abf >= 0.75)

# Per-SNP coloc results: the workflow inputs whose path contains 'results_'.
res_inputs <- snakemake@input[grep('results_', snakemake@input)]

df_list <- list()

for (infile in res_inputs) {
  # Stack the two z-score columns into one, remembering which column each value
  # came from in `pheno`.
  x <- fread(infile, select = c('snp', 'z.df1', 'z.df2', 'SNP.PP.H4', 'locus', 'trait')) %>%
    gather(pheno, zscore, c(z.df1, z.df2))

  # Two-sided p-value from the z-score.
  x$pvalue <- 2 * pnorm(-abs(x$zscore))

  # The SNP identifier is colon-separated; its second field is the position.
  x <- separate(x, snp, into = c('CHR', 'POS', 'REF', 'EFF'), sep = ':', remove = FALSE)
  x$POS <- as.numeric(x$POS)

  # z.df1 rows get the report's phenotype label, z.df2 rows the trait code.
  x$pheno <- ifelse(x$pheno == 'z.df1', pheno, x$trait)
  df_list[[infile]] <- x
}

df <- bind_rows(df_list)

# Trait code -> display label. Codes missing from the table are labelled
# 'Endometriosis' and a missing code stays NA.
trait_labels <- c(
  miscarriage = 'Miscarriage',
  GA_fetal = 'GA fetal effect',
  BW_maternal = 'Birth weight maternal effect',
  AFB = 'Age at first birth',
  AMenarche = 'Age at menarche',
  AMenopause = 'Age at menopause',
  NLB = 'Number of live births',
  Testosterone_fem = 'Testosterone (women)',
  SHBG_fem = 'SHBG (women)',
  SHBG_male = 'SHBG (men)',
  CBAT_fem = 'CBAT (women)',
  CBAT_male = 'CBAT (men)',
  Oestradiol_fem = 'Oestradiol (women)',
  POP = 'Pelvic Organ Prolapse',
  Preeclampsia = 'Pre-eclampsia',
  Testosterone_male = 'Testosterone (men)',
  BW_fetal = 'Birth weight fetal effect',
  BW_fetal_effect = 'Birth weight fetal effect (adjusted MG)',
  leiomyoma_uterus = 'Leiomyoma uterus',
  BW_maternal_effect = 'Birth weight maternal effect (adjusted FG)',
  PCOS = 'Polycistic ovary syndrome'
)
trait_code <- as.character(df$trait)
trait_label <- unname(trait_labels[trait_code])
trait_label[is.na(trait_label) & !is.na(trait_code)] <- 'Endometriosis'
df$trait <- trait_label

# Same locus clean-up as applied to the posterior table, so the ids below match.
df$locus <- gsub('23', 'X', gsub('chr', 'Chr', gsub('_', ' ', df$locus)))

# Keep only SNPs that belong to a trait/locus pair selected in z.
z$id <- paste(z$trait, z$locus, sep = ':')
df$id <- paste(df$trait, df$locus, sep = ':')
df <- filter(df, id %in% z$id)

# Rows not carrying the report's phenotype label now show the trait's display
# label instead of its code.
df$pheno <- ifelse(df$pheno == pheno, pheno, df$trait)

# One regional association figure per selected trait/locus pair: the two
# phenotypes are stacked in separate panels and the SNP(s) with the highest
# per-SNP H4 posterior are highlighted and labelled.
for (i in unique(df$id)) {

  # H4 posterior of this trait/locus pair, shown in the title.
  PP <- filter(z, id == i)$PP.H4.abf

  temp_df <- filter(df, id == i)
  temp_df$POS <- temp_df$POS / 10**6  # positions in Mbp
  high_df <- filter(temp_df, SNP.PP.H4 == max(SNP.PP.H4))

  # guide = 'none' replaces the deprecated guide = FALSE.
  region_plot <- ggplot() +
    geom_point(data = temp_df, aes(POS, -log10(pvalue), colour = pheno), size = 1, alpha = 0.5) +
    geom_point(data = high_df, aes(POS, -log10(pvalue)), colour = colorBlindBlack8[1], size = 2) +
    facet_wrap(vars(pheno), nrow = 2, scales = "free_y") +
    theme_cowplot(font_size = 14) +
    theme(strip.background = element_blank()) +
    scale_colour_manual(guide = 'none', values = colorBlindBlack8[c(4, 2)]) +
    ylab('-log10(pvalue)') +
    xlab('Position (Mbp)') +
    geom_text_repel(data = high_df, aes(x = POS, y = -log10(pvalue), label = snp), hjust = 0.5, size = 3, vjust = 1) +
    ggtitle(paste('Locus: ', unique(temp_df$locus), '. Posterior probability for shared causal variant: ', round(PP, 3)))

  # Inside a loop the plot has to be printed explicitly.
  print(region_plot)

  cat('  \n')

}

R Markdown From line 297 of reports/coloc.Rmd

# Cohort and phenotype are read from fixed positions of the first input path
# (9th and 8th '/'-separated fields).
path_fields <- unlist(strsplit(snakemake@input[[1]], '/'))
cohort <- path_fields[9]
pheno <- path_fields[8]

R Markdown From line 18 of reports/file_level_qc.Rmd

library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library("data.table")
options(warn=-1)
#opts_chunk$set(dpi=300, out.width="300px")

R Markdown ggplot2 dplyr data.table tidyr cowplot knitr kableExtra From line 29 of reports/file_level_qc.Rmd

# Summary statistics under review; the first line of the file is the header.
d <- fread(snakemake@input[[1]], header = TRUE)

R Markdown From line 45 of reports/file_level_qc.Rmd

# Descriptive summary of the main columns, rendered as a table.
d %>%
  select(BETA, SE, pvalue, EAF, N) %>%
  summary() %>%
  kable()

R Markdown From line 53 of reports/file_level_qc.Rmd

#dec= fread(snakemake@input[[3]],h=T, select= c('CHR', 'POS', 'BETA', 'SE', 'EFF', 'REF', 'EAF', 'pvalue'))

#names(dec)= c('CHR', 'POS', 'BETA_dec', 'SE_dec', 'A1_dec', 'A2_dec', 'EAF_dec', 'pvalue_dec')

#dec$BETA_dec= ifelse(dec$A1_dec> dec$A2_dec, dec$BETA_dec, -1* dec$BETA_dec)
#dec$EAF_dec= ifelse(dec$A1_dec> dec$A2_dec, dec$EAF_dec, 1- dec$EAF_dec)

# Reference table (second workflow input); it supplies the ea, oa and eaf
# columns used below.
x <- fread(snakemake@input[[2]], header = TRUE)

df <- d
df$CHR <- as.numeric(df$CHR)
x$CHR <- as.numeric(x$CHR)

# Express the sample frequency for the alphabetically larger allele.
# NOTE(review): assumes the reference `eaf` follows the same convention -- confirm.
df$EAF <- ifelse(df$EFF > df$REF, df$EAF, 1 - df$EAF)

# Match variants on chromosome and position. dplyr's join key argument is `by`;
# the previous `on =` is not an argument of inner_join(), so the key was never
# applied as written.
df <- inner_join(df, x, by = c('CHR', 'POS'))

# Keep variants whose allele pair matches the reference in either orientation.
df <- filter(df, (EFF == ea & REF == oa) | (REF == ea & EFF == oa))

if (nrow(df) > 0) {
  ggplot(df, aes(eaf, EAF)) +
    geom_point(alpha = 1 / 10) +
    theme_cowplot(12) +
    xlab('EAF HRC') +
    ylab('EAF Sample')
} else {
  print('No match')
}

# Free the merged table and the reference.
rm(df, x)

R Markdown ggplot2 From line 63 of reports/file_level_qc.Rmd

# Flip the sign of the effect wherever REF does not sort after EFF.
d$BETA <- ifelse(d$REF > d$EFF, d$BETA, -d$BETA)
#dec= inner_join(d, dec, on= c('CHR', 'POS'))

#dec= filter(dec, EFF== A1_dec, REF== A2_dec, pvalue_dec< 0.001)

#dec= group_by(dec, CHR) %>% slice_sample(n= 1000, replace= T)

#ggplot(dec, aes(BETA_dec/SE_dec, BETA/SE)) +
#geom_point(alpha= 1/10) +
#theme_cowplot(12) +
#xlab('DECODE z-score') +
#ylab('Observed z-score')

#rm(dec)

R Markdown From line 101 of reports/file_level_qc.Rmd

# P-value implied by BETA and SE: upper tail of a 1-df chi-square at z^2.
d$exp_pvalue <- pchisq((d$BETA / d$SE)^2, df = 1, lower.tail = FALSE)

# Reported against implied p-value, on 10,000 variants drawn with replacement
# from each chromosome.
sampled <- group_by(d, CHR) %>% sample_n(10000, replace = TRUE)

ggplot(sampled, aes(-log10(exp_pvalue), -log10(pvalue))) +
  geom_point(alpha = 1 / 10) +
  theme_cowplot() +
  labs(x = 'Expected pvalue', y = 'Observed pvalue')

R Markdown ggplot2 From line 125 of reports/file_level_qc.Rmd

# QQ plot. After sorting by p-value, the expected value of the i-th smallest of
# n p-values is taken as i / n. seq_along() replaces 1:length(), which yields
# c(1, 0) on an empty table.
df <- arrange(d, pvalue) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue)))

# Only p < 0.05 is drawn. theme_cowplot() now gets font_size alone: the former
# call theme_cowplot(12, font_size = 12) bound the unnamed 12 to the next
# formal argument, font_family.
ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue))) +
  geom_point(size = 0.4) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))')

R Markdown ggplot2 From line 140 of reports/file_level_qc.Rmd

library(tint)

# knitr defaults for the report: source left untidied, tint version added to
# the cache key, PNG figures at 600 dpi, chunk warnings and messages hidden.
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(dev = 'png', warning = FALSE, message = FALSE, dpi = 600)
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)

# Translate the workflow's phenotype wildcard into the label shown in the
# report; any code other than the three listed gets the last label.
pheno <- switch(
  snakemake@wildcards[['pheno']],
  allPTD = 'Preterm Delivery',
  postTerm = 'Post Term',
  GAraw = 'Gestational duration',
  'Normalized Gestational Duration'
)

R Markdown From line 10 of reports/forest_plots.Rmd

library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)

# Eight-colour palette used by the figures of this report.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

R Markdown ggplot2 dplyr data.table tidyr cowplot ggrepel knitr kableExtra From line 29 of reports/forest_plots.Rmd

# First input: table that is later extended with the meta-analysis rows.
# Third input: loci table (columns CHR, pos1, pos2, nearestGene are used below).
d <- fread(snakemake@input[[1]])
z <- fread(snakemake@input[[3]])

# Second input: meta-analysis results, restricted to the columns needed and
# renamed to the names used in `d`.
df <- fread(
  snakemake@input[[2]],
  select = c('MarkerName', 'Effect', 'StdErr', 'HetISq', 'HetPVal', 'TOTALSAMPLESIZE', 'P-value', 'Allele1', 'Allele2')
)
names(df) <- c('SNP', 'BETA', 'SE', 'HetISq', 'HetPval', 'N', 'pvalue', 'A1', 'A2')

df <- df %>%
  # only variants that are present in d
  filter(SNP %in% d$SNP) %>%
  # the marker name is colon-separated; keep chromosome and position
  separate(SNP, into = c('CHR', 'POS', 'Ax1', 'Ax2', 'ID'), sep = ':', remove = FALSE) %>%
  mutate(
    # flip the effect when Allele2 sorts after Allele1
    BETA = ifelse(A2 > A1, -1 * BETA, BETA),
    # chromosome X is coded 23
    CHR = as.integer(ifelse(CHR == 'X', '23', CHR)),
    POS = as.integer(POS)
  ) %>%
  select(-c(A1, A2, ID, Ax1, Ax2)) %>%
  mutate(cohort = 'Meta-analysis')

d <- bind_rows(d, df)

z$CHR <- as.integer(ifelse(z$CHR == 'X', '23', z$CHR))

# Attach each variant to the locus whose interval (pos1, pos2) contains it.
d <- inner_join(d, z, by = 'CHR') %>%
  filter(POS > pos1, POS < pos2)

# Display labels for the locus and for each cohort (with its sample size).
d$locus <- paste0('Chr ', d$CHR, ': ', d$nearestGene)
d$cohort <- paste0(d$cohort, ' (n= ', d$N, ')')

R Markdown From line 44 of reports/forest_plots.Rmd

# One forest plot per locus, preceded by a markdown heading and the pooled
# estimate, and followed by the heterogeneity statistics and a page break.
# Rows with a non-missing HetISq are the meta-analysis rows.
for (i in unique(d$locus)) {
  temp_df <- d[d$locus == i, ]
  meta <- temp_df[!is.na(temp_df$HetISq), ]

  cat('\n')

  cat("\n# Forest plot for locus ", i, "\n")

  cat("\n")

  cat('\n')

  cat('Lead variant: \n', meta$SNP)

  cat('\n')

  cat(paste0(
    '\n Meta-analysis: Beta= ', round(meta$BETA, 3),
    ' (95% CI= ', round(meta$BETA - 1.96 * meta$SE, 3),
    ', ', round(meta$BETA + 1.96 * meta$SE, 3),
    '); pvalue= ', meta$pvalue
  ))

  cat('\n')

  cat('\n')

  # Largest sample first; the x axis (flipped to vertical) follows row order.
  temp_df <- temp_df[order(temp_df$N, decreasing = TRUE), ]
  meta <- temp_df[!is.na(temp_df$HetISq), ]

  # - `levels = unique(cohort)` keeps row order and cannot fail on a repeated
  #   cohort label (the former `level = factor(cohort)` relied on partial
  #   argument matching and errors on duplicated levels).
  # - Scale values are named so the meta-analysis row keeps its shape and
  #   colour whatever rows are present; guide = 'none' replaces the deprecated
  #   guide = F.
  # - The `alpha = 0.5` formerly passed to ggplot() was ignored and is removed.
  forest <- ggplot(
    temp_df,
    aes(
      x = factor(cohort, levels = unique(cohort)),
      y = BETA,
      ymin = BETA - 1.96 * SE,
      ymax = BETA + 1.96 * SE,
      colour = !is.na(HetISq),
      shape = !is.na(HetISq)
    )
  ) +
    geom_pointrange(size = 1, alpha = 0.7) +
    scale_shape_manual(values = c('FALSE' = 15, 'TRUE' = 18), guide = 'none') +
    geom_hline(yintercept = 0, linetype = 2) +
    scale_y_continuous(sec.axis = dup_axis()) +
    coord_flip() +
    scale_colour_manual(
      values = c('FALSE' = colorBlindBlack8[3], 'TRUE' = colorBlindBlack8[4]),
      guide = 'none'
    ) +
    theme_cowplot() +
    xlab('') +
    ylab('Beta [95% CI]') +
    geom_vline(xintercept = 0, linetype = "dotted", colour = 'grey')

  # Inside a loop the plot has to be printed explicitly.
  print(forest)

  cat('\n')

  cat('Test for heterogeneity: I^2^= ', meta$HetISq, '%; Het pvalue= ', meta$HetPval, '\n')

  cat('\\pagebreak')

}

R Markdown From line 78 of reports/forest_plots.Rmd

library(tint)

# knitr defaults for the report: source left untidied, tint version added to
# the cache key, PNG figures at 600 dpi, chunk warnings and messages hidden.
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(dev = 'png', warning = FALSE, message = FALSE, dpi = 600)
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)

# Translate the workflow's phenotype wildcard into the label shown in the
# report; any code other than the three listed gets the last label.
pheno <- switch(
  snakemake@wildcards[['pheno']],
  allPTD = 'Preterm Delivery',
  postTerm = 'Post Term',
  GAraw = 'Gestational duration',
  'Normalized Gestational Duration'
)

R Markdown From line 10 of reports/meta_qc.Rmd

library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)

R Markdown ggplot2 dplyr data.table tidyr cowplot ggrepel knitr kableExtra From line 32 of reports/meta_qc.Rmd

# Summary statistics (first workflow input, header on the first line).
d <- fread(snakemake@input[[1]], header = TRUE)

# Minor allele frequency: the smaller of EAF and 1 - EAF.
d$MAF <- pmin(d$EAF, 1 - d$EAF)

# Eight-colour palette used by the figures of this report.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Distribution of the minor allele frequency.
ggplot(data = d, mapping = aes(x = MAF)) +
  geom_density(fill = colorBlindBlack8[4]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'MAF')

#d= select(d, -MAF)

R Markdown ggplot2 From line 45 of reports/meta_qc.Rmd

# Distribution of the per-variant total sample size.
ggplot(data = d, mapping = aes(x = TOTALSAMPLESIZE)) +
  geom_density(fill = colorBlindBlack8[3]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'Sample size')

R Markdown ggplot2 From line 61 of reports/meta_qc.Rmd

# Distribution of the effect estimates.
ggplot(data = d, mapping = aes(x = BETA)) +
  geom_density(fill = colorBlindBlack8[2]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'Beta')

R Markdown ggplot2 From line 72 of reports/meta_qc.Rmd

**Effective sample size** for binary phenotypes was calculated as:  
$$\frac{2}{(\frac{1}{Ncases} + \frac{1}{Ncontrols})}$$  

R Markdown From line 91 of reports/meta_qc.Rmd

# Chromosome and position of the loci treated as previously known when the
# Manhattan plot is coloured. The chromosome 1 entry appears twice.
ge <- data.frame(
  CHR = c(5, 3, 1, 23, 1),
  pos_ge = c(157895049, 127881613, 22470407, 115164770, 22470407)
)

#kable(summary(select(d, BETA, SE, pvalue, EAF, TOTALSAMPLESIZE)), digits = c(3, 3, 5, 4, 0), col.names = c('Beta', 'Standard error', 'P-value', 'Effect allele frequency', 'Sample size'), caption= 'Summary statistics after QC.')

R Markdown From line 110 of reports/meta_qc.Rmd

The same number of loci is obtained when using a larger radius (1.5Mb).

R Markdown From line 124 of reports/meta_qc.Rmd

We note that we used a naive approach to identify independent loci. This should be interpreted cautiously. We mapped the top genetic variants to the body (TSS or TES) of the nearest protein-coding gene.  

R Markdown From line 128 of reports/meta_qc.Rmd

# Sort by p-value so that, for positions listed more than once, the first
# (smallest p-value) row is the one kept.
df <- arrange(d, pvalue)

#df= group_by(df, CHR, POS) %>% filter(row_number() == 1) %>% ungroup()
df <- df[!duplicated(df[, c('CHR', 'POS')]), ]

# Naive clumping of genome-wide significant variants: along each chromosome a
# new clump starts whenever the gap to the previous significant variant exceeds
# 250 kb; each clump is represented by its smallest p-value (ties broken at
# random). POS is converted to double before lag() because the -Inf default
# cannot be represented as an integer, and newer dplyr casts `default` to the
# type of `x`.
dg <- df %>%
  arrange(CHR, POS) %>%
  filter(pvalue < 5 * 10**-8) %>%
  group_by(CHR) %>%
  mutate(
    d = POS - lag(as.numeric(POS), default = -Inf),
    clumpid = cumsum(d > 250000)
  ) %>%
  group_by(CHR, clumpid) %>%
  filter(rank(pvalue, ties.method = "random") == 1) %>%
  mutate(GENE = ifelse(SYMBOL == '', RSID, SYMBOL)) %>%
  ungroup()

dg <- filter(dg, pvalue < 5 * 10**-8)
dg <- group_by(dg, CHR, POS) %>% filter(row_number() == 1)
#dg$GENE= ifelse(grepl('rs|-', dg$GENE), dg$nearestGene, ifelse(dg$GENE=='', dg$nearestGene, dg$GENE))
# The label finally used for a locus is its nearest gene.
dg$GENE <- dg$nearestGene

# Cumulative genomic coordinate: each chromosome is offset by the summed
# lengths (largest observed position) of the chromosomes before it.
don <- df %>%
  group_by(CHR) %>%
  summarise(chr_len = max(POS)) %>%
  mutate(tot = cumsum(as.numeric(chr_len)) - chr_len) %>%
  select(-chr_len) %>%
  left_join(df, ., by = 'CHR') %>%
  arrange(CHR, POS) %>%
  mutate(BPcum = POS + tot) %>%
  ungroup()

# Mid-point of every chromosome on the cumulative axis, for the tick labels.
axisdf <- don %>% group_by(CHR) %>% summarize(center = (max(BPcum) + min(BPcum)) / 2)
names(axisdf) <- c('CHR', 'center')

# Genome-wide significance line on the -log10 scale.
HC <- -log10(5 * 10**-8)

dg <- dg %>% ungroup() %>% select(ID, GENE, CHR, POS, MAF, BETA)
# Only lead variants receive a GENE label in don.
don <- left_join(don, select(dg, ID, GENE), by = 'ID')
names(dg) <- c('ID', 'GENE', 'CHR', 'POS_new', 'MAF', 'BETA')

# Half-width (bp) of the window coloured around each locus.
lims <- 250000

# 0 = not significant; significant variants start as NA and are classified next.
don$disc <- ifelse(don$pvalue > 5 * 10**-8, 0, NA)

# Everything within `lims` of a lead variant is tagged 2 (new discovery) ...
# (rows are addressed by integer index rather than by row name)
for (i in seq_len(nrow(dg))) {
  don <- mutate(don, disc = ifelse(CHR == as.integer(dg$CHR[i]) & POS >= as.integer(dg$POS_new[i]) - lims & POS <= as.integer(dg$POS_new[i]) + lims, 2, disc))
}

# ... and everything within `lims` of a listed known locus is then re-tagged 1.
for (i in seq_len(nrow(ge))) {
  don <- mutate(don, disc = ifelse(CHR == as.integer(ge$CHR[i]) & POS >= as.integer(ge$pos_ge[i]) - lims & POS <= as.integer(ge$pos_ge[i]) + lims, 1, disc))
}

# Order rows by class so highlighted points are drawn after the grey ones.
don <- don[order(don$disc, decreasing = FALSE, na.last = TRUE), ]
don$disc <- factor(don$disc, levels = c(0, 1, 2), labels = c('Not significant', 'Previous discovery', 'New discovery'))

cols <- c('Not significant' = 'grey', 'Previous discovery' = colorBlindBlack8[4], 'New discovery' = colorBlindBlack8[2])

# Labelled variants show their nearest gene; all others stay unlabelled (NA).
don$GENE <- ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)

# Manhattan plot on the cumulative coordinate, coloured by discovery class,
# with the significance line and gene labels.
ggplot(don) +
  geom_point(data = don, aes(x = BPcum, y = -log10(pvalue), colour = disc), size = 0.3) +
  theme_cowplot(font_size = 12) + #theme_minimal_hgrid(12, rel_small = -1) +
  #scale_alpha_manual(values= rep(c(1/10, 1/2), 23)) +
  scale_colour_manual(values = cols) +
  # one tick per chromosome, placed at its mid-point
  scale_x_continuous(labels = axisdf$CHR, breaks = axisdf$center, expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = 'Chromosome', y = '-log10(pvalue)', colour = '') +
  geom_hline(yintercept = 0, size = 0.5, colour = 'black') +
  geom_hline(yintercept = HC, size = 0.5, linetype = 2, colour = '#878787') +
  geom_text_repel(data = don, aes(x = BPcum, y = -log10(pvalue), label = GENE), size = 3, hjust = 1, force = 1, vjust = 1, colour = 'black') +
  theme(legend.position = 'bottom') +
  guides(colour = guide_legend(override.aes = list(size = 3)))

R Markdown ggplot2 From line 134 of reports/meta_qc.Rmd

# Clump table (sixth workflow input); columns BP, P, TOTAL, NSIG and SP2 are
# used below.
indep <- fread(snakemake@input[[6]])

# Number of comma-separated entries in SP2. An SP2 of "NONE" lists no variants
# and is counted as 0; it used to be counted as one entry.
# NOTE(review): assumes the file is a PLINK --clump report, where SP2 is
# "NONE" for an empty clump -- confirm.
n_listed <- lengths(strsplit(indep$SP2, ','))
indep$nd2P <- ifelse(!is.na(indep$SP2) & indep$SP2 == 'NONE', 0L, n_listed)

# Assign each clump to the lead variant(s) within 1.5 Mb on the same chromosome.
indep <- inner_join(indep, dg, by = 'CHR')
indep <- filter(indep, BP >= POS_new - 1.5 * 10**6, BP <= POS_new + 1.5 * 10**6)

# Per-locus totals, ordered so the smallest minimum p-value ends up last.
indep_df <- group_by(indep, GENE) %>%
  summarize(total = sum(TOTAL), nsig = sum(NSIG), GWS = n(), sug_ev = sum(nd2P), mP = min(P))
indep_df <- indep_df[order(indep_df$mP, decreasing = TRUE), ]
indep_df$GENE <- factor(indep_df$GENE, levels = indep_df$GENE)
indep_df <- filter(indep_df, !grepl('HLA', GENE))

# Three aligned horizontal bar charts sharing the locus axis; only the first
# one shows the locus names.
p1 <- ggplot(data = indep_df, aes(x = GENE, y = GWS)) +
  geom_col(fill = colorBlindBlack8[2], alpha = 0.6) +
  theme_cowplot() +
  ylab('# Independent GW significant') +
  xlab('Locus') +
  coord_flip()

p2 <- ggplot(data = indep_df, aes(x = GENE, y = total)) +
  geom_col(fill = colorBlindBlack8[4], alpha = 0.6) +
  theme_cowplot() +
  ylab('Total # of \n genetic variants in locus') +
  xlab('Locus') +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  coord_flip()

p3 <- ggplot(data = indep_df, aes(x = GENE, y = sug_ev / total)) +
  geom_col(fill = colorBlindBlack8[8], alpha = 0.6) +
  theme_cowplot() +
  ylab('Proportion of variants with P<1e-5') +
  xlab('Locus') +
  ylim(0, 1) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  coord_flip()

plot_grid(p1, p2, p3, align = "h", nrow = 1)

R Markdown ggplot2 From line 207 of reports/meta_qc.Rmd

# Split variants into three equally sized MAF bins and take the upper bound of
# the first two bins for the legend.
d <- mutate(d, maf_tertiles = ntile(MAF, 3))
m1 <- round(max(d$MAF[which(d$maf_tertiles == 1)]), 3)
m2 <- round(max(d$MAF[which(d$maf_tertiles == 2)]), 3)

# The middle bin lies between the two bounds, so its label reads
# "m1 < MAF < m2" (it used to read "m1 < MAF > m2").
d$maf_tertiles <- factor(
  d$maf_tertiles,
  levels = c("1", "2", "3"),
  labels = c(paste('MAF<', m1), paste(m1, '< MAF <', m2), paste('MAF>', m2))
)

# Expected -log10(p) within each bin: after sorting by p-value, the i-th
# smallest of n values is expected at i / n. seq_along() replaces 1:length(),
# and the grouping is dropped once the column is computed.
df <- arrange(d, pvalue) %>%
  group_by(maf_tertiles) %>%
  mutate(exp1 = -log10(seq_along(pvalue) / length(pvalue))) %>%
  ungroup()

# QQ plot restricted to p < 0.05, one colour per MAF bin.
ggplot(filter(df, pvalue < 0.05), aes(exp1, -log10(pvalue), color = maf_tertiles)) +
  geom_point(size = 0.4) +
  scale_color_manual(values = colorBlindBlack8[2:4]) +
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
  labs(colour = "") +
  theme_cowplot(font_size = 12) +
  xlab('Expected (-log10(p-value))') +
  ylab('Observed (-log10(p-value))') +
  theme(legend.position = 'bottom') +
  guides(colour = guide_legend(override.aes = list(size = 3)))

R Markdown ggplot2 From line 263 of reports/meta_qc.Rmd

# Keep suggestive variants (p < 1e-4) in don; d1 holds the genome-wide
# significant ones, most significant first.
don <- filter(don, pvalue < 1 * 10**-4)
d1 <- filter(don, pvalue < 5 * 10**-8)
d1 <- d1[order(d1$pvalue, decreasing = FALSE), ]

# From here on p-values are text formatted to three significant digits.
d1$pvalue <- format(d1$pvalue, digits = 3)
don$pvalue <- format(don$pvalue, digits = 3)

# Table of the labelled (lead) variants.
d1 %>%
  filter(GENE != '') %>%
  select(ID, RSID, GENE, TOTALSAMPLESIZE, EAF, BETA, SE, pvalue, Consequence) %>%
  kable(
    col.names = c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'Beta', 'SE', 'P-value', 'Consequence'),
    digits = 3
  ) #%>% kable_styling(latex_options = c("striped", "scale_down"))

R Markdown From line 292 of reports/meta_qc.Rmd

# Absolute effect size against MAF for the lead variants; drawn only when more
# than one locus was found.
if (nrow(dg) > 1) {

  # - alpha is set on the point layer; as an extra argument to ggplot() it was
  #   ignored.
  # - guide = 'none' replaces the deprecated guide = F on the size scale.
  # - The text layer uses show.legend = FALSE; `guide` is not a layer parameter.
  effect_plot <- ggplot(dg, aes(MAF, abs(BETA), size = abs(BETA))) +
    geom_point(colour = colorBlindBlack8[4], alpha = 0.7) +
    theme_cowplot(font_size = 12) +
    scale_size_continuous(name = 'Absolute Beta', guide = 'none') +
    geom_text_repel(data = dg, aes(label = GENE), show.legend = FALSE) +
    xlab('Minor allele frequency') +
    ylab('Absolute effect size')

  print(effect_plot)

} else {

  print('Only one locus identified, check the table.')
  plot_comment <- ''
}

R Markdown From line 315 of reports/meta_qc.Rmd

# Meta-analysis output (fifth workflow input): marker, effect, association
# p-value and heterogeneity p-value.
z <- fread(
  snakemake@input[[5]],
  header = TRUE, sep = '\t',
  select = c('MarkerName', 'Effect', 'P-value', 'HetPVal')
)
names(z) <- c('ID', 'beta', 'pvalue', 'het_pvalue')

# Drop the variant-type suffix so the marker name matches the ID used in dg.
z$ID <- gsub(':SNP', '', z$ID)
z$ID <- gsub(':INDEL', '', z$ID)

# Keep the lead variants only.
z <- inner_join(z, dg, by = 'ID')

if (nrow(z) > 1) {

  plot_comment <- 'No pattern between effect size and heterogeneity. Attention should be paid to the top hit.'

  z$Direction <- ifelse(z$beta > 0, 'Positive', 'Negative')

  # Association against heterogeneity evidence, point size by absolute effect.
  # alpha is set on the point layer; as an extra argument to ggplot() it was
  # ignored.
  ggplot(z, aes(-log10(het_pvalue), -log10(pvalue), size = abs(beta))) +
    geom_point(colour = colorBlindBlack8[4], alpha = 0.7) +
    theme_cowplot(font_size = 12) +
    scale_size_continuous(name = 'Absolute effect size') +
    geom_text_repel(data = z, aes(label = GENE), hjust = 1, show.legend = FALSE) +
    xlab('-log10(Het pvalue)') +
    ylab('-log10(Association pvalue)') +
    theme(legend.position = "bottom")

} else {
  print(paste('Pvalue for heterogeneity: ', z$het_pvalue))
  #plot_comment= ''
}

R Markdown ggplot2 From line 338 of reports/meta_qc.Rmd

# Table of the variants in don annotated with HIGH or MODERATE impact.
don %>%
  filter(IMPACT %in% c('HIGH', 'MODERATE')) %>%
  select(ID, RSID, SYMBOL, TOTALSAMPLESIZE, EAF, BETA, SE, pvalue, Consequence) %>%
  kable(
    col.names = c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'Beta', 'SE', 'P-value', 'Consequence'),
    digits = 3
  )

# From the log (second workflow input), keep the line two below the
# 'Heritability of phenotype 1' heading.
x <- readLines(snakemake@input[[2]])
heading_line <- match('Heritability of phenotype 1', x)
x <- x[heading_line + 2]

R Markdown From line 376 of reports/meta_qc.Rmd

Ideally, calculate LDscores from our sample (MOBAGENETICS) or from a bigger cohort (UKBIOBANK).

R Markdown From line 393 of reports/meta_qc.Rmd

# Genetic-correlation table (third workflow input); p1 and p2 hold the
# sumstats file paths.
d <- fread(snakemake@input[[3]])

# Phenotype name = last path component without the sumstats suffix.
# `fixed = TRUE` makes the suffix a literal string; as a regex its dots would
# match any character.
pheno_from_path <- function(paths) {
  file_names <- sapply(strsplit(as.character(paths), '/'), tail, 1)
  gsub('.txt.sumstats.gz', '', file_names, fixed = TRUE)
}
d$pheno1 <- pheno_from_path(d$p1)
d$pheno2 <- pheno_from_path(d$p2)

# Clamp the estimates to [-1, 1].
d$rg <- pmin(pmax(d$rg, -1), 1)

# Axis limits: at least [-1, 1], widened if an interval reaches further.
maxy <- max(1, max(d$rg + 1.96 * d$se))
miny <- min(-1, min(d$rg - 1.96 * d$se))

# Correlation with 95% CI per second phenotype.
# - I() is removed from the error-bar mapping: it did nothing in older ggplot2
#   and from ggplot2 3.5.0 it marks a value as bypassing the scale.
# - guide = 'none' replaces the deprecated guide = F.
ggplot(d, aes(pheno2, rg, colour = pheno2)) +
  geom_point(alpha = 0.5) +
  geom_errorbar(
    aes(ymin = rg - 1.96 * se, ymax = rg + 1.96 * se),
    width = .2, position = position_dodge(.9)
  ) +
  theme_cowplot(font_size = 9) +
  scale_fill_manual(values = colorBlindBlack8[2:4]) +
  scale_colour_manual(guide = 'none', values = colorBlindBlack8[2:4]) +
  xlab('Phenotype') +
  ylab(paste0('R coefficient [95% CI] \n', pheno)) +
  ylim(c(miny, maxy)) +
  theme(legend.position = 'none')

link <- 'https://drive.google.com/drive/folders/101ErlqwE4_iFwZFCTM0QZUtUVwOoOE1L?usp=sharing'

R Markdown ggplot2 From line 397 of reports/meta_qc.Rmd

library(tint)

# knitr defaults for the report: source left untidied, tint version added to
# the cache key, PNG figures at 600 dpi, chunk warnings and messages hidden.
knitr::opts_chunk$set(tidy = FALSE, cache.extra = packageVersion('tint'))
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(dev = 'png', warning = FALSE, message = FALSE, dpi = 600)
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)

# The genetic model is read off the first input path: 'rec' anywhere in it
# means recessive, otherwise dominant.
model <- if (grepl('rec', snakemake@input[[1]])) 'recessive' else 'dominant'

R Markdown From line 10 of reports/nonadditive_qc.Rmd

library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)

R Markdown ggplot2 dplyr data.table tidyr cowplot ggrepel knitr kableExtra From line 31 of reports/nonadditive_qc.Rmd

# Summary statistics (first workflow input, header on the first line).
d <- fread(snakemake@input[[1]], header = TRUE)

# Minor allele frequency: the smaller of EAF and 1 - EAF.
d$MAF <- pmin(d$EAF, 1 - d$EAF)

# Keep variants whose total sample size exceeds a fixed cut-off.
# NOTE(review): 66106 is hard-coded; its origin is not visible here -- confirm.
d <- filter(d, TOTALSAMPLESIZE > 66106)

# Eight-colour palette used by the figures of this report.
colorBlindBlack8 <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Distribution of the minor allele frequency.
ggplot(data = d, mapping = aes(x = MAF)) +
  geom_density(fill = colorBlindBlack8[4]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'MAF')

#d= select(d, -MAF)

R Markdown ggplot2 From line 44 of reports/nonadditive_qc.Rmd

# Distribution of the per-variant total sample size.
ggplot(data = d, mapping = aes(x = TOTALSAMPLESIZE)) +
  geom_density(fill = colorBlindBlack8[3]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'Sample size')

R Markdown ggplot2 From line 62 of reports/nonadditive_qc.Rmd

# Loci from the additive model (second workflow input): chromosome X is coded
# 23 and each locus is represented by the mid-point of its interval.
add_model <- fread(snakemake@input[[2]])
add_model$CHR <- as.numeric(ifelse(add_model$CHR == 'X', '23', add_model$CHR))
add_model$pos <- round((add_model$pos1 + add_model$pos2) / 2)

R Markdown From line 95 of reports/nonadditive_qc.Rmd

The same number of loci is obtained when using a larger radius (1.5Mb).

R Markdown From line 110 of reports/nonadditive_qc.Rmd

We note that we used a naive approach to identify independent loci. This should be interpreted cautiously. We mapped the top genetic variants to the body (TSS or TES) of the nearest protein-coding gene.  

R Markdown From line 114 of reports/nonadditive_qc.Rmd

# Sort by p-value so that, for positions listed more than once, the first
# (smallest p-value) row is the one kept.
df <- arrange(d, pvalue)

#df= group_by(df, CHR, POS) %>% filter(row_number() == 1) %>% ungroup()
df <- df[!duplicated(df[, c('CHR', 'POS')]), ]

# Naive clumping of genome-wide significant variants: along each chromosome a
# new clump starts whenever the gap to the previous significant variant exceeds
# 250 kb; each clump is represented by its smallest p-value (ties broken at
# random). POS is converted to double before lag() because the -Inf default
# cannot be represented as an integer, and newer dplyr casts `default` to the
# type of `x`.
dg <- df %>%
  arrange(CHR, POS) %>%
  filter(pvalue < 5 * 10**-8) %>%
  group_by(CHR) %>%
  mutate(
    d = POS - lag(as.numeric(POS), default = -Inf),
    clumpid = cumsum(d > 250000)
  ) %>%
  group_by(CHR, clumpid) %>%
  filter(rank(pvalue, ties.method = "random") == 1) %>%
  mutate(GENE = ifelse(SYMBOL == '', RSID, SYMBOL)) %>%
  ungroup()

dg <- filter(dg, pvalue < 5 * 10**-8)
dg <- group_by(dg, CHR, POS) %>% filter(row_number() == 1)
#dg$GENE= ifelse(grepl('rs|-', dg$GENE), dg$nearestGene, ifelse(dg$GENE=='', dg$nearestGene, dg$GENE))
# The label finally used for a locus is its nearest gene.
dg$GENE <- dg$nearestGene

# Cumulative genomic coordinate: each chromosome is offset by the summed
# lengths (largest observed position) of the chromosomes before it.
don <- df %>%
  group_by(CHR) %>%
  summarise(chr_len = max(POS)) %>%
  mutate(tot = cumsum(as.numeric(chr_len)) - chr_len) %>%
  select(-chr_len) %>%
  left_join(df, ., by = 'CHR') %>%
  arrange(CHR, POS) %>%
  mutate(BPcum = POS + tot) %>%
  ungroup()

# Mid-point of every chromosome on the cumulative axis, for the tick labels.
axisdf <- don %>% group_by(CHR) %>% summarize(center = (max(BPcum) + min(BPcum)) / 2)
names(axisdf) <- c('CHR', 'center')

# Genome-wide significance line on the -log10 scale.
HC <- -log10(5 * 10**-8)

dg <- dg %>% ungroup() %>% select(ID, GENE, CHR, POS, MAF)
# Only lead variants receive a GENE label in don.
don <- left_join(don, select(dg, ID, GENE), by = 'ID')
names(dg) <- c('ID', 'GENE', 'CHR', 'POS_new', 'MAF')

# Half-width (bp) of the window coloured around each locus.
lims <- 250000

# 0 = not significant; significant variants start as NA and are classified next.
don$disc <- ifelse(don$pvalue > 5 * 10**-8, 0, NA)

# Plain data frames from here on.
don <- data.frame(don)
dg <- data.frame(dg)
add_model <- data.frame(add_model)

# Everything within `lims` of a lead variant is tagged 2 (new discovery) ...
for (i in seq_len(nrow(dg))) {
  don <- mutate(don, disc = ifelse(CHR == as.integer(dg$CHR[i]) & POS >= as.integer(dg$POS_new[i]) - lims & POS <= as.integer(dg$POS_new[i]) + lims, 2, disc))
}

# ... and everything within `lims` of an additive-model locus is then
# re-tagged 1.
for (i in seq_len(nrow(add_model))) {
  don <- mutate(don, disc = ifelse(CHR == as.integer(add_model$CHR[i]) & POS >= as.integer(add_model$pos[i]) - lims & POS <= as.integer(add_model$pos[i]) + lims, 1, disc))
}

# Order rows by class so highlighted points are drawn after the grey ones.
don <- don[order(don$disc, decreasing = FALSE, na.last = TRUE), ]
don$disc <- factor(don$disc, levels = c(0, 1, 2), labels = c('Not significant', 'Additive model discovery', 'New discovery'))

cols <- c('Not significant' = 'grey', 'Additive model discovery' = colorBlindBlack8[4], 'New discovery' = colorBlindBlack8[2])

# Labelled variants show their nearest gene; all others stay unlabelled (NA).
don$GENE <- ifelse(!is.na(don$GENE), don$nearestGene, don$GENE)
ggplot(don) +
    geom_point(data= don, aes(x=BPcum, y= -log10(pvalue), colour= disc), size=0.3) +   # Show all points
theme_cowplot(font_size= 12) + #theme_minimal_hgrid(12, rel_small = -1) + 
#scale_alpha_manual(values= rep(c(1/10, 1/2), 23)) +
scale_colour_manual(values= cols) +
    scale_x_continuous(label = axisdf$CHR, breaks= axisdf$center, expand=c(0,0) ) + # custom X axis
scale_y_continuous(expand= c(0,0)) +
         xlab('Chromosome') +
    ylab('-log10(pvalue)') +
labs(colour= '') +
geom_hline(yintercept= 0, size= 0.5, colour= 'black') +
geom_hline(yintercept= HC, size= 0.5, linetype= 2, colour= '#878787') +
geom_text_repel(data= don, aes(x= BPcum, y= -log10(pvalue), label= GENE), size= 3, hjust= 1, force= 1, vjust= 1, colour= 'black') +
theme(legend.position= 'bottom') +
guides(colour = guide_legend(override.aes = list(size=3)))

R Markdown ggplot2 From line 120 of reports/nonadditive_qc.Rmd

# QQ plot stratified by minor-allele-frequency tertile.
d= mutate(d, maf_tertiles = ntile(MAF, 3))
# Largest MAF in the first and in the second tertile, used as cut-points in the legend.
m1= round(max(d[d$maf_tertiles== 1, 'MAF']), 3)
m2= round(max(d[d$maf_tertiles== 2, 'MAF']), 3)


# Legend labels; the middle tertile lies between the two cut-points, i.e. "m1 < MAF < m2".
d$maf_tertiles= factor(d$maf_tertiles, levels=c("1", "2", "3"), labels=c(paste('MAF<', m1), paste(m1,'< MAF <', m2), paste('MAF>', m2)))

# Expected -log10(p) under the null: rank / group size, computed within each tertile after sorting by p-value.
df= arrange(d, pvalue) %>% group_by(maf_tertiles) %>% mutate(exp1= -log10(seq_along(pvalue)/length(pvalue)))

# Only p < 0.05 is drawn, which keeps the number of plotted points manageable.
ggplot(filter(df, pvalue<0.05), aes(exp1, -log10(pvalue), color= maf_tertiles)) +
  geom_point(size= 0.4) +
scale_color_manual(values= colorBlindBlack8[2:4])+
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
labs(colour="") +
theme_cowplot(font_size= 12) +
xlab('Expected (-log10(p-value))') +
ylab('Observed (-log10(p-value))') +
theme(legend.position= 'bottom') +
guides(colour = guide_legend(override.aes = list(size=3)))

R Markdown ggplot2 From line 201 of reports/nonadditive_qc.Rmd

# Restrict the plotting table to p < 1e-4; d1 holds the subset with p < 5e-8, most significant first.
don= filter(don, pvalue< 1*10**-4)
d1= filter(don, pvalue<5*10**-8)
d1= d1[order(d1$pvalue, decreasing= F), ]
# format() converts the p-values to character strings (3 significant digits); pvalue is text from here on.
d1$pvalue= format(d1$pvalue, digits= 3)
don$pvalue= format(don$pvalue, digits= 3)

# Table of the significant variants whose GENE label is not the empty string.
kable(filter(d1, GENE!= '') %>% select(ID, RSID, GENE, TOTALSAMPLESIZE, EAF, pvalue, Consequence), col.names= c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'P-value', 'Consequence'), digits= 3) #%>% kable_styling(latex_options = c("striped", "scale_down"))

R Markdown From line 230 of reports/nonadditive_qc.Rmd

# Table of the variants in `don` whose IMPACT column is 'HIGH' or 'MODERATE'.
kable(filter(don, (IMPACT== 'HIGH') | (IMPACT== 'MODERATE')) %>% select(ID, RSID, SYMBOL, TOTALSAMPLESIZE, EAF, pvalue, Consequence), col.names= c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'P-value', 'Consequence'), digits= 3)

R Markdown From line 249 of reports/nonadditive_qc.Rmd

library(tint)

# Chunk defaults: no code tidying, cache keyed on the installed 'tint' version,
# PNG figures at 600 dpi, warnings and messages hidden.
#knitr::opts_chunk$set(dpi=300)
knitr::opts_chunk$set(
  tidy = FALSE,
  cache.extra = packageVersion('tint'),
  dev = 'png',
  warning = FALSE,
  message = FALSE,
  dpi = 600
)
options(htmltools.dir.version = FALSE)
pdf.options(useDingbats = TRUE)

# Display labels derived from the path of the first Snakemake input.
cohort <- if (grepl('MOBA', snakemake@input[[1]])) 'MoBa' else '23andMe'
pheno <- if (grepl('GAraw', snakemake@input[[1]])) 'GA days' else 'GA normalized'

R Markdown From line 10 of reports/other_meta.Rmd

library("ggplot2")
library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library("kableExtra")
library(ggrepel)
library("data.table")
options(warn=-1)

R Markdown ggplot2 dplyr data.table tidyr cowplot ggrepel knitr kableExtra From line 30 of reports/other_meta.Rmd

# Colour-blind-friendly palette; the plots below pick individual entries from it.
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Read the summary statistics from the first Snakemake input (file has a header row).
d <- fread(snakemake@input[[1]], header = TRUE)

# Minor allele frequency: effect-allele frequencies above 0.5 are folded back to 1 - EAF.
d$MAF <- ifelse(d$EAF > 0.5, 1 - d$EAF, d$EAF)

# Density of the minor allele frequency.
ggplot(d, aes(MAF)) +
  geom_density(fill = colorBlindBlack8[4]) +
  theme_cowplot(font_size = 12) +
  labs(x = 'MAF')

#d= select(d, -MAF)

R Markdown ggplot2 From line 45 of reports/other_meta.Rmd

# Density of the per-variant total sample size.
ggplot(d, aes(TOTALSAMPLESIZE)) +
  theme_cowplot(font_size = 12) +
  geom_density(fill = colorBlindBlack8[3]) +
  labs(x = 'Sample size')

R Markdown ggplot2 From line 61 of reports/other_meta.Rmd

# Density of the effect-size estimates.
ggplot(d, aes(BETA)) +
  theme_cowplot(font_size = 12) +
  geom_density(fill = colorBlindBlack8[2]) +
  labs(x = 'Beta')

R Markdown ggplot2 From line 72 of reports/other_meta.Rmd

**Effective sample size** for binary phenotypes was calculated as:  
$$\frac{2}{(\frac{1}{Ncases} + \frac{1}{Ncontrols})}$$  

R Markdown From line 89 of reports/other_meta.Rmd

# Chromosome/position pairs; `ge` is created here but not referenced again in the visible chunks.
ge <- data.frame(
  CHR = c(5, 3, 1, 23, 1),
  pos_ge = c(157895049, 127881613, 22470407, 115164770, 22470407)
)

# Variant IDs to look up in the third input; the list depends on whether the
# path of the first input contains 'GAraw'.
topids <- if (grepl('GAraw', snakemake@input[[1]])) {
  c('1:22462111:A:G', '3:128038373:A:C', '5:157896786:C:T', '23:115184372:A:C', '1:228216997:A:C', '3:123112292:C:T', '3:141147414:C:T', '3:155859113:A:G', '23:131268226:C:T', '2:74207357:A:G', '4:174734471:A:G', '6:32589937:A:G', '6:49559793:G:T', '9:16408826:A:G', '20:62692060:A:C')
} else {
  c('1:22414785:G:T', '5:157895049:C:T', '23:115129904:C:T', '1:41955090:A:G', '1:50959262:A:C', '3:14293832:A:G', '3:139004333:A:G', '3:141147414:C:T', '3:155862524:A:G', '3:156697097:A:G', '2:74253326:A:G', '4:55895282:C:T', '4:174739258:A:G', '6:32604898:A:G', '8:75315146:C:G', '9:116935764:C:G')
}

# Rows of the third input matching those IDs; columns get an '_f' suffix so they
# stay distinguishable after being joined to `d`.
fullmeta <- fread(snakemake@input[[3]]) %>%
  filter(ID %in% topids) %>%
  select(ID_f = ID, POS_f = POS, CHR_f = CHR, BETA_f = BETA, SE_f = SE, pvalue_f = pvalue)

R Markdown From line 105 of reports/other_meta.Rmd

The same number of loci is obtained when using a larger radius (1.5Mb).

R Markdown From line 128 of reports/other_meta.Rmd

We note that we used a naive approach to identify independent loci. This should be interpreted cautiously.

R Markdown From line 132 of reports/other_meta.Rmd

# Manhattan plot contrasting loci of this analysis with the top variants of the full meta-analysis.
# Sort by p-value first so that, for rows sharing CHR and POS, the most significant one survives the de-duplication.
df= arrange(d, pvalue)

df= df[!duplicated(df[, c('CHR', 'POS')]), ]

# Naive distance-based clumping of variants with p < 5e-8: within a chromosome a new clump starts
# whenever the gap to the previous significant variant exceeds 250000 bp; one variant with the
# lowest p-value is kept per clump (ties are broken at random) and labelled with nearestGene.
dg= df %>% arrange(CHR, POS) %>% filter(pvalue< 5*10**-8) %>% group_by(CHR) %>%
		mutate(d=POS-lag(POS, default=-Inf), clumpid=cumsum(d>250000)) %>%
		group_by(CHR, clumpid) %>%
		filter(rank(pvalue, ties.method = "random")==1) %>%
		mutate(GENE= nearestGene) %>%
		ungroup()

dg= group_by(dg, CHR, POS) %>% filter(row_number()== 1)


# Cumulative genomic coordinate: `tot` is the summed maximum position of all preceding chromosomes,
# and BPcum = POS + tot places every variant on one continuous x axis.
  don <- df %>%
    group_by(CHR)      %>%
    summarise(chr_len= max(POS)) %>%
    mutate(tot= cumsum(as.numeric(chr_len))-chr_len) %>% # Calculate cumulative position of each chromosome
    select(-chr_len) %>%
    left_join(df, ., by= 'CHR') %>%
    arrange(CHR, POS) %>% # Add a cumulative position of each SNP
    mutate( BPcum=POS+tot) %>%
	 ungroup()

# Tick position of each chromosome: midpoint of its cumulative coordinate range.
  axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
  names(axisdf)= c('CHR', 'center')
# Height of the genome-wide significance line on the -log10 scale.
HC= -log10(5*10**-8)
dg= dg %>% ungroup() %>% select(ID, GENE, CHR, POS, MAF, BETA)
# Only the lead variants in dg receive a GENE label; every other row gets NA.
don= left_join(don, select(dg, ID, GENE), by= 'ID')
names(dg)= c('ID', 'GENE', 'CHR', 'POS_new', 'MAF', 'BETA')
# Join on chromosome only: each variant is repeated once per full-meta top variant on its chromosome.
don= left_join(don, fullmeta, by= c('CHR'= 'CHR_f'))

# Half-width (bp) of the window around each lead position.
lims= 250000

# Pass 1: code 2 = within +/- lims of a full-meta top variant; then keep, per variant ID, the row with the highest code.
don$disc= ifelse((don$POS> don$POS_f - lims) & (don$POS < don$POS_f + lims), 2, 0)
don= don[order(don$disc, decreasing= T, na.last= T), ]
don= group_by(don, ID) %>% filter(row_number() == 1)
don$disc= ifelse(is.na(don$disc), 0, don$disc)

# Same chromosome-only join against the lead variants of this analysis (again one row per variant/lead pair).
don= left_join(don, select(dg, CHR, POS_new), by= 'CHR')

# Pass 2: rows already coded 2 stay 2; otherwise code 1 = within +/- lims of a lead variant of this analysis.
don$disc= ifelse(don$disc== 2, 2, ifelse((don$POS> (don$POS_new - lims)) & (don$POS < (don$POS_new + lims)), 1, 0))
don$disc= ifelse(is.na(don$disc), 0, don$disc)
don= don[order(don$disc, decreasing= T, na.last= T), ]
don= group_by(don, ID) %>% filter(row_number() == 1)
# NOTE(review): don is still grouped by ID from here on; nothing visible below ungroups it.

# Swap codes 1 and 2 so that 1 = near a full-meta top variant and 2 = near a lead of this analysis only.
don$disc= ifelse(is.na(don$disc), 0, ifelse(don$disc== 1, 2, ifelse(don$disc== 2, 1, 0)))
don= don[order(don$disc, decreasing= F, na.last= T), ]

don$disc= factor(don$disc, levels=c(0, 1, 2), labels=c('Not significant', 'Full meta discovery', 'New discovery'))

cols <- c('Not significant'= 'grey', 'Full meta discovery'= colorBlindBlack8[4], 'New discovery'= colorBlindBlack8[2])



ggplot(don) +
    geom_point(data= don, aes(x=BPcum, y= -log10(pvalue), colour= disc), size=0.3) +   # Show all points
theme_cowplot(font_size= 12) + #theme_minimal_hgrid(12, rel_small = -1) + 
#scale_alpha_manual(values= rep(c(1/10, 1/2), 23)) +
scale_colour_manual(values= cols) +
    scale_x_continuous(label = axisdf$CHR, breaks= axisdf$center, expand=c(0,0) ) + # custom X axis
scale_y_continuous(expand= c(0,0)) +
         xlab('Chromosome') +
    ylab('-log10(pvalue)') +
labs(colour= '') +
geom_hline(yintercept= 0, size= 0.5, colour= 'black') +
geom_hline(yintercept= HC, size= 0.5, linetype= 2, colour= '#878787') +
geom_text_repel(data= don, aes(x= BPcum, y= -log10(pvalue), label= GENE), size= 3, hjust= 1, force= 1, vjust= 1, colour= 'black') +
theme(legend.position= 'bottom') +
guides(colour = guide_legend(override.aes = list(size=3)))

R Markdown ggplot2 From line 138 of reports/other_meta.Rmd

# QQ plot stratified by minor-allele-frequency tertile.
d= mutate(d, maf_tertiles = ntile(MAF, 3))
# Largest MAF in the first and in the second tertile, used as cut-points in the legend.
m1= round(max(d[d$maf_tertiles== 1, 'MAF']), 3)
m2= round(max(d[d$maf_tertiles== 2, 'MAF']), 3)


# Legend labels; the middle tertile lies between the two cut-points, i.e. "m1 < MAF < m2".
d$maf_tertiles= factor(d$maf_tertiles, levels=c("1", "2", "3"), labels=c(paste('MAF<', m1), paste(m1,'< MAF <', m2), paste('MAF>', m2)))

# Expected -log10(p) under the null: rank / group size, computed within each tertile after sorting by p-value.
df= arrange(d, pvalue) %>% group_by(maf_tertiles) %>% mutate(exp1= -log10(seq_along(pvalue)/length(pvalue)))

# Only p < 0.05 is drawn, which keeps the number of plotted points manageable.
ggplot(filter(df, pvalue<0.05), aes(exp1, -log10(pvalue), color= maf_tertiles)) +
  geom_point(size= 0.4) +
scale_color_manual(values= colorBlindBlack8[2:4])+
  geom_abline(intercept = 0, slope = 1, alpha = .5) +
labs(colour="") +
theme_cowplot(font_size= 12) +
xlab('Expected (-log10(p-value))') +
ylab('Observed (-log10(p-value))') +
theme(legend.position= 'bottom') +
guides(colour = guide_legend(override.aes = list(size=3)))

R Markdown ggplot2 From line 214 of reports/other_meta.Rmd

# Restrict the plotting table to p < 1e-4; d1 holds the subset with p < 5e-8.
don= filter(don, pvalue< 1*10**-4)
d1= filter(don, pvalue<5*10**-8)
# format() converts the p-values to character strings (3 significant digits); pvalue is text from here on.
d1$pvalue= format(d1$pvalue, digits= 3)
don$pvalue= format(don$pvalue, digits= 3)
# Table of the significant variants whose GENE label is not the empty string.
kable(filter(d1, GENE!= '') %>% select(ID, RSID, GENE, TOTALSAMPLESIZE, EAF, BETA, SE, pvalue), col.names= c('CHR:POS:REF:EFF', 'RSID', 'Gene', 'N', 'EAF', 'Beta', 'SE', 'P-value'), digits= 3)
# NOTE(review): `caption` is only assigned here; it is not passed to kable() in the visible code.
caption= 'As expected, beta increases with decreasing minor allele frequency.'

R Markdown From line 243 of reports/other_meta.Rmd

# Pair each full-meta top variant with the estimate of the same variant in the current data set.
x= inner_join(fullmeta, d, by= c('ID_f'= 'ID'))


# Effect size in the full meta-analysis (x axis) against the effect size without `cohort` (y axis).
# alpha belongs to the point layer (ggplot() itself ignores it); the size legend is
# switched off with guide= 'none', and text labels are kept out of any legend.
(ggplot(x, aes(BETA_f, BETA, size= abs(BETA))) +
geom_point(colour= colorBlindBlack8[4], alpha= 0.7) +
theme_cowplot(font_size= 12) +
scale_size_continuous(name= 'Absolute Beta', guide= 'none') +
geom_text_repel(data= x, aes(label= RSID), show.legend= FALSE) +
xlab('Effect size full meta-analysis') +
ylab(paste('Effect size without', cohort))+
geom_abline(intercept= 0, slope= 1, linetype= 'dashed', colour= 'grey')) %>% print()

R Markdown From line 258 of reports/other_meta.Rmd

# -log10(p) in the full meta-analysis (x axis) against -log10(p) without `cohort` (y axis).
# alpha belongs to the point layer (ggplot() itself ignores it); text labels are kept out of any legend.
(ggplot(x, aes(-log10(pvalue_f), -log10(pvalue))) +
geom_point(colour= colorBlindBlack8[4], alpha= 0.7) +
theme_cowplot(font_size= 12) +
geom_text_repel(data= x, aes(label= RSID), show.legend= FALSE) +
xlab('-log10(pvalue) full meta-analysis') +
ylab(paste('-log10(pvalue) without', cohort))+
geom_abline(intercept= 0, slope= 1, linetype= 'dashed', colour= 'grey')) %>% print()

R Markdown From line 279 of reports/other_meta.Rmd

run:
	# Reference-panel sites: chromosome, position, alleles and allele frequency.
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['#CHROM', 'POS', 'REF', 'ALT', 'AF_EXCLUDING_1000G'])
	# Rename by name: read_csv returns the columns in file order, not in usecols order.
	d= d.rename(columns= {'#CHROM': 'CHR', 'REF': 'oa', 'ALT': 'ea', 'AF_EXCLUDING_1000G': 'eaf'})
	# Keep common variants only (0.05 < eaf < 0.95); copy so the assignment below acts on an independent frame.
	d= d.loc[((d.eaf> 0.05) & (d.eaf<0.95)), :].copy()
	# Flip the frequency when the other allele sorts after the effect allele.
	d['eaf']= np.where(d.oa> d.ea, 1 - d.eaf, d.eaf)
	# Random subset of at most one million rows (no seed is set, so the draw is not reproducible).
	d= d.sample(n= min(1000000, d.shape[0]))
	d.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 11 of reports/Snakefile

script:
	'file_level_qc.Rmd'

SnakeMake From line 27 of reports/Snakefile

	script:
                'file_level_qc.Rmd'

SnakeMake From line 38 of reports/Snakefile

	script:
                'file_level_qc.Rmd'

SnakeMake From line 49 of reports/Snakefile

	script:
                'file_level_qc.Rmd'

SnakeMake From line 60 of reports/Snakefile

script:
	'all_files_QC.Rmd'

SnakeMake From line 70 of reports/Snakefile

script:
        'all_files_QC.Rmd'

SnakeMake From line 80 of reports/Snakefile

script:
	'all_files_QC.Rmd'

SnakeMake From line 91 of reports/Snakefile

script:
        'all_files_QC.Rmd'

SnakeMake From line 102 of reports/Snakefile

script:
	'meta_qc.Rmd'

SnakeMake From line 117 of reports/Snakefile

run:
	# Per-SNP colocalization results; within each locus the top SNP is the one with the highest SNP.PP.H4.
	df= pd.read_csv(input[0], sep= '\t', header= 0)
	df.sort_values('SNP.PP.H4', ascending= False, inplace= True)
	d= df.groupby('locus').head(1).reset_index()
	# Trait name parsed from the path of the second input ('...pph_<trait>.txt').
	df['trait']= input[1].split('pph_')[1].replace('.txt', '')
	# Direction of effect at the top SNP, from the signs of the two z-scores.
	d['direction']= np.where((d['z.df1'] > 0) & (d['z.df2'] > 0), 'Positive', np.where((d['z.df1'] < 0) & (d['z.df2'] < 0), 'Negative', 'Opposite'))
	# Per-locus posterior probabilities, annotated with the top SNP of each locus.
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	x['trait']= input[1].split('pph_')[1].replace('.txt', '')
	x= pd.merge(x, d[['snp', 'locus', 'SNP.PP.H4', 'direction']])
	# Drop loci where any of the five posteriors (H0 to H4, including PP.H3.abf) is exactly 0.
	posterior_cols= ['PP.H0.abf', 'PP.H1.abf', 'PP.H2.abf', 'PP.H3.abf', 'PP.H4.abf']
	x= x.loc[(x[posterior_cols] != 0).all(axis= 1), :]
	x.dropna(axis= 0, inplace= True)
	x.to_csv(output[0], sep= '\t', header= True, index= False)
	df.to_csv(output[1], sep= '\t', header= True, index= False)

SnakeMake From line 129 of reports/Snakefile

script:
	'coloc.Rmd'

SnakeMake From line 151 of reports/Snakefile

run:
	# Variant-level results (gzip) and the region table with pos1/pos2 bounds.
	sumstats= pd.read_csv(input[0], sep= '\t', header= 0, compression= 'gzip')
	regions= pd.read_csv(input[1], sep= '\t', header= 0)
	# Recode chromosome X as 23 so that CHR can be cast to integer for the merge.
	regions['CHR']= np.where(regions.CHR== 'X', '23', regions.CHR)
	regions['CHR']= regions.CHR.astype(str).astype(int)
	# Pair every variant with every region on its chromosome, then keep variants strictly inside a region.
	paired= pd.merge(sumstats, regions, on= 'CHR')
	inside= (paired.POS> paired.pos1) & (paired.POS < paired.pos2)
	paired= paired.loc[inside, :]
	# After sorting by p-value, take the first entry per region gene and write the ID column only.
	paired= paired.sort_values('pvalue', ascending= True)
	top= paired.groupby('nearestGene_y').first()
	top.to_csv(output[0], sep= '\t', header= False, index= False, columns= ['ID'])

SnakeMake From line 161 of reports/Snakefile

	shell:
                '''
                grep -f {input[0]} {input[1]} > {output[0]} || true
                touch {output[0]}
                '''

SnakeMake From line 179 of reports/Snakefile

shell:
        '''
        grep -f {input[0]} {input[1]} > {output[0]} || true
        touch {output[0]}
        '''

SnakeMake From line 192 of reports/Snakefile

shell:
        '''
        grep -f {input[0]} {input[1]} > {output[0]} || true
        touch {output[0]}
        '''

SnakeMake From line 205 of reports/Snakefile

        shell:
                '''
		grep -f {input[0]} {input[1]} > {output[0]} || true
		touch {output[0]}
		'''

SnakeMake From line 217 of reports/Snakefile

run:
	# Stack the headerless per-cohort top-variant tables, tagging each row with its cohort.
	cohort_tables= []
	for path in input:
		table= pd.read_csv(path, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf'])
		# Cohort label: the path part following 'temp/', without the '_topvariants.txt' suffix.
		table['cohort']= path.split('temp/')[1].replace('_topvariants.txt', '')
		cohort_tables.append(table)
	pd.concat(cohort_tables).to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 230 of reports/Snakefile

run:
        # Stack the headerless per-cohort top-variant tables, tagging each row with its cohort.
        cohort_tables= []
        for path in input:
                table= pd.read_csv(path, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf'])
                # Cohort label: the path part following 'temp/', without the '_topvariants.txt' suffix.
                table['cohort']= path.split('temp/')[1].replace('_topvariants.txt', '')
                cohort_tables.append(table)
        pd.concat(cohort_tables).to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 245 of reports/Snakefile

run:
        # Stack the headerless per-cohort top-variant tables, tagging each row with its cohort.
        cohort_tables= []
        for path in input:
                table= pd.read_csv(path, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf'])
                # Cohort label: the path part following 'temp/', without the '_topvariants.txt' suffix.
                table['cohort']= path.split('temp/')[1].replace('_topvariants.txt', '')
                cohort_tables.append(table)
        pd.concat(cohort_tables).to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 260 of reports/Snakefile

run:
        df_list= list()
        for infile in input:
                d= pd.read_csv(infile, sep= '\t', header= None, names= ['SNP', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue', 'STRAND', 'maf'])
                d['cohort']= infile.split('temp/')[1].replace('_topvariants.txt', '')

SnakeMake From line 275 of reports/Snakefile

script:
	'forest_plots.Rmd'

SnakeMake From line 292 of reports/Snakefile

script:
	'other_meta.Rmd'

SnakeMake From line 303 of reports/Snakefile

script:
        'other_meta.Rmd'

SnakeMake From line 314 of reports/Snakefile

script:
        'other_meta.Rmd'

SnakeMake From line 326 of reports/Snakefile

script:
        'nonadditive_qc.Rmd'

SnakeMake From line 337 of reports/Snakefile

script:
        'file_level_qc.Rmd'

SnakeMake From line 348 of reports/Snakefile

script:
        'file_level_qc.Rmd'

SnakeMake From line 359 of reports/Snakefile

script:
        'file_level_qc.Rmd'

SnakeMake From line 370 of reports/Snakefile

script:
        'file_level_qc.Rmd'

SnakeMake From line 381 of reports/Snakefile

library(data.table)
library(dplyr)
library(metafor)

# Fixed-effects meta-analysis of the cohort-level estimates for one outcome.
#
# pheno: value of the `outcome` column to analyse.
# Reads the global data frame `d`, which is assembled below from the three cohort files.
# Returns a data frame with one row per haplotype ('MT', 'MNT', 'PT') holding the pooled
# beta, its standard error, p-value, 95% interval bounds and the heterogeneity p-value.
funk= function(pheno) {

d_temp= d[d$outcome== pheno, ]

df_list= lapply(c('MT', 'MNT', 'PT'), function(i){

df_temp= d_temp[d_temp$haplotype== i, ]
# Log how many cohort-level estimates enter this particular meta-analysis.
print(nrow(df_temp))
res.FE= rma(yi= beta, sei= se,  data= df_temp, method= "FE")

df= data.frame(beta= res.FE$beta, se= res.FE$se, pvalue= res.FE$pval, lo95= res.FE$ci.lb, up95= res.FE$ci.ub, het_pvalue= res.FE$QEp, outcome= pheno, haplotype= i)

print(df)

return(df)

})

df= do.call('rbind', df_list)

return(df)

}

# Cohort-level results: one file per cohort, stacked row-wise.
moba= fread(snakemake@input[[1]])
decode= fread(snakemake@input[[2]])
hunt= fread(snakemake@input[[3]])

d= rbind(moba, decode)
d= rbind(d, hunt)

# One meta-analysis per outcome, combined into a single table.
df_list= lapply(unique(d$outcome), funk)

x= do.call('rbind', df_list)

# Total sample size per haplotype and outcome, summed over cohorts.
df= group_by(d, haplotype, outcome) %>% summarize(n= sum(n))

x= inner_join(x, df, by= c('haplotype', 'outcome'))

fwrite(x, snakemake@output[[1]], sep= '\t')

R dplyr data.table metafor From line 1 of repr_traits_PGS/PGS_fetal_growth_meta.R

library(data.table)
library(dplyr)
library(metafor)

# Fixed-effects meta-analysis of the cohort-level estimates for one exposure.
#
# pheno: value of the `exposure` column to analyse.
# Reads the global data frame `d`, which is assembled below from the three cohort files.
# Returns a data frame with one row per haplotype ('MT', 'MNT', 'PT') holding the pooled
# beta, its standard error, p-value, 95% interval bounds and the heterogeneity p-value.
funk= function(pheno) {

d_temp= d[d$exposure== pheno, ]

df_list= lapply(c('MT', 'MNT', 'PT'), function(i){

df_temp= d_temp[d_temp$haplotype== i, ]
# Log how many cohort-level estimates enter this particular meta-analysis.
print(nrow(df_temp))
res.FE= rma(yi= beta, sei= se,  data= df_temp, method= "FE")

df= data.frame(beta= res.FE$beta, se= res.FE$se, pvalue= res.FE$pval, lo95= res.FE$ci.lb, up95= res.FE$ci.ub, het_pvalue= res.FE$QEp, exposure= pheno, haplotype= i)

print(df)

return(df)

})

df= do.call('rbind', df_list)

return(df)

}

# Cohort-level results: one file per cohort, stacked row-wise.
moba= fread(snakemake@input[[1]])
decode= fread(snakemake@input[[2]])
hunt= fread(snakemake@input[[3]])

d= rbind(moba, decode)
d= rbind(d, hunt)

# One meta-analysis per exposure, combined into a single table.
df_list= lapply(unique(d$exposure), funk)

x= do.call('rbind', df_list)

# Total sample size per haplotype and exposure, summed over cohorts.
df= group_by(d, haplotype, exposure) %>% summarize(n= sum(n))

x= inner_join(x, df, by= c('haplotype', 'exposure'))

fwrite(x, snakemake@output[[1]], sep= '\t')

R dplyr data.table metafor From line 1 of repr_traits_PGS/PGS_repr_pheno_meta.R

script:
	'PGS_repr_pheno_meta.R'

SnakeMake From line 13 of repr_traits_PGS/Snakefile

script:
        'PGS_fetal_growth_meta.R'

SnakeMake From line 26 of repr_traits_PGS/Snakefile

library(data.table)
library(dplyr)
library(DESeq2)
library(tidyverse)

# Differential expression between conditions from per-sample quantification files.
df_list= list()

# Quantification files: every file under params[[1]] whose name matches 'CL'.
flist= list.files(snakemake@params[[1]], 'CL', full.names= TRUE)

# Fail with a clear message instead of iterating over 1:0 when nothing matches.
if (length(flist) == 0) {
stop('No files matching "CL" found in ', snakemake@params[[1]], call.= FALSE)
}

for (i in seq_along(flist)){
d= fread(flist[i])
# NOTE(review): the sample name is the 10th '/'-separated path component, so it depends on
# the directory depth of params[[1]] — confirm before relocating the input directory.
cname= unlist(strsplit(flist[i], '/'))[10]
d= select(d, Name, NumReads)

names(d)= c('Name', cname)
df_list[[i]]= d

}

# One row per transcript/gene Name, one read-count column per sample.
x= df_list %>% reduce(left_join, by = "Name")

# NOTE(review): columns 2:7 hard-code exactly six samples.
cols= data.frame(row.names= colnames(x)[2:7], condition= colnames(x)[2:7], subject= colnames(x)[2:7])

# Sample names look like '<subject>-...-<condition>.txt': the last '-' field (minus the
# literal '.txt') is the condition, the first one the subject.
cols$condition= gsub('.txt', '', vapply(strsplit(cols$condition, '-'), tail, character(1), 1), fixed= TRUE)
cols$subject= vapply(strsplit(cols$subject, '-'), head, character(1), 1)
cts= as.matrix(x[, 2:7])
row.names(cts)= x$Name

# Paired design: condition effect adjusted for subject; counts are rounded to integers for DESeq2.
dds <- DESeqDataSetFromMatrix(countData = round(cts),
                              colData = cols,
                              design= ~ subject + condition)

dds= DESeq(dds)

res= results(dds, name="condition_unt_vs_dec")

res= data.frame(res)
res$geneid= row.names(res)

fwrite(res, snakemake@output[[1]], sep= '\t')

R tidyverse dplyr data.table From line 1 of stromal_cells/rna_seq_dif.R

script:
	'rna_seq_dif.R'

SnakeMake From line 16 of stromal_cells/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'ID'])

SnakeMake From line 28 of stromal_cells/Snakefile

library("dplyr")
library("knitr")
library("tidyr")
library(cowplot)
library(ggrepel)
library("data.table")
library('showtext')

colorBlindBlack8= c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Extract the trait name from summary-statistics paths: the part following `marker`,
# with the literal '.txt.sumstats.gz' suffix removed. Paths without `marker` give NA.
trait_from_path= function(paths, marker) {
  after_marker= vapply(strsplit(paths, marker, fixed= TRUE), function(p) p[2], character(1))
  gsub('.txt.sumstats.gz', '', after_marker, fixed= TRUE)
}

# Two genetic-correlation result tables with path columns p1 and p2.
x= fread(snakemake@input[[1]])

x$p1= trait_from_path(x$p1, 'LDscore/')
x$p2= trait_from_path(x$p2, 'LDSC/')

x1= fread(snakemake@input[[2]])

x1$p1= trait_from_path(x1$p1, 'LDscore/')
x1$p2= trait_from_path(x1$p2, 'LDSC/')
d= rbind(x, x1)


# Display label for every known trait code; any other non-missing code is labelled 'Endometriosis'.
trait_labels= c(
  GAraw= 'Maternal gestational duration',
  miscarriage= 'Miscarriage',
  GA_fetal= 'GA fetal effect',
  BW_maternal= 'Maternal BW',
  AFB= 'Age at first birth',
  AMenarche= 'Age at menarche',
  AMenopause= 'Age at menopause',
  NLB= 'Number of live births',
  Testosterone_fem= 'Testosterone (women)',
  SHBG_fem= 'SHBG (women)',
  SHBG_male= 'SHBG (men)',
  CBAT_fem= 'CBAT (women)',
  CBAT_male= 'CBAT (men)',
  Oestradiol_fem= 'Oestradiol (women)',
  POP= 'Pelvic Organ Prolapse',
  Testosterone_male= 'Testosterone (men)',
  leiomyoma_uterus= 'Leiomyoma uterus',
  BW_fetal= 'Fetal',
  BW_fetal_effect= 'Fetal only',
  Preeclampsia= 'Pre-eclampsia',
  BW_maternal_effect= 'Maternal only',
  PCOS= 'Polycistic ovary syndrome'
)
mapped= unname(trait_labels[d$p2])
d$trait= ifelse(is.na(d$p2), NA_character_, ifelse(is.na(mapped), 'Endometriosis', mapped))


# Keep the male traits only. Match the literal '(men)' tag: a bare 'men' pattern would also
# catch 'Age at menarche' and 'Age at menopause', and '(women)' does not contain '(men)'.
d= filter(d, grepl('(men)', trait, fixed= TRUE))





fwrite(d, snakemake@output[[1]], sep= '\t')

R dplyr data.table tidyr cowplot ggrepel knitr showtext From line 1 of tables/genetic_correlations_males.R

run:
	# Rows of the first input restricted to two specific variant IDs.
	extra_ids= ['3:156697097:A:G', '5:158058432:G:T']
	extra= pd.read_csv(input[0], sep='\t', header= 0)
	extra= extra.loc[extra.ID.isin(extra_ids), :]
	# One table per phenotype, each labelled in a 'pheno' column.
	labelled= []
	for path, label in [(input[1], 'Gestational duration'), (input[2], 'Preterm delivery'), (input[3], 'Post term delivery')]:
		table= pd.read_csv(path, sep= '\t', header= 0)
		table['pheno']= label
		labelled.append(table)
	# Stack in the order: gestational duration, extra variants, preterm, post-term; then sort by ID.
	d= pd.concat([labelled[0], extra, labelled[1], labelled[2]])
	d= d.sort_values('ID')
	d.to_csv(output[0], header= True, index= False, sep= '\t')

SnakeMake From line 10 of tables/Snakefile

run:
	# Attach the 'pheno' label of each ID (first input) to the matching rows of the second input.
	labels= pd.read_csv(input[0], sep= '\t', header= 0)[['ID', 'pheno']]
	values= pd.read_csv(input[1], sep= '\t', header= 0)
	labels.merge(values, on= 'ID', how= 'inner').to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 34 of tables/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)

SnakeMake From line 49 of tables/Snakefile

run:
        d= pd.read_csv(input[1], sep= '\t', header= 0)
        top= pd.read_csv(input[2], sep= '\t', header= 0, usecols= ['ID', 'nearestGene', 'RSID'])
        d= pd.merge(d, top, left_on= 'rsid', right_on= 'RSID')

SnakeMake From line 67 of tables/Snakefile

shell:
	'''
	cp {input[0]} {output[0]}
	'''

SnakeMake From line 82 of tables/Snakefile

run:
	# Two result tables; rows of the second are tagged with eqtl_data = 'iPSC' before stacking.
	first= pd.read_csv(input[0], sep= '\t', header= 0)
	second= pd.read_csv(input[1], sep= '\t', header= 0)
	second['eqtl_data']= 'iPSC'
	stacked= pd.concat([first, second])
	# Headerless gene annotation: only the symbol and the gene ID (version suffix after '.' removed) are read.
	genes= pd.read_csv(input[2], sep= '\t', header= None, names= ['chr', 'pos1', 'pos2', 'Gene_symbol', 'EID'], usecols= ['Gene_symbol', 'EID'])
	genes['EID']= genes['EID'].str.split('.').str[0]
	# Add the gene symbol by matching the 'gene' column against the unversioned gene ID.
	stacked.merge(genes, left_on= 'gene', right_on= 'EID').to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 95 of tables/Snakefile

script:
	'genetic_correlations_males.R'

SnakeMake From line 113 of tables/Snakefile

run:
	# One row per input file: cohort name (path part after 'filtered/', without '.txt') and its largest N.
	per_cohort= []
	for path in input:
		counts= pd.read_csv(path, sep= '\t', header= 0, usecols= ['CHR', 'N'])
		cohort_name= path.split('filtered/')[1].replace('.txt', '')
		per_cohort.append(pd.DataFrame({'cohort': cohort_name, 'N': counts.N.max()}, index= [0]))
	pd.concat(per_cohort).to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 122 of tables/Snakefile

run:
	# Headerless association table keyed by 'CHR:POS:REF:EFF' identifiers.
	assoc= pd.read_csv(input[0], sep= '\t', header= None, names= ['ID', 'beta', 'se', 'pvalue', 'trait'])
	# Split the identifier into its four fields and write chromosome 23 as 'X'.
	id_parts= assoc.ID.str.split(':', expand= True)
	assoc[['CHR', 'POS', 'REF', 'EFF']]= id_parts
	assoc['CHR']= np.where(assoc.CHR== '23', 'X', assoc.CHR)
	assoc.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 138 of tables/Snakefile

run:
	d= pd.read_csv(input[0], sep= '\t', header= 0)
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	# Renumber the rows: both files start at label 0, and the label arithmetic below needs unique labels.
	d= pd.concat([d, x], ignore_index= True)
	# Restrict to the female hormone traits (including the '_cluster' variants).
	horm= ['CBAT_fem', 'SHBG_fem', 'Testosterone_fem', 'SHBG_fem_cluster', 'Testosterone_fem_cluster']
	df= d.loc[d.trait.isin(horm), :]
	ivw= df.loc[df.method== 'IVW', :]
	egger= df.loc[df.method== 'MR-Egger', :]
	# Row directly following each 'MR-Egger' row.
	# NOTE(review): presumably the MR-Egger intercept — confirm against the layout of the input files.
	egger_int= df.loc[np.array(df.index[df.method== 'MR-Egger' ] + 1), :]
	d= pd.concat([ivw, egger, egger_int])
	d.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 151 of tables/Snakefile

run:
	# Heterogeneity statistics per marker; MarkerName is split on ':' into five fields.
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'HetISq', 'HetPVal'])
	d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
	x= pd.read_csv(input[1], sep= '\t', header= 0)
	# Order the two alleles alphabetically: np.where broadcasts over the stacked pairs, so the
	# first returned row is the smaller allele (new REF) and the second the larger (new EFF).
	d['REF'], d['EFF']= np.where(d.REF> d.EFF, [d.EFF, d.REF], [d.REF, d.EFF])
	# Build the 'CHR:POS:REF:EFF' identifier with the alleles in alphabetical order.
	d['ID']= np.where(d.REF> d.EFF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.EFF + ':' + d.REF, d.CHR.apply(str) + ':' + d.POS.apply(str) + ':' + d.REF + ':' + d.EFF)
	# Keep only markers whose ID also occurs in the second input.
	d= pd.merge(d, x, on= 'ID')
	d.to_csv(output[0], sep= '\t', header= True, index= False, columns= ['ID', 'HetISq', 'HetPVal'])

SnakeMake From line 170 of tables/Snakefile

run:
	# Meta-analysis table: upper-case the alleles and keep variants with more than half of the largest sample size.
	meta= pd.read_csv(input[0], sep= '\t', header= 0)
	for allele_col in ['Allele1', 'Allele2']:
		meta[allele_col]= meta[allele_col].str.upper()
	half_max_n= (meta['TOTALSAMPLESIZE'].max())/ 2
	meta= meta.loc[(meta.TOTALSAMPLESIZE> half_max_n), :]
	# MarkerName has five ':'-separated fields; chromosome and position become integers.
	meta[['CHR', 'POS', 'REF','EFF', 'SNP']]= meta['MarkerName'].str.split(':', expand= True)
	for int_col in ['CHR', 'POS']:
		meta[int_col]= meta[int_col].astype(str).astype(int)
	meta= meta[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
	meta.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
	# Genome-wide significant variants, most significant first, one row per position.
	hits= meta.loc[meta.pvalue< 5*10**-8, :].sort_values(by= 'pvalue', ascending= True).drop_duplicates(subset= ['CHR', 'POS'], keep= 'first')
	# Greedy selection per chromosome: take the most significant remaining variant as a lead,
	# then discard everything within 1.5 Mb of it before moving on.
	leads= []
	for chrom in set(hits.CHR):
		remaining= hits.loc[hits.CHR== chrom, :]
		for pos in remaining.POS.values:
			if pos not in remaining.POS.values:
				continue
			leads.append(remaining.loc[remaining.POS== pos, :])
			remaining= remaining.loc[(remaining.POS < pos - (1.5*10**6)) | (remaining.POS> pos + (1.5 * 10**6)), :]
	# One +/- 1.5 Mb region per lead variant; chromosome 23 is written as 'X'.
	x= pd.concat(leads)
	x['pos1']= x.POS - 1.5*10**6
	x['pos2']= x.POS + 1.5*10**6
	x['CHR']= x.CHR.astype(str)
	x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
	x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2'])

SnakeMake From line 7 of top_regions/Snakefile

run:
        # Genome-wide significant variants, most significant first, one row per position.
        sumstats= pd.read_csv(input[0], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene'])
        hits= sumstats.loc[sumstats.pvalue< 5*10**-8, :].sort_values(by= 'pvalue', ascending= True).drop_duplicates(subset= ['CHR', 'POS'], keep= 'first')
        # Greedy selection per chromosome: take the most significant remaining variant as a lead,
        # then discard everything within 1.5 Mb of it before moving on.
        leads= []
        for chrom in set(hits.CHR):
                remaining= hits.loc[hits.CHR== chrom, :]
                for pos in remaining.POS.values:
                        if pos not in remaining.POS.values:
                                continue
                        leads.append(remaining.loc[remaining.POS== pos, :])
                        remaining= remaining.loc[(remaining.POS < pos - (1.5*10**6)) | (remaining.POS> pos + (1.5 * 10**6)), :]
        # One +/- 1.5 Mb region per lead variant; chromosome 23 is written as 'X'.
        x= pd.concat(leads)
        x['pos1']= x.POS - 1.5*10**6
        x['pos2']= x.POS + 1.5*10**6
        x['CHR']= x.CHR.astype(str)
        x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
        x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene'])

SnakeMake From line 43 of top_regions/Snakefile

run:
	# Variant-level results and the region table (pos1/pos2 bounds plus nearestGene).
	sumstats= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'ID', 'BETA', 'SE', 'pvalue'])
	regions= pd.read_csv(input[1], sep= '\t', header= 0)
	# Recode chromosome X as 23 so that CHR is an integer in both tables.
	regions['CHR']= np.where(regions.CHR== 'X', '23', regions.CHR)
	regions['CHR']= regions.CHR.apply(int)
	# Pair variants with the regions on their chromosome and keep those inside a region (bounds included).
	paired= pd.merge(sumstats, regions, on= 'CHR')
	within= (paired.POS>= paired.pos1) & (paired.POS <= paired.pos2)
	paired= paired.loc[within, :]
	# Most significant variant per region gene.
	paired= paired.sort_values('pvalue', ascending= True)
	top= paired.groupby('nearestGene').head(1)
	top= top[['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'nearestGene', 'ID', 'BETA', 'SE', 'pvalue']]
	top.to_csv(output[0], sep= '\t', header= True, index= False)

SnakeMake From line 72 of top_regions/Snakefile

run:
	# Same region selection for the first two inputs; input[k] is written to output[k].
	for k in range(2):
		# Genome-wide significant variants, most significant first, one row per position.
		sumstats= pd.read_csv(input[k], sep= '\t', compression= 'gzip', usecols= ['CHR', 'POS', 'pvalue', 'nearestGene'])
		hits= sumstats.loc[sumstats.pvalue< 5*10**-8, :].sort_values(by= 'pvalue', ascending= True).drop_duplicates(subset= ['CHR', 'POS'], keep= 'first')
		# Greedy selection per chromosome: take the most significant remaining variant as a lead,
		# then discard everything within 1.5 Mb of it before moving on.
		leads= []
		for chrom in set(hits.CHR):
			remaining= hits.loc[hits.CHR== chrom, :]
			for pos in remaining.POS.values:
				if pos not in remaining.POS.values:
					continue
				leads.append(remaining.loc[remaining.POS== pos, :])
				remaining= remaining.loc[(remaining.POS < pos - (1.5*10**6)) | (remaining.POS> pos + (1.5 * 10**6)), :]
		# One +/- 1.5 Mb region per lead variant; chromosome 23 is written as 'X'.
		x= pd.concat(leads)
		x['pos1']= x.POS - 1.5*10**6
		x['pos2']= x.POS + 1.5*10**6
		x['CHR']= x.CHR.astype(str)
		x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
		x.to_csv(output[k], sep='\t', header= True, index= False, columns= ['CHR', 'pos1', 'pos2', 'nearestGene'])

SnakeMake From line 93 of top_regions/Snakefile

run:
	# input[k] holds variant-level results and input[k+2] the matching region table; result goes to output[k].
	for k in range(2):
		sumstats= pd.read_csv(input[k], sep= '\t', header= 0, usecols= ['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'ID', 'pvalue'])
		regions= pd.read_csv(input[k+2], sep= '\t', header= 0)
		# Recode chromosome X as 23 so that CHR is an integer in both tables.
		regions['CHR']= np.where(regions.CHR== 'X', '23', regions.CHR)
		regions['CHR']= regions.CHR.apply(int)
		# Pair variants with the regions on their chromosome and keep those inside a region (bounds included).
		paired= pd.merge(sumstats, regions, on= 'CHR')
		within= (paired.POS>= paired.pos1) & (paired.POS <= paired.pos2)
		paired= paired.loc[within, :]
		# Most significant variant per region gene.
		paired= paired.sort_values('pvalue', ascending= True)
		top= paired.groupby('nearestGene').head(1)
		top= top[['CHR', 'POS', 'EAF', 'TOTALSAMPLESIZE', 'REF', 'EFF', 'RSID', 'nearestGene', 'ID', 'pvalue']]
		top.to_csv(output[k], sep= '\t', header= True, index= False)

SnakeMake From line 126 of top_regions/Snakefile

run:
        # Genome-wide significant variants, most significant first, one row per position.
        sumstats= pd.read_csv(input[0], sep= '\t',usecols= ['CHR', 'POS', 'pvalue'])
        hits= sumstats.loc[sumstats.pvalue< 5*10**-8, :].sort_values(by= 'pvalue', ascending= True).drop_duplicates(subset= ['CHR', 'POS'], keep= 'first')
        # Greedy selection per chromosome: take the most significant remaining variant as a lead,
        # then discard everything within 1.5 Mb of it before moving on.
        leads= []
        for chrom in set(hits.CHR):
                remaining= hits.loc[hits.CHR== chrom, :]
                for pos in remaining.POS.values:
                        if pos not in remaining.POS.values:
                                continue
                        leads.append(remaining.loc[remaining.POS== pos, :])
                        remaining= remaining.loc[(remaining.POS < pos - (1.5*10**6)) | (remaining.POS> pos + (1.5 * 10**6)), :]
        # One +/- 1.5 Mb region per lead variant (lead position kept); chromosome 23 is written as 'X'.
        x= pd.concat(leads)
        x['pos1']= x.POS - 1.5*10**6
        x['pos2']= x.POS + 1.5*10**6
        x['CHR']= x.CHR.astype(str)
        x['CHR']= np.where(x.CHR== '23', 'X', x.CHR)
        x.to_csv(output[0], sep='\t', header= True, index= False, columns= ['CHR', 'POS', 'pos1', 'pos2'])

SnakeMake From line 145 of top_regions/Snakefile

import pandas as pd
import numpy as np

# Build an ID -> name table (ID is 'chrom:pos:allele:allele' with the two
# alleles in alphabetical order) from the table in snakemake.input[0],
# complemented with the entries of snakemake.input[1] that are not already
# covered.
dbsnp= pd.read_csv(snakemake.input[0], sep= '\t', header=0, compression= 'gzip')

# Drop rows whose chromosome name contains '_'.
plain_chrom= ~dbsnp['#chrom'].str.contains('_')
dbsnp= dbsnp.loc[plain_chrom, :]

# First and second comma-separated alternative alleles.
alt_alleles= dbsnp.alts.str.split(',')
dbsnp['a1']= alt_alleles.str[0]
dbsnp['a2']= alt_alleles.str[1]
dbsnp['#chrom']= dbsnp['#chrom'].str.replace('chr', '')

# When ref is shorter than alts the variant is coded as ref 'I' / a1 'D' and
# positioned at chromStart; when ref is longer it is coded as ref 'D' / a1 'I'.
# The second length comparison is deliberately done after the first recoding.
ref_shorter= dbsnp.ref.str.len() < dbsnp.alts.str.len()
dbsnp['POS']= np.where(ref_shorter, dbsnp.chromStart, dbsnp.chromEnd)
dbsnp['ref']= np.where(ref_shorter, 'I', dbsnp.ref)
ref_longer= dbsnp.ref.str.len() > dbsnp.alts.str.len()
dbsnp['ref']= np.where(ref_longer, 'D', dbsnp.ref)
dbsnp['a1']= np.where(dbsnp.ref== 'I', 'D', dbsnp.a1)
dbsnp['a1']= np.where(dbsnp.ref== 'D', 'I', dbsnp.a1)

# Rows with a non-empty second alternative allele get an additional ID.
second_alt= dbsnp.copy()
second_alt= second_alt.loc[second_alt.a2!= '', :]

# Order each allele pair alphabetically before building the ID.
swap_first= dbsnp.ref > dbsnp.a1
dbsnp.loc[swap_first, ['ref', 'a1']] = dbsnp.loc[swap_first, ['a1', 'ref']].values
dbsnp['ID']= dbsnp['#chrom'] + ':' + dbsnp['POS'].astype(int).astype(str) + ':' + dbsnp.ref + ':' + dbsnp.a1

swap_second= second_alt.ref > second_alt.a2
second_alt.loc[swap_second, ['ref', 'a2']] = second_alt.loc[swap_second, ['a2', 'ref']].values
second_alt['ID']= second_alt['#chrom'] + ':' + second_alt['POS'].astype(int).astype(str) + ':' + second_alt.ref + ':' + second_alt.a2

id_map= pd.concat([dbsnp[['ID', 'name']], second_alt[['ID', 'name']]])

# Read RSIDs from HRC
hrc= pd.read_csv(snakemake.input[1], sep= '\t', header=0, usecols= ['#CHROM', 'POS', 'ID', 'REF', 'ALT'])
hrc.columns= ['CHROM', 'POS', 'name', 'REF', 'ALT']
hrc= hrc.loc[hrc['name']!= '.', :]

# Chromosome X is coded as '23' in the IDs.
hrc['CHROM']= np.where(hrc.CHROM== 'X', '23', hrc.CHROM)
hrc['CHROM']= hrc.CHROM.apply(str)

swap_hrc= hrc.REF > hrc.ALT
hrc.loc[swap_hrc, ['REF', 'ALT']] = hrc.loc[swap_hrc, ['ALT', 'REF']].values
hrc['ID']= hrc['CHROM'] + ':' + hrc['POS'].astype(int).astype(str) + ':' + hrc.REF + ':' + hrc.ALT
hrc= hrc[['ID', 'name']]

# Only add the HRC entries whose ID is not already present.
hrc= hrc.loc[~hrc.ID.isin(id_map.ID), :]
id_map= pd.concat([id_map, hrc])

id_map.to_csv(snakemake.output[0], sep= '\t', header= True, index= False)

Python Pandas numpy From line 1 of VEP/format_dbSNP.py

import pandas as pd
import numpy as np
import re

#d= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)

#d['Allele1']= d['Allele1'].str.upper()
#d['Allele2']= d['Allele2'].str.upper()
#d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :]
#d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
#d['CHR']= d['CHR'].astype(str).astype(int)
#d['POS']= d['POS'].astype(str).astype(int)
#d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']]
#d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
#d['BETA']=np.where(d.REF > d.EFF, -1* d.BETA, d.BETA)
#d['EAF']= np.where(d.REF > d.EFF, 1 - d.EAF, d.EAF)

#d['CHR']= d['CHR'].astype(str).astype(int)
#d['POS']= d['POS'].astype(str).astype(int)

#d['pvalue']= d['pvalue'].astype(str).astype(float)

#d.loc[d.REF > d.EFF, ['REF', 'EFF']] = d.loc[d.REF > d.EFF, ['EFF', 'REF']].values
#d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF

#d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]

# Annotate the meta-analysis summary statistics (snakemake.input[0]) with one
# VEP annotation per variant (snakemake.input[1]) and write the merged table to
# snakemake.output[0].

col_list= ['IMPACT', 'DISTANCE', 'SYMBOL', 'SYMBOL_SOURCE', 'BIOTYPE']
vep_header= ['Variation', 'Location', 'Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'Extra']
df_list= list()


def parse_extra(extra):
	"""Turn a VEP 'Extra' string ('KEY=value;KEY=value;...') into a dict.

	The string is split on every ';' that is followed by a word character;
	pieces without '=' are ignored. A non-string value (e.g. NaN) gives {}.
	"""
	if not isinstance(extra, str):
		return {}
	# Raw string: '\w' inside a normal string literal is an invalid escape
	# sequence and triggers a warning on current Python versions.
	return dict(field.split('=', 1) for field in re.split(r';(?=\w)', extra) if '=' in field)


for vep in pd.read_csv(snakemake.input[1], sep= '\t', header= None, names= vep_header, comment= '#', chunksize= 100000):
	# Parse every Extra string once and look the keys up by exact name, with
	# '' for absent keys. (A substring test such as 'SYMBOL' in extra is also
	# true for 'SYMBOL_SOURCE=' and used to end in a KeyError.)
	extra_fields= [parse_extra(extra) for extra in vep['Extra']]
	for i in col_list:
		vep[i]= [fields.get(i, '') for fields in extra_fields]
	# .copy() so that the column assignments below act on an independent frame.
	vep= vep[['Variation', 'Location', 'Existing_variation', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']].copy()
	vep.columns= ['ID', 'Location', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'SYMBOL_SOURCE', 'BIOTYPE']
	# Priority used when several annotations exist for one variant:
	# protein_coding (0) < other biotypes (1) < pseudogenes (2).
	vep['BIOTYPE1']= np.where(vep.BIOTYPE== 'protein_coding', 0, np.where(vep.BIOTYPE.str.contains('pseudo'), 2, 1))
	vep['DISTANCE']= np.where(vep.DISTANCE== '', 0, vep.DISTANCE)
	vep[['chr', 'pos', 'All']]= vep.ID.str.split('_', expand= True)
	vep[['EFF', 'REF']]= vep.All.str.split('/', expand= True)
	# Order the two alleles alphabetically so the ID matches the one built
	# for the summary statistics below.
	swap= vep.REF > vep.EFF
	vep.loc[swap, ['REF', 'EFF']] = vep.loc[swap, ['EFF', 'REF']].values
	vep[['CHR', 'POS']]= vep['Location'].str.split(':', expand= True)
	vep['CHR']= np.where(vep['CHR']== 'X', '23', vep['CHR'])
	vep['ID']= vep.CHR.astype(int).astype(str) + ':' + vep.POS.astype(int).astype(str) + ':' + vep.REF + ':' + vep.EFF
	vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE', 'BIOTYPE1']]
	# Keep the highest-priority annotation per variant within this chunk.
	vep= vep.sort_values(by= ['BIOTYPE1'], ascending= True)
	vep= vep.drop_duplicates(subset= ['ID'], keep= 'first')
	df_list.append(vep)

vep= pd.concat(df_list)

# A variant can span two chunks, so de-duplicate once more across chunks.
vep= vep.sort_values(by= ['BIOTYPE1'], ascending= True)
vep= vep.drop_duplicates(subset= ['ID'], keep= 'first')
vep= vep[['ID', 'RSID', 'Gene', 'SYMBOL', 'Consequence', 'IMPACT', 'DISTANCE', 'BIOTYPE']]


d= pd.read_csv(snakemake.input[0], sep= '\t', header= 0)
d['Allele1']= d['Allele1'].str.upper()
d['Allele2']= d['Allele2'].str.upper()
# Keep variants with more than half of the maximum total sample size.
d= d.loc[(d.TOTALSAMPLESIZE> (d['TOTALSAMPLESIZE'].max())/ 2), :].copy()
d[['CHR', 'POS', 'REF','EFF', 'SNP']]= d['MarkerName'].str.split(':', expand= True)
d['CHR']= d['CHR'].astype(str).astype(int)
d['POS']= d['POS'].astype(str).astype(int)
d= d[['CHR', 'POS', 'Allele1', 'Allele2', 'TOTALSAMPLESIZE', 'Freq1', 'Effect', 'StdErr', 'P-value']].copy()
d.columns= ['CHR', 'POS', 'EFF', 'REF', 'TOTALSAMPLESIZE', 'EAF', 'BETA', 'SE', 'pvalue']
# Where the alleles are not in alphabetical order, swap them and flip the
# effect size and the effect-allele frequency accordingly. The mask is taken
# once, before any of the three columns involved is modified.
flip= d.REF > d.EFF
d['BETA']=np.where(flip, -1* d.BETA, d.BETA)
d['EAF']= np.where(flip, 1 - d.EAF, d.EAF)
d['pvalue']= d['pvalue'].astype(str).astype(float)
d.loc[flip, ['REF', 'EFF']] = d.loc[flip, ['EFF', 'REF']].values
d['ID']= d.CHR.astype(int).astype(str) + ':' + d.POS.astype(int).astype(str) + ':' + d.REF + ':' + d.EFF
# Drop p-values of exactly 0 or 1 (and anything outside that range).
d= d.loc[((d.pvalue>0) & (d.pvalue <1)), :]
d= pd.merge(d, vep, on= ['ID'], how= 'left')
d.to_csv(snakemake.output[0], header=True, index= False, sep= '\t')

Python Pandas numpy Variant Effect Predictor (VEP) From line 1 of VEP/format_VEP.py

run:
	# Write a VEP input file (CHR, POS, POS2, Allele, STRAND; no header) with the
	# SNPs of input[0] that fall inside any of the regions listed in input[1].
	sumstats= pd.read_csv(input[0], sep= '\t', header= 0)
	regions= pd.read_csv(input[1], sep= '\t', header= 0)
	regions['CHR']= np.where(regions['CHR']== '23', 'X', regions['CHR'])
	# MarkerName is 'CHR:POS:REF:EFF:SNP'; only rows whose last field is 'SNP' are kept.
	sumstats[['CHR', 'POS', 'REF', 'EFF', 'SNP']]= sumstats['MarkerName'].str.split(':', expand= True)
	sumstats= sumstats.loc[sumstats.SNP== 'SNP', :]
	# POS2 keeps the position as parsed; POS is cast to int for the comparisons below.
	sumstats['POS2']= sumstats['POS']
	sumstats['CHR']= np.where(sumstats['CHR']== '23', 'X', sumstats['CHR'])
	sumstats['POS']= sumstats['POS'].astype(str).astype(int)
	per_region= list()
	for _, region in regions.iterrows():
		same_chrom= sumstats.loc[sumstats.CHR== region['CHR'], :]
		lower, upper= int(region['pos1']), int(region['pos2'])
		in_window= (same_chrom.POS >= lower) & (same_chrom.POS <= upper)
		per_region.append(same_chrom.loc[in_window, :])
	selected= pd.concat(per_region)
	selected['Allele']= selected['Allele1'].str.upper() + '/' + selected['Allele2'].str.upper()
	selected['STRAND']= '+'
	selected= selected.sort_values(by= ['CHR', 'POS'])
	selected.to_csv(output[0], sep= '\t', header= False, index= False, columns= ['CHR', 'POS', 'POS2', 'Allele', 'STRAND'])

SnakeMake From line 11 of VEP/Snakefile

shell:
	# Run Ensembl VEP on {input[0]} and write the annotation to {output[0]},
	# using the local cache without database connections (--cache --offline) and
	# overwriting an existing output file (--force_overwrite).
	# NOTE(review): the executable is an absolute path inside one user's home
	# directory, so this rule only runs on that machine as written.
	'/home/pol/software/ensembl-vep/vep -i {input[0]} --check_existing --symbol --biotype --cache -O {output[0]} --offline --force_overwrite'

SnakeMake Variant Effect Predictor (VEP) From line 37 of VEP/Snakefile

script:
	# Delegate this rule to the external script format_VEP.py, which reads its
	# files through the snakemake.input / snakemake.output objects.
	'format_VEP.py'

SnakeMake From line 48 of VEP/Snakefile

run:
        # Keep the rows of input[0] whose geneSymbol does not appear in the name2
        # column of input[1].
        # NOTE(review): this excerpt ends without writing an output; the rest of
        # the rule is presumably not shown here.
        d= pd.read_csv(input[0], sep= '\t', header= 0)
        x= pd.read_csv(input[1], sep= '\t', header= 0)
        d= d.loc[~d.geneSymbol.isin(x.name2), :]

SnakeMake From line 58 of VEP/Snakefile

run:
	# Convert the MarkerName column of input[0] ('CHR:POS:...') into a sorted,
	# header-less four-column table: CHR, start (= POS - 1), end (= POS) and the
	# marker name without its ':SNP' / ':INDEL' suffix.
	markers= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['MarkerName', 'Allele1'])
	name_parts= markers.MarkerName.str.split(':')
	markers['CHR']= name_parts.str[0].astype('str').astype('int')
	markers['end']= name_parts.str[1].astype('str').astype('int')
	markers['start']= markers['end'] - 1
	markers['MarkerName']= markers.MarkerName.str.replace(':SNP', '').str.replace(':INDEL', '')
	bed= markers.sort_values(by= ['CHR', 'start'])
	bed[['CHR', 'start', 'end', 'MarkerName']].to_csv(output[0], sep= '\t', header= False, index= False)

SnakeMake From line 88 of VEP/Snakefile

shell:
	# For every feature in {input[0]} report the closest feature in {input[1]};
	# '-t all' keeps every feature when several are tied for closest.
	'bedtools closest -t all -a {input[0]} -b {input[1]} > {output[0]}'

SnakeMake BEDTools From line 108 of VEP/Snakefile

script:
	# Delegate this rule to the external script format_dbSNP.py, which reads its
	# files through the snakemake.input / snakemake.output objects.
	'format_dbSNP.py'

SnakeMake From line 118 of VEP/Snakefile

run:
	# Complete the RSID column of input[0] from the ID -> name table in input[1],
	# add the nearestGene column from the header-less table in input[2], and
	# write the result gzip-compressed to output[0].
	d= pd.read_csv(input[0], sep= '\t', header=0)
	rs= pd.read_csv(input[1], sep= '\t', header=0)
	d= pd.merge(d, rs, on= 'ID', how= 'left')
	# Use 'name' wherever RSID is missing, empty or '-'.
	no_rsid= pd.isnull(d.RSID) | d.RSID.isin(['', '-'])
	d['RSID']= np.where(no_rsid, d['name'], d.RSID)
	# Keyword form: pandas >= 2.0 no longer accepts the axis as a positional
	# argument, so drop('name', 1) raises a TypeError there.
	d= d.drop(columns= 'name')
	ne= pd.read_csv(input[2], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene', 'Ensembl_gene'])
	ne= ne[['ID', 'nearestGene']]
	d= pd.merge(d, ne, on= 'ID', how= 'left')
	d.to_csv(output[0], sep= '\t', header= True, index= False, compression= 'gzip')

SnakeMake From line 129 of VEP/Snakefile

run:
        d= pd.read_csv(input[0], sep= '\t', header=0, usecols= ['ID', 'CHR', 'POS'])
        d['end']= d.POS
        d['start']= d.end - 1
        d.sort_values(by= ['CHR', 'start'], inplace= True)
        d= d[['CHR', 'start', 'end', 'ID']]
        d.to_csv(output[0], sep= '\t', header= False, index= False)

SnakeMake From line 148 of VEP/Snakefile

shell:
        # For every feature in {input[0]} report the two closest features in
        # {input[1]} ('-k 2'); '-t all' keeps every feature on ties.
        'bedtools closest -t all -k 2 -a {input[0]} -b {input[1]} > {output[0]}'

SnakeMake BEDTools From line 163 of VEP/Snakefile

run:
	# Load the ID / nearestGene columns of input[0] and the header-less table in
	# input[1], whose eighth column is named nearestGene2 here.
	# NOTE(review): this excerpt ends after the two reads; how the tables are
	# combined is not visible.
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['ID', 'nearestGene'])
	ne= pd.read_csv(input[1], sep= '\t', header= None, names= ['CHR', 'X', 'POS', 'ID', 'c1', 'p1', 'p2', 'nearestGene2', 'Ensembl_gene'])

SnakeMake From line 173 of VEP/Snakefile

run:
	# Load marker names, alleles and p-values from input[0] and keep the rows
	# with P-value below 5e-5.
	# NOTE(review): this excerpt ends without writing an output; the rest of the
	# rule is presumably not shown here.
	d= pd.read_csv(input[0], sep= '\t', header= 0, usecols= ['MarkerName', 'Allele1', 'Allele2', 'P-value'])
	d= d.loc[d['P-value']< 5e-5, :]

SnakeMake From line 188 of VEP/Snakefile

shell:
        # Run Ensembl VEP on {input[0]} and write the annotation to {output[0]},
        # using the local cache without database connections (--cache --offline)
        # and overwriting an existing output file (--force_overwrite).
        # NOTE(review): the executable is an absolute path inside one user's home
        # directory, so this rule only runs on that machine as written.
        '/home/pol/software/ensembl-vep/vep -i {input[0]} --check_existing --symbol --biotype --cache -O {output[0]} --offline --force_overwrite'