Workflow Steps and Code Snippets

2 tagged steps and code snippets that match the keyword "preeclampsia"

Snakemake workflow: Meta-analysis of GWAS of gestational duration, preterm and post-term deliveries (EGG Consortium) (v1.0.0)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
import pandas as pd
import numpy as np
from scipy.special import chdtri
import gzip
import csv

def not_number(s):
	"""Return True when `s` cannot be interpreted as a float.

	Parameters:
		s: value to test (typically a string field read from a CSV row).

	Returns:
		True if `s` is None or `float(s)` fails, False otherwise.
	"""
	if s is None:
		return True
	try:
		float(s)
	except (TypeError, ValueError):
		# ValueError: unparsable string such as 'NA' or ''.
		# TypeError: objects float() does not accept at all (e.g. a list).
		return True
	return False


def select_format(repr_pheno, row):
	"""Dispatch `row` to the formatting function that matches `repr_pheno`.

	Parameters:
		repr_pheno: phenotype name used to pick the formatter.
		row: one input record (mapping of column name to value) passed
			unchanged to the formatter.

	Returns:
		A 10-element list ordered
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue].

	Raises:
		ValueError: if no formatter is registered for `repr_pheno`
			(previously this surfaced as an UnboundLocalError).
	"""
	ukbb_phenos = ['Oestradiol_fem', 'NLB', 'AFB', 'AMenarche', 'endometriosis']
	pritchard_phenos = ['SHBG_fem', 'Testosterone_fem', 'Testosterone_male', 'SHBG_male', 'CBAT_fem', 'CBAT_male']
	ruth_phenos = ['Ruth_CBAT_female', 'Ruth_CBAT_male', 'Ruth_SHBG_female', 'Ruth_SHBG_male', 'Ruth_Testosterone_female', 'Ruth_Testosterone_male', 'Ruth_oestradiol']

	# The phenotype names are mutually exclusive, so an if/elif chain is
	# equivalent to the former sequence of independent ifs.
	if repr_pheno == 'Preeclampsia':
		fields = preeclampsia(row)
	elif repr_pheno == 'POP':
		fields = POP(row)
	elif repr_pheno == 'miscarriage':
		fields = miscarriage(row)
	elif repr_pheno == 'GA_fetal':
		fields = fet_GA(row)
	elif repr_pheno == 'BW_maternal':
		fields = BW_maternal(row)
	elif repr_pheno == 'BW_fetal':
		fields = BW_fetal(row)
	elif repr_pheno == 'BW_maternal_effect':
		fields = BW_maternal_adjusted_effect(row)
	elif repr_pheno == 'BW_fetal_effect':
		fields = BW_fetal_adjusted_effect(row)
	elif repr_pheno == 'leiomyoma_uterus':
		fields = leiomyoma_uterus(row)
	elif repr_pheno == 'AMenopause':
		fields = AMenopause(row)
	elif repr_pheno in ukbb_phenos:
		fields = UKBB_traits(row)
	elif repr_pheno in pritchard_phenos:
		fields = pritchard(row)
	elif repr_pheno == 'PCOS':
		fields = PCOS(row)
	elif repr_pheno in ruth_phenos:
		fields = Ruth(row, repr_pheno)
	else:
		raise ValueError('No formatting function defined for phenotype: ' + repr(repr_pheno))

	# Unpacking keeps the implicit check that the formatter returned exactly 10 fields.
	rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue = fields
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def AMenopause(row):
	"""Format one row of the REPROGEN age-at-menopause summary statistics.

	Reads the columns EAF, CHR, POS, Other_Allele, Effect_Allele, Effect,
	Pval, SE and N. Chromosome 'X' is recoded to 23, alleles are upper-cased
	and rsid is returned empty.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]
	"""
	allele_freq = float(row['EAF'])
	chrom = row['CHR']
	chrom = int(23 if chrom == 'X' else chrom)
	position = int(row['POS'])
	other_allele = row['Other_Allele'].upper()
	effect_allele = row['Effect_Allele'].upper()
	effect = float(row['Effect'])
	p = float(row['Pval'])
	std_err = float(row['SE'])
	sample_size = int(row['N'])
	return ['', chrom, position, allele_freq, sample_size, other_allele, effect_allele, effect, std_err, p]

def Ruth(row, repr_pheno):
	"""Format one row of the 'Ruth_*' sex-hormone summary statistics.

	Reads effect_allele_frequency, chromosome, base_pair_location,
	other_allele, effect_allele, beta, p_value, standard_error and
	variant_id. Chromosome 'X' is recoded to 23.

	Sample size is not read from the row; it is a per-phenotype constant.

	Parameters:
		row: one input record (mapping of column name to value).
		repr_pheno: phenotype name used to look up the sample size.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], with N a plain int.
	"""
	# One entry per Ruth_* phenotype. The earlier nested np.where chain tested
	# the misspelt key 'Ruth_SHBG_make' and then 'Ruth_SHBG_male' twice, so
	# the male phenotypes received the wrong N. The values are assigned here
	# following the female/male alternation of that chain.
	# NOTE(review): confirm each N against the source publication.
	sample_sizes = {
		'Ruth_SHBG_female': 189473,
		'Ruth_SHBG_male': 180726,
		'Ruth_Testosterone_female': 230454,
		'Ruth_Testosterone_male': 194453,
		'Ruth_CBAT_female': 188507,
		'Ruth_CBAT_male': 178782,
		'Ruth_oestradiol': 206927,
	}
	EAF = float(row['effect_allele_frequency'])
	CHR = row['chromosome']
	if CHR == 'X':
		CHR = 23
	CHR = int(CHR)
	POS = int(row['base_pair_location'])
	REF = row['other_allele']
	EFF = row['effect_allele']
	BETA = float(row['beta'])
	pvalue = float(row['p_value'])
	SE = float(row['standard_error'])
	# 206927 stays the fallback for any other phenotype name, as before.
	N = sample_sizes.get(repr_pheno, 206927)
	rsid = row['variant_id']
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]

def pritchard(row):
	"""Format one row of the 'pritchard' hormone summary statistics.

	Reads A1_FREQ, #CHROM, POS, REF, ALT, OBS_CT, BETA, SE, P and ID.
	Chromosome 'X' is recoded to 23. A row whose chromosome, BETA, SE or P
	is not numeric is returned as ten zeros.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or ten zeros.
	"""
	rejected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	# A1_FREQ is converted before any validation, exactly as before.
	allele_freq = float(row['A1_FREQ'])
	chrom = row['#CHROM']
	if chrom == 'X':
		chrom = 23
	if not_number(chrom):
		return rejected
	position = int(row['POS'])
	chrom = int(chrom)
	ref_allele, alt_allele = row['REF'], row['ALT']
	n_obs = int(row['OBS_CT'])
	# Association statistics are checked in the order BETA, SE, P.
	for column in ('BETA', 'SE', 'P'):
		if not_number(row[column]):
			return rejected
	beta = float(row['BETA'])
	std_err = float(row['SE'])
	p = float(row['P'])
	return [row['ID'], chrom, position, allele_freq, n_obs, ref_allele, alt_allele, beta, std_err, p]

def leiomyoma_uterus(row):
	"""Format one row of the uterine leiomyoma summary statistics.

	Reads EAF, CHR, POS, REF, EFF, TOTALSAMPLESIZE, beta, se and pvalue.
	Chromosome 'X' is recoded to 23; a row with a non-numeric chromosome is
	returned as ten zeros. N is passed through exactly as read (no numeric
	conversion) and rsid is returned empty.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or ten zeros.
	"""
	allele_freq = float(row['EAF'])
	chrom = 23 if row['CHR'] == 'X' else row['CHR']
	if not_number(chrom):
		return [0] * 10
	position = int(row['POS'])
	chrom = int(chrom)
	ref_allele = row['REF']
	eff_allele = row['EFF']
	sample_size = row['TOTALSAMPLESIZE']
	beta = float(row['beta'])
	std_err = float(row['se'])
	p = float(row['pvalue'])
	return ['', chrom, position, allele_freq, sample_size, ref_allele, eff_allele, beta, std_err, p]

def preeclampsia(row):
	"""Format one row of the preeclampsia summary statistics.

	Reads CHR, POS, REF, EFF, rsid, beta, se, EAF and pvalue. Chromosome
	'X' is recoded to 23; a row with a non-numeric chromosome is returned
	as ten zeros. Alleles are upper-cased. N is the constant 4630 + 373345
	(presumably cases + controls; confirm against the source study).

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or ten zeros.
	"""
	chrom = 23 if row['CHR'] == 'X' else row['CHR']
	if not_number(chrom):
		return [0] * 10
	position = int(row['POS'])
	chrom = int(chrom)
	ref_allele = row['REF'].upper()
	eff_allele = row['EFF'].upper()
	sample_size = 4630 + 373345
	variant_name = row['rsid']
	beta = float(row['beta'])
	std_err = float(row['se'])
	allele_freq = float(row['EAF'])
	p = float(row['pvalue'])
	return [variant_name, chrom, position, allele_freq, sample_size, ref_allele, eff_allele, beta, std_err, p]

def BW_fetal_adjusted_effect(row):
	"""Format one row of the birth-weight fetal (adjusted) effect statistics.

	Reads eaf, chr, pos, nea, ea, beta, p, se, n_ownBW and RSID.
	Chromosome 'X' is recoded to 23; alleles are upper-cased and an allele
	coded 'R' is rewritten as 'D'.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]
	"""
	recode = {'R': 'D'}
	allele_freq = float(row['eaf'])
	chrom = row['chr']
	chrom = int(23 if chrom == 'X' else chrom)
	position = int(row['pos'])
	non_effect = row['nea'].upper()
	non_effect = recode.get(non_effect, non_effect)
	effect = row['ea'].upper()
	effect = recode.get(effect, effect)
	beta = float(row['beta'])
	p = float(row['p'])
	std_err = float(row['se'])
	sample_size = int(row['n_ownBW'])
	return [row['RSID'], chrom, position, allele_freq, sample_size, non_effect, effect, beta, std_err, p]

def BW_maternal_adjusted_effect(row):
	"""Format one row of the birth-weight maternal (adjusted) effect statistics.

	Reads eaf, chr, pos, nea, ea, beta, p, se, n_offBW and RSID.
	Chromosome 'X' is recoded to 23; alleles are upper-cased and an allele
	coded 'R' is rewritten as 'D'.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]
	"""
	def _allele(column):
		# Upper-case the allele and map the 'R' code to 'D'.
		code = row[column].upper()
		return 'D' if code == 'R' else code

	allele_freq = float(row['eaf'])
	chrom = row['chr']
	if chrom == 'X':
		chrom = 23
	chrom = int(chrom)
	position = int(row['pos'])
	non_effect = _allele('nea')
	effect = _allele('ea')
	beta = float(row['beta'])
	p = float(row['p'])
	std_err = float(row['se'])
	sample_size = int(row['n_offBW'])
	variant_name = row['RSID']
	return [variant_name, chrom, position, allele_freq, sample_size, non_effect, effect, beta, std_err, p]


def BW_maternal(row):
	"""Format one row of the birth-weight maternal effect statistics.

	Reads eaf, chr, pos, nea, ea, beta, p, se, n and SNP. Chromosome 'X'
	is recoded to 23.

	Alleles are upper-cased before the 'R' -> 'D' recode, matching the
	adjusted-effect birth-weight formatters. Without this, a lower-case 'r'
	would miss the recode and lower-case alleles would be returned as-is.
	Upper-case input is unaffected.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]
	"""
	EAF = float(row['eaf'])
	CHR = row['chr']
	if CHR == 'X':
		CHR = 23
	CHR = int(CHR)
	POS = int(row['pos'])
	REF = row['nea'].upper()
	EFF = row['ea'].upper()
	if REF == 'R':
		REF = 'D'
	if EFF == 'R':
		EFF = 'D'
	BETA = float(row['beta'])
	pvalue = float(row['p'])
	SE = float(row['se'])
	N = int(row['n'])
	rsid = row['SNP']
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]

def BW_fetal(row):
	"""Format one row of the birth-weight fetal effect statistics.

	Reads eaf, chr, pos, nea, ea, beta, p, se, n and rsid. Chromosome 'X'
	is recoded to 23.

	Alleles are upper-cased before the 'R' -> 'D' recode, matching the
	adjusted-effect birth-weight formatters. Without this, a lower-case 'r'
	would miss the recode and lower-case alleles would be returned as-is.
	Upper-case input is unaffected.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]
	"""
	EAF = float(row['eaf'])
	CHR = row['chr']
	if CHR == 'X':
		CHR = 23
	CHR = int(CHR)
	POS = int(row['pos'])
	REF = row['nea'].upper()
	EFF = row['ea'].upper()
	if REF == 'R':
		REF = 'D'
	if EFF == 'R':
		EFF = 'D'
	BETA = float(row['beta'])
	pvalue = float(row['p'])
	SE = float(row['se'])
	N = int(row['n'])
	rsid = row['rsid']
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]


def PCOS(row):
	"""Format one row of the PCOS (excluding 23andMe) summary statistics.

	Reads EAF, CHR, POS, REF, EFF, beta, pvalue, se and TOTALSAMPLESIZE.
	Chromosome 'X' is recoded to 23. The sample size is parsed as a float
	and rounded to the nearest integer. rsid is returned empty.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]
	"""
	allele_freq = float(row['EAF'])
	chrom = row['CHR']
	chrom = int(23 if chrom == 'X' else chrom)
	position = int(row['POS'])
	ref_allele, eff_allele = row['REF'], row['EFF']
	beta = float(row['beta'])
	p = float(row['pvalue'])
	std_err = float(row['se'])
	total_n = float(row['TOTALSAMPLESIZE'])
	sample_size = int(round(total_n))
	return ['', chrom, position, allele_freq, sample_size, ref_allele, eff_allele, beta, std_err, p]

def UKBB_traits(row):
	"""Format one row of the UKBB trait summary statistics.

	The colon-separated `variant` field supplies chromosome, position, REF
	(third field) and EFF (fourth field). Rows flagged
	low_confidence_variant == 'true', or with a non-numeric minor_AF,
	chromosome, position, beta, pval, se or n_complete_samples, are
	returned as ten zeros. EAF is minor_AF when minor_allele equals EFF,
	otherwise 1 - minor_AF. rsid is returned empty.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or ten zeros.
	"""
	rejected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	if row['low_confidence_variant'] == 'true':
		return rejected
	variant_fields = row['variant'].split(':')
	chrom = variant_fields[0]
	if chrom == 'X':
		chrom = 23
	position = variant_fields[1]
	must_be_numeric = [row['minor_AF'], chrom, position, row['beta'], row['pval'], row['se'], row['n_complete_samples']]
	# Every value is checked (no short-circuit), as in the list-based any() it replaces.
	flags = [not_number(value) for value in must_be_numeric]
	if any(flags):
		return rejected
	chrom = int(chrom)
	position = int(position)
	ref_allele = variant_fields[2]
	eff_allele = variant_fields[3]
	beta = float(row['beta'])
	p = float(row['pval'])
	std_err = float(row['se'])
	sample_size = int(row['n_complete_samples'])
	if row['minor_allele'] == eff_allele:
		allele_freq = float(row['minor_AF'])
	else:
		allele_freq = 1 - float(row['minor_AF'])
	return ['', chrom, position, allele_freq, sample_size, ref_allele, eff_allele, beta, std_err, p]

def AP_repr(row):
	"""Format one row of BOLT-LMM style summary statistics.

	Reads EAF, CHR, POS, A2 (returned as REF), A1 (returned as EFF), Beta,
	P, se, N and SNP. Chromosome 'X' is recoded to 23. N is passed through
	exactly as read (no numeric conversion).

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]
	"""
	allele_freq = float(row['EAF'])
	chrom = row['CHR']
	chrom = int(23 if chrom == 'X' else chrom)
	position = int(row['POS'])
	other_allele, effect_allele = row['A2'], row['A1']
	beta = float(row['Beta'])
	p = float(row['P'])
	std_err = float(row['se'])
	sample_size = row['N']
	variant_name = row['SNP']
	return [variant_name, chrom, position, allele_freq, sample_size, other_allele, effect_allele, beta, std_err, p]


def POP(row):
	"""Format one row of the pelvic organ prolapse summary statistics.

	Reads CHR, EAF, POS, REF, EFF, BETA, pvalue, SE and N. rsid is
	returned empty and N is returned as a float.

	Chromosome 'X' is recoded to 23 *before* the digit check. Previously
	the `isdigit()` guard ran first, which made the 'X' recode unreachable
	and silently dropped every X-chromosome row. The input row is no longer
	modified.

	Rows are returned as ten zeros when the chromosome is neither 'X' nor
	all digits, or when the minor allele frequency is below 0.005.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue], or ten zeros.
	"""
	rejected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	CHR = row['CHR']
	if CHR == 'X':
		CHR = 23
	elif not CHR.isdigit():
		return rejected
	EAF = float(row['EAF'])
	# Minor allele frequency: fold frequencies above 0.5 onto the other allele.
	MAF = 1 - EAF if EAF > 0.5 else EAF
	if MAF < 0.005:
		return rejected
	CHR = int(CHR)
	POS = int(row['POS'])
	REF = row['REF']
	EFF = row['EFF']
	BETA = float(row['BETA'])
	pvalue = float(row['pvalue'])
	SE = float(row['SE'])
	N = float(row['N'])
	rsid = ''
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]

def fet_GA(row):
	"""Format one row of the fetal gestational duration summary statistics.

	Reads Chr, Pos, Non_effect_allele, Effect_allele, Effect, P, StdErr, N
	and Rsid. Chromosome 'X' is recoded to 23 and alleles are upper-cased.
	No allele frequency is read from the row, so EAF is returned as an
	empty string.

	The input row is left untouched; previously `row['Chr']` was
	overwritten with 23 for X-chromosome rows.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]
	"""
	EAF = ''
	CHR = row['Chr']
	if CHR == 'X':
		CHR = 23
	CHR = int(CHR)
	POS = int(row['Pos'])
	REF = row['Non_effect_allele'].upper()
	EFF = row['Effect_allele'].upper()
	BETA = float(row['Effect'])
	pvalue = float(row['P'])
	SE = float(row['StdErr'])
	N = int(row['N'])
	rsid = row['Rsid']
	return [rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]

def miscarriage(row):
	"""Format one row of the miscarriage summary statistics.

	Chromosome and position are the first two colon-separated fields of
	MarkerName; chromosome 'X' is recoded to 23. Allele2 is returned as REF
	and Allele1 as EFF, both upper-cased. Freq1 is passed through as read
	(no numeric conversion). N is the constant 49996 + 174109 (presumably
	cases + controls; confirm against the source study). rsid is returned
	empty.

	Returns:
		[rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue]
	"""
	allele_freq = row['Freq1']
	marker_fields = row['MarkerName'].split(':')
	chrom = marker_fields[0]
	chrom = int(23 if chrom == 'X' else chrom)
	position = int(marker_fields[1])
	other_allele = row['Allele2'].upper()
	effect_allele = row['Allele1'].upper()
	beta = float(row['Effect'])
	p = float(row['P-value'])
	std_err = float(row['StdErr'])
	sample_size = 49996 + 174109
	return ['', chrom, position, allele_freq, sample_size, other_allele, effect_allele, beta, std_err, p]


def format_list(input, output, repr_pheno=None):
	"""Convert a gzipped summary-statistics file to a common tab-separated layout.

	Each input row is passed through `select_format`, alleles are put in
	alphabetical order (flipping BETA and EAF when they are swapped), and
	an ID of the form CHR:POS:allele1:allele2 is built.

	Parameters:
		input: path to the gzipped, space- or tab-delimited input file.
		output: path of the tab-separated file to write.
		repr_pheno: phenotype name handed to `select_format`. Defaults to
			`snakemake.wildcards.repr_pheno`, which keeps the previous
			behaviour for existing callers.

	Changes from the previous implementation:
		* the output file is opened once and rows are streamed to it,
		  instead of being reopened in append mode every 1000 rows;
		* an empty EAF (as returned by formatters that have no allele
		  frequency) is left empty when alleles are flipped, instead of
		  crashing on float('').
	"""
	if repr_pheno is None:
		repr_pheno = snakemake.wildcards.repr_pheno
	header = ['ID', 'rsid', 'CHR', 'POS', 'EAF', 'N', 'REF', 'EFF', 'BETA', 'SE', 'pvalue']
	with gzip.open(input, 'rt', newline='') as f:
		print(input)
		# Detect whether the file is space- or tab-delimited from its header line.
		dialect = csv.Sniffer().sniff(f.readline(), delimiters=' \t')
		f.seek(0)
		reader = csv.DictReader(f, dialect=dialect)
		with open(output, 'w', newline='') as out_handle:
			writer = csv.writer(out_handle, delimiter='\t')
			writer.writerow(header)
			for row in reader:
				rsid, CHR, POS, EAF, N, REF, EFF, BETA, SE, pvalue = select_format(repr_pheno, row)
				# Formatters signal "skip this row" by returning ten zeros.
				if CHR == 0:
					continue
				# Collapse multi-character alleles to the I/D indel coding.
				if len(REF) > 1:
					REF = 'I'
				if len(EFF) > 1:
					EFF = 'I'
				if REF == 'I':
					EFF = 'D'
				if EFF == 'I':
					REF = 'D'
				if REF > EFF:
					# Alleles are out of alphabetical order: swap them and
					# flip the effect direction and the allele frequency.
					ID = str(CHR) + ':' + str(POS) + ':' + EFF + ':' + REF
					BETA = -1 * float(BETA)
					ref = EFF
					eff = REF
					if EAF != '':
						EAF = 1 - float(EAF)
				else:
					ID = str(CHR) + ':' + str(POS) + ':' + REF + ':' + EFF
					BETA = float(BETA)
					eff = EFF
					ref = REF
				writer.writerow([ID, rsid, CHR, POS, EAF, N, ref, eff, BETA, SE, pvalue])


# Script entry point: `snakemake` is expected to be provided by the Snakemake
# `script:` directive. The first input is the gzipped file read by format_list
# and the first output is the tab-separated file it writes.
format_list(snakemake.input[0], snakemake.output[0])

Snakemake workflow for the project Exploring the Impact of Parity and its Interaction with History of Preterm Delivery on Gestational Duration

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
library(dplyr)
library(data.table)


#### Loading data ####
# The three inputs are taken positionally from the Snakemake rule and read with data.table::fread.
dat = fread(snakemake@input[[1]]) # Swedish Medical Birth Register
p_id = fread(snakemake@input[[2]]) # Multi-Generation Register
edu = fread(snakemake@input[[3]]) # Education Register



#### Creating variables ####

## PTB < 37 weeks
# PTB = 1 when GRDBS is below 37*7 = 259, otherwise 0 (NA stays NA).
# Presumably GRDBS is gestational age in days - confirm against the register codebook.
dat = dat %>% mutate(PTB = ifelse(GRDBS< (37*7),1,0))


## Sex
# male as ref
# 1=Pojke (boy); 2=Flicka (girl)
# Any non-missing KON value other than 1 is labelled "F".
dat = dat %>% mutate(KON = ifelse(KON == 1, "M","F"))
# "M" is listed first so that it becomes the reference level.
# NOTE(review): `level` relies on partial matching of the `levels` argument of factor().
dat$KON = factor(dat$KON, level = c("M","F"))


## Maternal age categorized:
# < 20
# 20-29
# 30-39
# >= 40

# Maternal age categorization
# Codes: 1 = < 20, 0 = 20-29 (reference), 2 = 30-39, 3 = >= 40.
# A non-integer age between 29 and 30 or between 39 and 40 matches none of the
# later conditions and keeps the 0 assigned on the first line.
dat = dat %>% mutate(MALDER_c = ifelse(MALDER<20,1,0),
                     MALDER_c = ifelse(MALDER>=20 & MALDER<=29,0,MALDER_c),
                     MALDER_c = ifelse(MALDER>=30 & MALDER<=39,2,MALDER_c),
                     MALDER_c = ifelse(MALDER>=40,3,MALDER_c))
dat$MALDER_c = as.factor(dat$MALDER_c)


## Finding the father of each child in the Multi-Generation Register, and the father's age
f_id = p_id %>% select(LopnrBarn, LopnrFar, FoddArBioFar) # selecting columns of interest
colnames(f_id) = c("lpnr_BARN","lpnr_far", "ar_far") # rename to match the key name used in dat
dat = left_join(dat,f_id,by="lpnr_BARN") # adding the father's info to the birth-register data; children without a match get NA
rm(f_id)
dat = dat %>% mutate(FALDER = AR-ar_far) # father's age at the birth: AR minus the father's birth year (ar_far)


## Nationality
# Maternal citizenship and the mother's country of birth, as 0/1 indicators.
# %in% returns FALSE for missing values, so NA is coded 0 in both indicators.
dat = dat %>% mutate(swe_citizenship =as.numeric(MNAT %in% c('SVERIGE'))) %>% mutate(mor_birth_country_NORDIC = as.numeric(MFODLAND %in% c('SVERIGE','NORGE','FINLAND','ISLAND','DANMARK')))


## First child (parity 0) born preterm
# NOTE: dat is grouped by lpnr_mor from here on; no ungroup() follows in this script.
# PTB_first_born is 1 when the first row of the mother's group has PTB == 1, else 0;
# it is set to NA when the mother's first recorded parity_clean is not 1.
# NOTE(review): arrange() on a grouped data frame sorts the whole table unless
# .by_group = TRUE is given - the within-mother row order still follows parity_clean.
dat = dat %>% group_by(lpnr_mor) %>% arrange(parity_clean) %>%
  mutate(PTB_first_born = any(row_number() == 1 & PTB ==1)*1) %>% 
  mutate(PTB_first_born = ifelse(dplyr::first(parity_clean)!=1,NA,PTB_first_born)) #parity_clean == 1 is parity 0


## Previous preterm delivery
# prev_PTD = 1 when the mother's preceding row has PTB == 1, 0 otherwise; NA for her first row (no lag value).
dat = dat %>% group_by(lpnr_mor) %>% arrange(parity_clean )%>% mutate(prev_PTD = ifelse(dplyr::lag(PTB)==1,1,0))
# Keep prev_PTD only when the preceding row is the immediately preceding parity (diff_p == 1); otherwise NA.
dat = dat %>% group_by(lpnr_mor) %>% arrange(parity_clean) %>%  mutate(diff_p = parity_clean - dplyr::lag(parity_clean)) %>% mutate(prev_PTD = ifelse(diff_p == 1,prev_PTD,NA))


## Mother was born preterm herself
barn = dat %>% pull(lpnr_BARN) # all child IDs in the birth register
mor_also_barn_in_mfr =  dat[dat$lpnr_mor %in% barn,] # rows whose mother also appears as a child in the register
mor_also_barn_in_mfr = mor_also_barn_in_mfr %>% select(lpnr_mor)
mor_also_barn_in_mfr = unique(mor_also_barn_in_mfr)
mor_as_barn = inner_join(dat,mor_also_barn_in_mfr, by = c("lpnr_BARN" ="lpnr_mor")) # The pregnancies in which the mothers were born

mor_as_barn = mor_as_barn %>% mutate(mother_herself_PTB = ifelse(PTB == 1, 1,0)) %>%
  select(lpnr_BARN,mother_herself_PTB) #lpnr_BARN here are children that are also mothers in mfr

# Attach mother_herself_PTB to every delivery of that mother.
# NOTE(review): the lpnr_mor.y column dropped below presumably arises because mor_as_barn
# still carries the grouping column lpnr_mor into the join - confirm.
# NOTE(review): full_join keeps unmatched rows from both sides; confirm that no extra rows are introduced.
dat = full_join(dat, mor_as_barn, by = c("lpnr_mor" ="lpnr_BARN"))
dat = select(dat, -lpnr_mor.y)


## Diabetes
# diab1: 1 only when the register variable DIABETES equals 1; any other value or NA gives 0.
dat = dat %>% mutate(diab1 = ifelse(DIABETES != 1 | is.na(DIABETES), 0,1))  #diabetes according to mfr variable Diabetes

# The twelve maternal diagnosis fields are pasted into one space-separated string per row
# (paste() turns NA into the text "NA") and searched with a single regular expression.
# NOTE(review): the alternatives written with a leading space (e.g. " 25001") can only match
# from the second field onwards; the final alternative "2500" already matches any code containing 2500.
test = dat[grepl("O24|E10|E11|E12|E13|E14|648A|250A|250B|250C|250D|250E|250F|250G|250H|250X|25000| 25001| 25002| 25003| 25004| 25005| 25006| 25007| 25008| 2500",paste(dat$MDIAG1,dat$MDIAG2,dat$MDIAG3,dat$MDIAG4,dat$MDIAG5,dat$MDIAG6,dat$MDIAG7,dat$MDIAG8,dat$MDIAG9,dat$MDIAG10,dat$MDIAG11,dat$MDIAG12)),] #ICD codes (ICD-10-SE,ICD9-SE,ICD-8) related to diabetes, extracted from maternal icd diagnosis in mfr
mor_with_diabetes = test %>% pull(sq) # sq values of the rows with a diabetes ICD code

dat = dat %>% mutate(diab2 = ifelse(sq %in% mor_with_diabetes,1,0))

dat = dat %>% mutate(diab = ifelse(diab1==1 | diab2 ==1,1,0)) # diabetes if flagged by the ICD codes or by the mfr variable Diabetes
dat = select(dat, -diab1,-diab2) 


## BMI
print("s1") # progress marker
dat1 = dat %>% mutate(BMI = MVIKT / (MLANGD/100)^2) # BMI = MVIKT / (MLANGD/100)^2; presumably weight in kg and height in cm - confirm
#source("/home/karin/Parity_Project1/scripts/functions/1_cleaning_modules.R")
source(snakemake@params[[1]]) #fun_mBmiQC modified to not remove "bad" BMI, just set them as NA.
print("s2") # progress marker
# NOTE(review): year_matrix is never used again in this script; presumably fun_mBmiQC reads it as a global - confirm.
year_matrix = NULL
dat2 = fun_mBmiQC(as.data.frame(dat1)) # setting bad BMI to NA; as.data.frame() drops the dplyr grouping
print("s3") # progress marker
dat = dat2
rm(dat1,dat2)


## Smoking
# Step 1: smoking = 0 when both ROK1 and ROK0 are 1 or missing, otherwise 1.
# Step 2: if ROK2 is neither 1 nor missing, smoking is overwritten with 2.
# Missing values are therefore treated the same as the value 1 in all three variables.
dat = dat %>% mutate(smoking = ifelse((ROK1 ==1 | is.na(ROK1)) & (ROK0 == 1 | is.na(ROK0)) ,0,1),
                     smoking = ifelse(ROK2 == 1 |is.na(ROK2),smoking,2)) # 0 = Not smoking, 1 = Smoking 3 months prior to the current pregnancy and/or smoking at admission to maternal health care, 2 = Smoking in pregnancy week 30-32


## Preeclampsia
# The twelve maternal diagnosis fields are pasted into one space-separated string per row
# (paste() turns NA into the text "NA") and searched with a single regular expression.
# NOTE(review): the spaces inside the pattern are literal. "63703 " and "63704 " need a trailing
# space, so they cannot match in the last field (MDIAG12); " 63709" and " 63710" need a leading
# space, so they cannot match in the first field (MDIAG1). Confirm whether this is intended.
test = dat[grepl("O14|O11|O15|642E|642F|642H|63703 |63704 | 63709| 63710|6612",paste(dat$MDIAG1,dat$MDIAG2,dat$MDIAG3,dat$MDIAG4,dat$MDIAG5,dat$MDIAG6,dat$MDIAG7,dat$MDIAG8,dat$MDIAG9,dat$MDIAG10,dat$MDIAG11,dat$MDIAG12)),] #ICD codes (ICD-10-SE,ICD9-SE,ICD-8) related to preeclampsia, extracted from maternal icd diagnosis in mfr
mor_with_preeclampsia = test %>% pull(sq) # sq values of the rows with a preeclampsia ICD code

# NOTE: the output column is spelled `preeclamspia` (sic); any code reading the saved file must use the same spelling.
dat = dat %>% mutate(preeclamspia = ifelse(sq %in% mor_with_preeclampsia,1,0))


## Education
# find the maximum edu + filtering
edu = edu %>% group_by(LopNr) %>% filter(n()==1) # IDs that occur in several rows are removed, since it cannot be told which row is the true one
edu = as.data.frame(edu)
edu_grades = edu[grep("SUN2000", names(edu))] # education based on SUN2000
# NOTE(review): with na.rm=TRUE a numeric row that is entirely NA yields -Inf (with a warning) - confirm how such rows should be treated.
edu_grades[, "max"] <- apply(edu_grades, 1, max, na.rm=TRUE) # Finding highest education for each person
edu = cbind(edu, edu_grades[,"max"])
names(edu)[names(edu) == 'edu_grades[, "max"]'] = "max_grade" # cbind names the new column after the expression; rename it

# Remove reused LopNr based on AterPnr
edu_rm = edu[grep("Ater", names(edu))]
# Despite its name, `remove` is TRUE for the rows that are KEPT: rows where no "Ater" column equals 1.
edu_rm = edu_rm %>% mutate(remove = ifelse(rowSums(edu_rm == 1,na.rm = TRUE) > 0, F, T))
edu = edu[edu_rm$remove,]

# Remove reused LopNr based on SenPnr
edu_rm = edu[grep("Sen", names(edu))]
#nr = ncol(edu_rm)
# As above, `remove` is TRUE for the rows that are KEPT: rows where no "Sen" column equals 0.
edu_rm = edu_rm %>% mutate(remove = ifelse(rowSums(edu_rm == 0,na.rm = TRUE) >0 , F, T))
edu = edu[edu_rm$remove,]
#nrow(edu) == 5828310

#Join with mfr
# The same ID -> max_grade table is joined three times: for the mother, the father and the child.
edu_max = edu[grep("LopNr|max_grade", names(edu))]
d_mor = left_join(dat, edu_max, by = c("lpnr_mor" = "LopNr") )
names(d_mor)[names(d_mor) == 'max_grade'] = "max_grade_mor"

d_mor_far = left_join(d_mor, edu_max, by = c("lpnr_far" = "LopNr") )
names(d_mor_far)[names(d_mor_far) == 'max_grade'] = "max_grade_far"

d_mor_far_child = left_join(d_mor_far, edu_max, by = c("lpnr_BARN" = "LopNr") )
names(d_mor_far_child)[names(d_mor_far_child) == 'max_grade'] = "max_grade_child"

dat = d_mor_far_child
rm(edu_grades,edu_max,d_mor,d_mor_far,d_mor_far_child)

# Max_grade in categories
# Codes: 1 = max_grade 1 or 2, 2 = max_grade 3 or 4, 3 = max_grade >= 5, 0 = none of these.
# NOTE(review): a missing max_grade (e.g. a person without a match in the education data) stays NA
# through the ifelse() chain; 0 is only reached by non-missing values outside 1..Inf, presumably
# the -Inf produced for all-NA education rows - confirm.
dat = dat %>% mutate(max_grade_mor_c  = ifelse(max_grade_mor==2 | max_grade_mor==1,1,0),                   # 9 years or less
                     max_grade_mor_c = ifelse(max_grade_mor==3 | max_grade_mor ==4,2,max_grade_mor_c),     # Upper secondary education (additional 2-3 years)
                     max_grade_mor_c = ifelse(max_grade_mor >=5,3,max_grade_mor_c)) # see the note above for code 0 # Post-secondary education (shorter than 3 years, 3 years or longer, postgraduate education)

# Same coding for the father's highest education.
dat = dat %>% mutate(max_grade_far_c  = ifelse(max_grade_far==2 | max_grade_far==1,1,0),
                     max_grade_far_c = ifelse(max_grade_far==3 | max_grade_far ==4,2,max_grade_far_c),
                     max_grade_far_c = ifelse(max_grade_far >=5,3,max_grade_far_c)) # see the note above for code 0


## Parity, grouping after parity 4
# Values of parity_clean below 5 are kept as they are; every other value is set to 4.
# NOTE(review): parity_clean == 4 and parity_clean >= 5 therefore share the value 4 - confirm this is the intended grouping.
dat = dat %>% mutate(Parity_logreg = ifelse(as.numeric(parity_clean)<5,parity_clean,4))


#### Saving ####
# Write the full table, including all derived variables, as a comma-separated file.
fwrite(dat, snakemake@output[[1]], sep=",")
topic

preeclampsia

Preeclampsia is persistent high blood pressure that develops during pregnancy or the postpartum period. It is often associated with high levels of protein in the urine, or with the new development of decreased blood platelets, trouble with the kidneys or liver, fluid in the lungs, or signs of brain trouble such as seizures and/or visual disturbances.