Snakemake workflow for the project Exploring the Impact of Parity and its Interaction with History of Preterm Delivery on Gestational Duration

public public 1yr ago 0 bookmarks

The script in this folder present the Snakmake workflow for the project Exploring the Impact of Parity and its Interaction with History of Preterm Delivery on Gestational Duration.

##Contact

Code Snippets

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
library(dplyr)
library(data.table)


#### Loading data ####
dat = fread(snakemake@input[[1]]) # Swedish Medical Birth Register
p_id = fread(snakemake@input[[2]]) # Multi-Generation Register
edu = fread(snakemake@input[[3]]) # Education Register



#### Creating variables ####

## PTB < 37 weeks
dat = dat %>% mutate(PTB = ifelse(GRDBS< (37*7),1,0))


## Sex
# male as ref
# 1=Pojke (boy); 2=Flicka (girl)
dat = dat %>% mutate(KON = ifelse(KON == 1, "M","F"))
dat$KON = factor(dat$KON, level = c("M","F"))


## Maternal age categorized:
# < 20
# 20-29
# 30-39
# >= 40

# Maternal age categorization
dat = dat %>% mutate(MALDER_c = ifelse(MALDER<20,1,0),
                     MALDER_c = ifelse(MALDER>=20 & MALDER<=29,0,MALDER_c),
                     MALDER_c = ifelse(MALDER>=30 & MALDER<=39,2,MALDER_c),
                     MALDER_c = ifelse(MALDER>=40,3,MALDER_c))
dat$MALDER_c = as.factor(dat$MALDER_c)


## Finding father to each child in  the multi-generation register and the fathers age
f_id = p_id %>% select(LopnrBarn, LopnrFar, FoddArBioFar) # selecting columns of interest
colnames(f_id) = c("lpnr_BARN","lpnr_far", "ar_far")
dat = left_join(dat,f_id,by="lpnr_BARN") # adding fathers info from parents.csv to mfr data
rm(f_id)
dat = dat %>% mutate(FALDER = AR-ar_far) # calculating how old the father was when their child was born


## Nationality  
# As maternal citizenship and the mothers birth country
dat = dat %>% mutate(swe_citizenship =as.numeric(MNAT %in% c('SVERIGE'))) %>% mutate(mor_birth_country_NORDIC = as.numeric(MFODLAND %in% c('SVERIGE','NORGE','FINLAND','ISLAND','DANMARK')))


## First child (parity 0) born preterm
dat = dat %>% group_by(lpnr_mor) %>% arrange(parity_clean) %>%
  mutate(PTB_first_born = any(row_number() == 1 & PTB ==1)*1) %>% 
  mutate(PTB_first_born = ifelse(dplyr::first(parity_clean)!=1,NA,PTB_first_born)) #parity_clean == 1 is parity 0


## Previous preterm delivery
dat = dat %>% group_by(lpnr_mor) %>% arrange(parity_clean )%>% mutate(prev_PTD = ifelse(dplyr::lag(PTB)==1,1,0))
dat = dat %>% group_by(lpnr_mor) %>% arrange(parity_clean) %>%  mutate(diff_p = parity_clean - dplyr::lag(parity_clean)) %>% mutate(prev_PTD = ifelse(diff_p == 1,prev_PTD,NA))


## Mother was born preterm herself
barn = dat %>% pull(lpnr_BARN)
mor_also_barn_in_mfr =  dat[dat$lpnr_mor %in% barn,]
mor_also_barn_in_mfr = mor_also_barn_in_mfr %>% select(lpnr_mor)
mor_also_barn_in_mfr = unique(mor_also_barn_in_mfr)
mor_as_barn = inner_join(dat,mor_also_barn_in_mfr, by = c("lpnr_BARN" ="lpnr_mor")) # The pregnancies in which the mothers where born

mor_as_barn = mor_as_barn %>% mutate(mother_herself_PTB = ifelse(PTB == 1, 1,0)) %>%
  select(lpnr_BARN,mother_herself_PTB) #lpnr_BARN here are barn that also are mothers in mfr

dat = full_join(dat, mor_as_barn, by = c("lpnr_mor" ="lpnr_BARN"))
dat = select(dat, -lpnr_mor.y)


## Diabetes
dat = dat %>% mutate(diab1 = ifelse(DIABETES != 1 | is.na(DIABETES), 0,1))  #diabetes according to mfr variable Diabetes

test = dat[grepl("O24|E10|E11|E12|E13|E14|648A|250A|250B|250C|250D|250E|250F|250G|250H|250X|25000| 25001| 25002| 25003| 25004| 25005| 25006| 25007| 25008| 2500",paste(dat$MDIAG1,dat$MDIAG2,dat$MDIAG3,dat$MDIAG4,dat$MDIAG5,dat$MDIAG6,dat$MDIAG7,dat$MDIAG8,dat$MDIAG9,dat$MDIAG10,dat$MDIAG11,dat$MDIAG12)),] #ICD codes (ICD-10-SE,ICD9-SE,ICD-8) related to diabetes, extracted from maternal icd diagnosis in mfr
mor_with_diabetes = test %>% pull(sq) # rows of mothers that have diabetes according to icd codes

dat = dat %>% mutate(diab2 = ifelse(sq %in% mor_with_diabetes,1,0))

dat = dat %>% mutate(diab = ifelse(diab1==1 | diab2 ==1,1,0)) # mother will have diabetes based on icd codes and the mfr variable Diabetes
dat = select(dat, -diab1,-diab2) 


## BMI
print("s1")
dat1 = dat %>% mutate(BMI = MVIKT / (MLANGD/100)^2) # BMI
#source("/home/karin/Parity_Project1/scripts/functions/1_cleaning_modules.R")
source(snakemake@params[[1]]) #fun_mBmiQC modified to not remove "bad" BMI, just set them as NA.
print("s2")
year_matrix = NULL
dat2 = fun_mBmiQC(as.data.frame(dat1)) # setting bad BMI to NA
print("s3")
dat = dat2
rm(dat1,dat2)


## Smoking 
dat = dat %>% mutate(smoking = ifelse((ROK1 ==1 | is.na(ROK1)) & (ROK0 == 1 | is.na(ROK0)) ,0,1),
                     smoking = ifelse(ROK2 == 1 |is.na(ROK2),smoking,2)) # 0 = Not smoking, 1 =  Smoking 3 months prior to the current pregnancy or/and Smoking at admission to maternal health, 2 = Smoking in pregnancy week 30-32  


## Preeclampsia
test = dat[grepl("O14|O11|O15|642E|642F|642H|63703 |63704 | 63709| 63710|6612",paste(dat$MDIAG1,dat$MDIAG2,dat$MDIAG3,dat$MDIAG4,dat$MDIAG5,dat$MDIAG6,dat$MDIAG7,dat$MDIAG8,dat$MDIAG9,dat$MDIAG10,dat$MDIAG11,dat$MDIAG12)),] #ICD codes (ICD-10-SE,ICD9-SE,ICD-8) related to preeclampsia, extracted from maternal icd diagnosis in mfr
mor_with_preeclampsia = test %>% pull(sq) # rows of mothers that have preeclampsia according to icd codes

dat = dat %>% mutate(preeclamspia = ifelse(sq %in% mor_with_preeclampsia,1,0))


## Education 
# find the maximum edu + filtering
edu = edu %>% group_by(LopNr) %>% filter(n()==1) # can not tell which of the rows are the ture one when ID for the same person exist in several rows, are removed
edu = as.data.frame(edu)
edu_grades = edu[grep("SUN2000", names(edu))] # education based on SUN2000
edu_grades[, "max"] <- apply(edu_grades, 1, max, na.rm=TRUE) # Finding highest education for each person
edu = cbind(edu, edu_grades[,"max"])
names(edu)[names(edu) == 'edu_grades[, "max"]'] = "max_grade"

# Remove reused LopNr based on AterPnr
edu_rm = edu[grep("Ater", names(edu))]
edu_rm = edu_rm %>% mutate(remove = ifelse(rowSums(edu_rm == 1,na.rm = TRUE) > 0, F, T))
edu = edu[edu_rm$remove,]

# Remove reused LopNr based on SenPnr
edu_rm = edu[grep("Sen", names(edu))]
#nr = ncol(edu_rm)
edu_rm = edu_rm %>% mutate(remove = ifelse(rowSums(edu_rm == 0,na.rm = TRUE) >0 , F, T))
edu = edu[edu_rm$remove,]
#nrow(edu) == 5828310

#Join with mfr
edu_max = edu[grep("LopNr|max_grade", names(edu))]
d_mor = left_join(dat, edu_max, by = c("lpnr_mor" = "LopNr") )
names(d_mor)[names(d_mor) == 'max_grade'] = "max_grade_mor"

d_mor_far = left_join(d_mor, edu_max, by = c("lpnr_far" = "LopNr") )
names(d_mor_far)[names(d_mor_far) == 'max_grade'] = "max_grade_far"

d_mor_far_child = left_join(d_mor_far, edu_max, by = c("lpnr_BARN" = "LopNr") )
names(d_mor_far_child)[names(d_mor_far_child) == 'max_grade'] = "max_grade_child"

dat = d_mor_far_child
rm(edu_grades,edu_max,d_mor,d_mor_far,d_mor_far_child)

# Max_grade in categories
dat = dat %>% mutate(max_grade_mor_c  = ifelse(max_grade_mor==2 | max_grade_mor==1,1,0),                   # 9 years or less
                     max_grade_mor_c = ifelse(max_grade_mor==3 | max_grade_mor ==4,2,max_grade_mor_c),     # Gymnasial utbilding (additional 2-3 years)      
                     max_grade_mor_c = ifelse(max_grade_mor >=5,3,max_grade_mor_c)) # 0 is nas             # Eftergymnasial utbildning (shorter than 3 years, 3 years or longer, postgraduate education)

dat = dat %>% mutate(max_grade_far_c  = ifelse(max_grade_far==2 | max_grade_far==1,1,0),
                     max_grade_far_c = ifelse(max_grade_far==3 | max_grade_far ==4,2,max_grade_far_c),
                     max_grade_far_c = ifelse(max_grade_far >=5,3,max_grade_far_c)) # 0 is nas


## Parity, grouping after parity 4
dat = dat %>% mutate(Parity_logreg = ifelse(as.numeric(parity_clean)<5,parity_clean,4))


#### Saving ####
fwrite(dat, snakemake@output[[1]], sep=",")
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
source(snakemake@params[[1]]) # "1_cleaning_modules.R"
source(snakemake@params[[2]]) # "2_renumber_parity_to_parityF.R"




#### Cleaning the data by Removing maternal with no ID, Removing mother-kids duplications, Remove Stillbirths, Remove rows with no GestAge and Only keeping singeltons. Also cleaning parity ####

## Loading data
library(data.table)
mfr_raw = fread(snakemake@input[[1]]) # load data
print("Number of pregnancies in the data")
print(nrow(mfr_raw)) #Number of pregnancies in the data


###  Removing maternal with no ID -> Removing mother-kids duplications -> Remove Stillbirths -> Remove rows with no GestAge -> Only keeping singeltons -> cleaning parity
year_matrix = NULL  # For the function fun_momID
cleaning = function(dat){

  dat = fun_momID(dat)  # Removing mothers without IDs
  colnames(dat)[grep("lpnr_barn",colnames(dat))]  = "lpnr_BARN"  # Change the name the column with child id 
  print("Removed mothers without IDs, pregnancies left:")
  print(row(dat))

  dat = fun_momkidID(dat)  # Removing mother-child duplications 
  print("Removed mother-child duplications, pregnancies left:")
  print(nrow(dat))

  dat = fun_deadborn(dat)  # Removing stillbirths
  print("Removed stillbirths, pregnancies left:")
  print(nrow(dat))

  dat = fun_GAmiss(dat)  # Removing pregnancies with no gestational age 
  print("Removed pregnancies with no gestational age, pregnancies left:")
  print(nrow(dat))

  dat = fun_multipregs(dat)   # Only keeping singeltons 
  print("Only keeping singeltons, pregnancies left:")
  print(nrow(dat)) 

  nnn = recalculateParity(dat, thr_d_low = 31, thr_d_upp = 200)  # Function resulting in the parity (number of pregnancies) for each woman (cleaning parity)
  mfr = cbind(dat, data.frame(parity_clean = nnn))  # Adding the parity to the "input data" that has been filtered

  dat = mfr %>% filter(BORDF2==1)
  mor = dat %>% group_by(lpnr_mor) %>% filter(parity_clean==1) %>% filter(n()>1) %>% pull(lpnr_mor)
  dat = dat[!(dat$lpnr_mor %in% mor),]
  print("Only keeping singeltons, pregnancies left:")
  print(nrow(dat))

  print("Done: Removed maternal with no ID -> Removed mother-kids duplications -> Removed Stillbirths -> Removed rows with no GestAge -> Only keeping singeltons -> cleaning parity")

dat
}

mfr_new = cleaning(mfr_raw)  # Cleaning data with the function cleaning
print("Number of pregnancies in the data after the filering function:")
print(nrow(mfr_new))




####
## Filtering GD?
# No using GRDBS. 
# GRDBS range between 154 to 321 days. The number of GD > 310 days is 2774, and more common in the earlier years of the register. Maybe there are some reason that the GD was bigger than 310 days, for now they are included.
####




#### Cleaning for outliers based on deviating child weight and gestational duration correlation with a Generalised Additive Models for Location Scale and Shape ####
library(dplyr)
library(gamlss)
b=subset(mfr_new,select=c("BVIKTBS","GRDBS")) # extracting weigth and gestational duration from data
b=na.omit(b) # removing incompleacte vases of weigth and gestational duration

newy=b$BVIKTBS
newx=b$GRDBS

mBCTcs<- gamlss(BVIKTBS~cs(GRDBS),family=BCT, data=b) # A generalised additive model utilizing a Box-Cox-t distribution and cubic spline (cs) smoothing term
saveRDS(mBCTcs, file = "~/Parity_Project1/mBCTcs.rds")
h=centiles.pred(mBCTcs, xname="GRDBS",xvalues=newx, type="s",dev = c( -5, -4.5, -4,-3.5, -3, -2, -1, 0, 1, 2, 3,3.5, 4,4.5, 5)) # looks good!

mfr_new$sq = seq(nrow(mfr_new)) # to restore the original order in the end
lower<-h[,3]
upper<-h[,15]
dat2<- arrange(mfr_new,GRDBS) %>% filter(!is.na(BVIKTBS), !is.na(GRDBS)) %>% mutate(rows_to_remove = ifelse(((BVIKTBS<lower)|
                 (BVIKTBS>upper)),T,F)) # to remove = T, removing any values above 4.5 or below -4.5 standard deviations (upper and lower)
dat3 <- filter(dat2,rows_to_remove==F)
#ggplot(dat3, aes(x=GRDBS,y=BVIKTBS)) + geom_point() # control removal

dat = dat3[order(dat3$sq),] # to restore order
rm(mfr_new,mBCTcs,dat2,dat3,b)

print("Pregnancies left after cleaning for outliers based on deviating child weight and gestational duration correlation:")
print(nrow(dat))




#### IVF ####
## Filter IVF cases and making a variable telling if this mother had a problem becomming pregnant = unwilling subfertility ##
# Variable
dat = dat %>% mutate(unwilling_subfertility = ifelse(is.na(OFRIBARN) | OFRIBARN == 0,0, OFRIBARN)) # 0 = no unwilling subfertility, variable >= 1 --> years in unwilling subfertility

# Filtering
# Variables below regards action on infertility --> pregancies with a 1 (=yes) are removed
# - OFRIIATG
# - OFRIABEF
# - OFRISTIM
# - OFRIKIRU
# - OFRIICSI
# - OFRIANN
dat = dat %>% filter(is.na(OFRIIATG), is.na(OFRIABEF), is.na(OFRISTIM), is.na(OFRIKIRU), is.na(OFRIICSI), is.na(OFRIANN))

print("Pregnancies left after removing pregnancies with action on infertility:")
print(nrow(dat))



#### Saving ####
fwrite(dat, snakemake@output[[1]], sep=",")
49
50
script:
    "scripts/filtering_script_10feb2022.R"
65
66
script:
    "scripts/create_variables.R"
85
86
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/descriptive_table.R"
105
106
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Parity_affects_gd.R"
SnakeMake From line 105 of master/Snakefile
126
127
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/variance.R"
SnakeMake From line 126 of master/Snakefile
146
147
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/clinical_history.R"
SnakeMake From line 146 of master/Snakefile
168
169
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/family_history_maternal_sisters_given_birth_preterm.R"
SnakeMake From line 168 of master/Snakefile
189
190
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Supplementary/clinical_history_only_spontaneous.R"
SnakeMake From line 189 of master/Snakefile
210
211
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Supplementary/famliy_history_no_parity_interaction_plot.R"
SnakeMake From line 210 of master/Snakefile
230
231
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Supplementary/mother_herself_ptd.R"
SnakeMake From line 230 of master/Snakefile
248
249
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Supplementary/year_of_delivery_interaction.R"
SnakeMake From line 248 of master/Snakefile
268
269
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Supplementary/parity_risk_of_ptd_maternal_age_grouping_mixed_model.R"
SnakeMake From line 268 of master/Snakefile
286
287
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Supplementary/PTB_by_parity_BMI_unwilling_subfertilitu_smoking_diab_preeclamspia_descriptive_table.R"
SnakeMake From line 286 of master/Snakefile
305
306
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Supplementary/simulation_group_effect_bias.R"
SnakeMake From line 305 of master/Snakefile
326
327
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Supplementary/ultrasound.R"
SnakeMake From line 326 of master/Snakefile
347
348
script:
    "/home/karin/Parity_gestational_duration_interaction/scripts/plots/Supplementary/Parity_affects_gd.R"
SnakeMake From line 347 of master/Snakefile
ShowHide 16 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/PerinatalLab/Parity_gestational_duration_interaction
Name: parity_gestational_duration_interaction
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...