BioWorkflows: Improving pathogenicity prediction of missense variants by using AlphaFold-derived features

library(tidyverse)
library(readxl)
library(gridExtra)
library(optparse)

option_list = list(
  make_option(c("-r", "--input_recalibrated"), type="character", default="C:/Users/Karo.PC-Karo/Master/Semester_3/Lab_rotation_Ludwig/for_Karola/SSH/gnomad_extracted_prepro_rec.csv.gz", 
              help="location of csv.gz file"),
  make_option(c("-e", "--excel_location"), type="character", default="C:/Users/Karo.PC-Karo/Master/Semester_3/Lab_rotation_Ludwig/for_Karola/local_files/available_colnames_regular.xlsx", 
              help="location of excel file"),
  make_option(c("-o", "--out_folder"), type="character", default="C:/Users/Karo.PC-Karo/Master/Semester_3/Lab_rotation_Ludwig/for_Karola/local_files", 
              help="name of folder to store output")

)

opt = parse_args(OptionParser(option_list=option_list))

variants<-read_csv(opt$input_recalibrated, na=c(".","NA"))
colnames_usage <- read_excel(opt$excel_location)

dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)

write_tsv(as_tibble(colnames(variants)), file="available_colnames_W_surr.tsv")

for_prediction<-(colnames_usage %>% filter(!is.na(for_prediction)))$value

mean_no_na_no_round<-function(x) {(mean(x, na.rm=TRUE))}


mean_properties<-variants %>% filter(b_factor>80) %>% 
  group_by(from_AS, outcome)%>% 
  dplyr::select(for_prediction) %>%
  summarise_all(mean_no_na_no_round)%>%
  ungroup()%>%
  dplyr::select(-starts_with("RESIDUE_NAME"))

mean_properties<-mean_properties %>% 
  gather(variable, value, -outcome, -from_AS) %>%
  group_by(variable,from_AS) %>%
  summarise(value = (value[outcome == 1] - value[outcome == 0]))

####Both Outcomes, total count####
mean_properties_freq<-variants %>%  
  group_by(from_AS, to_AS) %>%
  select(from_AS, to_AS) %>%
  filter(!to_AS=="%3D") %>%
  filter(!to_AS=="???") %>%
  count(from_AS,to_AS,sort=TRUE)

plot_tot_AA<-ggplot(mean_properties_freq, aes(x = from_AS, y = to_AS, fill = n)) +
  geom_tile()+
  theme_minimal()+  
  scale_fill_viridis_c()+
  theme(plot.title= element_text(size = 6),
        axis.text.x = element_text(size=5,angle = 90, vjust = 0.5, hjust=1),
        axis.text.y = element_text(size=5, vjust = 0.5, hjust=1),
        legend.key.height = unit(3, 'mm'),
        legend.key.width = unit(2.5, 'mm'),
        legend.title = element_text(size=5), 
        legend.text = element_text(size=5),
        legend.box.spacing = unit(0.1,"mm"),
        axis.title.y=element_text(size=6),
        axis.title.x=element_text(size=6),
        plot.margin=unit(c(1,1,1,1), "mm"))+
  ggtitle("AA exchanges, raw count")+
  geom_text(aes(from_AS, to_AS, label=n), colour = "white", check_overlap = TRUE, size=1.5)

#ggsave("AA_ex_tot.pdf", width=28.575, height=19.05, units="cm", dpi=300)

####Relative fraction outcome1/outcome0####
mean_properties_freq_outcome<-variants %>% 
  filter(!to_AS=="%3D") %>%
  filter(!to_AS=="???") %>%
  group_by(from_AS, to_AS) %>%
  select(from_AS, to_AS, outcome) %>%
  summarise(frac=mean(outcome))

frac_round<- round(mean_properties_freq_outcome$frac, digits = 2)

plot_AA_freq<-ggplot(mean_properties_freq_outcome, aes(x = from_AS, y = to_AS, fill = frac)) +
  geom_tile()+
  theme_minimal()+  
  scale_fill_viridis_c()+
  theme(plot.title= element_text(size = 6),
        axis.text.x = element_text(size=5,angle = 90, vjust = 0.5, hjust=1),
        axis.text.y = element_text(size=5, vjust = 0.5, hjust=1),
        legend.key.height = unit(3, 'mm'),
        legend.key.width = unit(2.5, 'mm'),
        legend.title = element_text(size=5), 
        legend.text = element_text(size=5),
        legend.box.spacing = unit(0.1,"mm"),
        axis.title.y=element_text(size=6),
        axis.title.x=element_text(size=6),
        plot.margin=unit(c(1,1,1,1), "mm"))+
  ggtitle("Fraction of AA exchanges with outcome = 1")+
  geom_text(aes(from_AS, to_AS, label=frac_round), colour = "white", check_overlap = TRUE, size=1.5)

#ggsave("AA_ex_frac.pdf",width=28.575, height=19.05, units="cm", dpi=300)

g<-arrangeGrob(plot_tot_AA, plot_AA_freq, ncol=1) #combine both plots in one
ggsave("AA_ex_compared.pdf", width=95, height=120,units="mm", dpi=300,g)

R tidyverse optparse gridExtra readxl From line 1 of scripts/AA_freq.R

library(tidyverse)
library(pROC)
library(gridExtra)
library(optparse)
library(boot)
source("workflow/scripts/existing_scores_glm_functions.R")
BOOT_REPETITIONS=1000
N_CPU=3
COL_ORDER=c("Alph_null","AlphScore","CADD_raw","DEOGEN2_score_med","REVEL_score","glm_AlphCadd","glm_AlphDeogen",
  "glm_AlphRevel","glm_CaddDeogen","glm_DeogenRevel","glm_RevelCadd","glm_AlphDeogenRevel",
  "glm_AlphCaddDeogen","glm_AlphCaddRevel","glm_CaddDeogenRevel")

set.seed(1)


option_list = list(
  make_option(c("-t", "--variants"), type="character", default="results/prediction_final/pre_final_model_regular_variants.csv.gz", 
              help="csv.gz file, test dataset"),
  make_option(c("-v", "--validation_set"), type="character", default="results/validation_set/validation_set_w_AlphScore.csv.gz", 
              help="Validation set"),
  make_option(c("-o", "--out_folder"), type="character", default="results/analyse_score", 
              help="Output folder"))

opt = parse_args(OptionParser(option_list=option_list))

variants<-read_csv(opt$variants)
validation_dataset<-read_csv(opt$validation_set)

dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)

# get DEOGEN2 Scores:
variants$pos_in_VEP_and_Uniprot<-get_index_col(variants)
variants$DEOGEN2_score_med<-unlist_score(variants$DEOGEN2_score, variants$pos_in_VEP_and_Uniprot)

validation_dataset$pos_in_VEP_and_Uniprot<-get_index_col(validation_dataset)
validation_dataset$DEOGEN2_score_med<-unlist_score(validation_dataset$DEOGEN2_score, validation_dataset$pos_in_VEP_and_Uniprot)

variants$AlphScore<-variants$predicted_Alph

variants<-variants%>% 
  mutate(ID=paste(`#chr`, `pos(1-based)`,ref, alt, sep=":"))

### combine scores on interim dataset:
interim_dataset<-variants %>% 
  filter(clinvar_interim_test==TRUE)
gnomad_train_dataset<-variants %>% 
  filter(gnomad_train==TRUE)

validation_dataset<-validation_dataset %>%
  mutate(ID=paste(`#chr`, `pos(1-based)`,ref, alt, sep=":"))%>%
  filter(!ID %in% interim_dataset$ID)%>%
  filter(!ID %in% gnomad_train_dataset$ID)


set_of_models<-fit_set_of_models(interim_dataset)
validation_dataset<-predict_set_of_models(set_of_models, validation_dataset)


selected_scores<-c("HRAS_DMS_g12v",
                   "P53_DMS_WT_Nutlin",
                   "ADRB2_DMS_0.625",          
                   "BRCA1_DMS_E3_high_(a)",    
                   "BRCA1_DMS_(b)",            
                   "MSH2_DMS",      
                   "TPMT_DMS",                 
                   "PTEN_DMS_(a)",             
                   "PTEN_DMS_highqual_(b)",    
                   "SUMO1_DMS_flipped",        
                   "UBE2I_DMS_flipped",       
                   "VKOR1_DMS_abundance_score",
                   "TPK1_DMS_flipped")


validation_dataset<-validation_dataset%>%
  mutate(scoreID=paste(gene_dms, DMS, sep="_"))%>%
  filter(scoreID %in% selected_scores)



###### validation Dataset

prot_info_for_join<-validation_dataset %>% 
  distinct(Uniprot_acc_split, protein_length, protein_mean_b_factor)

SCORES<-c("Alph_null","AlphScore", "CADD_raw", "glm_AlphCadd", 
          "REVEL_score", "glm_AlphRevel", "glm_RevelCadd", 
          "DEOGEN2_score_med", "glm_AlphDeogen", "glm_CaddDeogen", 
          "glm_AlphCaddDeogen", "glm_DeogenRevel","glm_AlphDeogenRevel",
          "glm_AlphCaddRevel",    "glm_CaddDeogenRevel")

get_correlations<-function(val_dat){


  spearman_joined_private<-tibble()

for (un_ID in unique(val_dat$Uniprot_acc_split)){
  temp_values_joined<-val_dat %>% 
    filter(Uniprot_acc_split==un_ID)
  temp_values_joined <- temp_values_joined[complete.cases(temp_values_joined[, SCORES]),]
  for (dms in unique(temp_values_joined$DMS)){
    temp_inner_loop_values_joined<-temp_values_joined %>% 
      filter(DMS==dms)
    for (score in SCORES){
     score_DMS_tibble<-tibble(score_val=unlist(temp_inner_loop_values_joined %>% select(one_of(score))),
                              DMS_val=temp_inner_loop_values_joined$DMS_val)
     score_DMS_tibble<-score_DMS_tibble %>% filter(complete.cases(.))
      spearmans<-tibble(
      gene=unique(temp_inner_loop_values_joined$gene_dms), 
      UP_ID=un_ID, 
      DMS=dms, 
      spearm=ifelse(nrow(score_DMS_tibble)>2,
      cor.test(score_DMS_tibble$DMS_val, score_DMS_tibble$score_val,
               method = "spearman", na.action="na.ommit")$estimate,NA),
      method=score,
      number_of_vals=nrow(score_DMS_tibble))
      spearman_joined_private=rbind(spearman_joined_private,spearmans)
      }
  }
}

bad_UP_ids<-(spearman_joined_private %>% 
               filter(number_of_vals<21))$UP_ID

spearman_joined_private<-spearman_joined_private %>% 
  mutate(scoreID=paste(gene, DMS, sep="_"))%>%
  mutate(method=factor(method, levels= SCORES))%>%
  filter(! UP_ID %in% bad_UP_ids)%>%
  left_join(prot_info_for_join, by=c("UP_ID"="Uniprot_acc_split"))%>%
  mutate(abs_spear=abs(spearm)) %>%
  mutate(alphInclude=grepl("Alph", method))

return(spearman_joined_private)
}

spearmans_joined<-get_correlations(validation_dataset)

scatter_dms<-ggplot(data=validation_dataset)+
  geom_point(aes(x=DMS_val, y=AlphScore), color="darkblue", alpha=0.1)+
  geom_text(data=spearmans_joined %>% filter(method=="AlphScore"), aes(label=round(abs_spear, 3)),x = Inf, y = Inf, hjust = 1, vjust = 1)+
  labs(x="DMS score")+
  facet_wrap(~ scoreID, scales = "free", ncol = 3)+
  theme_minimal()

ggsave(filename = "scatter_dms_alphscore.png",
       plot = scatter_dms,
       height = 10,
       width=8, 
       dpi=300)

scatter_dms<-ggplot(data=validation_dataset)+
  geom_point(aes(x=DMS_val, y=glm_AlphDeogenRevel), color="darkblue", alpha=0.1)+
  geom_text(data=spearmans_joined %>% filter(method=="glm_AlphDeogenRevel"), aes(label=round(abs_spear, 3)),x = Inf, y = Inf, hjust = 1, vjust = 1)+
  labs(y="AlphScore + DEOGEN2 + REVEL", x="DMS score")+
  facet_wrap(~ scoreID, scales = "free", ncol = 3)+
  theme_minimal()

ggsave(filename = "scatter_dms_alph_doegen_revel.png",
       plot = scatter_dms,
       height = 10,
       width=8, 
       dpi=300)



pairwise.t.test(x = spearmans_joined$abs_spear, g = spearmans_joined$method, paired = TRUE, 
                p.adjust.method = "none", alternative = "greater")
# has been performed first; 
spearmans_joined %>% 
  group_by(gene, UP_ID, DMS) %>% 
  summarise(mean_abs_spear=mean(abs(spearm)))
## selected the scores with the highest overall correlation in mean_corr_per_score

spearmans_joined<-spearmans_joined%>%
  mutate(method=factor(method, levels=COL_ORDER))

plot_spearmans<-ggplot(spearmans_joined, aes(x=method, y=abs(spearm)))+
  stat_summary(fun.y = mean, geom = "bar", width=0.8, color="black", fill="grey") + 
  stat_summary(fun.data = mean_se, geom = "errorbar", width=0.4)+
  geom_jitter(width=0.15, color="blue4", alpha=1, size=1)+
  theme_minimal()+ 
  theme(axis.text.x = element_text(angle = 45, vjust = 1.0, hjust=1, color="black", size=10),
                          axis.text.y = element_text(color="black", size=10))+
  labs(x = "")+
  labs(y = "absolute Spearman correlation", size=12)

plot_spearmans
ggsave(filename= "spearman_plot.pdf", plot=plot_spearmans, height=5, width=4)

getmeansSD_spearman<-function(spearmans_joined_private){
  table_spearm_private<-spearmans_joined_private %>% 
    mutate(abs_spearman=abs(spearm))%>%
    group_by(method)%>%
    summarise(mean_spearm=mean(abs_spearman), sd=sd(abs_spearman), num=n(), sem=sd(abs_spearman)/sqrt(n())) %>%
    arrange(-mean_spearm)
  return(table_spearm_private)
}

table_spearm<-getmeansSD_spearman(spearmans_joined)
print(table_spearm)

write_tsv(x = table_spearm, file = "table_spearm.tsv")

getMeanAbsCorr<-function(data_priv, i){
  spearman_joined_priv<-get_correlations(data_priv[i,])
  table_spearm_private<-getmeansSD_spearman(spearman_joined_priv)
  return(unlist(table_spearm_private[,"mean_spearm"]))
}

getMeanAbsCorr_scoreids<-function(data_priv, i){
  filtered_data<-tibble()

  for (score_id in unlist(data_priv[i,]) ){
  filtered_data<-rbind(filtered_data,
    validation_dataset %>%
      filter(scoreID %in% score_id) )
  }

  spearman_joined_priv<-get_correlations(filtered_data)
  table_spearm_private<-getmeansSD_spearman(spearman_joined_priv)
  table_spearm_private$used_scores<-as.numeric(paste(i, collapse = ""))
  # for check print
  #print(data_priv[i,])
  #print(nrow(filtered_data))

  return(table_spearm_private)
}

getMeanAbsCorr_scoreids_return_vector<-function(data_priv, i){
  table_spearm_private<-getMeanAbsCorr_scoreids(data_priv, i)
  correlations_used_scores<-c(unlist(table_spearm_private[,"mean_spearm"]),
                              unlist(table_spearm_private[1,"used_scores"]))

  return(correlations_used_scores)
}

compare_scores<-function(booted_values_private){
  booted_values_private_summarised<-booted_values_private %>%
    rowwise%>%
    select(-starts_with("used_cols_vector"))%>%
    mutate(rowmax=max(across()))%>%
    mutate(glm_AlphDeogenRevel_biggest=glm_AlphDeogenRevel==rowmax)%>%
    mutate(glm_AlphCadd_biggerThanCadd=glm_AlphCadd>CADD_raw)%>%
    mutate(glm_AlphDeogen_biggerThanDeogen=glm_AlphDeogen>DEOGEN2_score_med)%>%
    mutate(glm_AlphRevel_biggerThanRevel=glm_AlphRevel>REVEL_score)
  return(booted_values_private_summarised)
}

get_p_value_table<-function(booted_values_summarised_private){
  relevant_scores<-c("glm_AlphDeogenRevel_biggest","glm_AlphDeogen_biggerThanDeogen","glm_AlphRevel_biggerThanRevel","glm_AlphCadd_biggerThanCadd")
  p_val_tabl<-tibble()
  for (columname in relevant_scores){
    p_val_tabl<- rbind(p_val_tabl, 
                       tibble(name=columname, num_true=sum(unlist(booted_values_summarised_private[,columname])), total_num=nrow(booted_values_summarised_private)))
  }
  return(p_val_tabl)
}



# do bootstrapping for means
mean_original_mat<-getmeansSD_spearman(get_correlations(validation_dataset))
set.seed(1)

boot_meanAbsCor<-boot(data=validation_dataset, statistic=getMeanAbsCorr, R=BOOT_REPETITIONS, parallel="multicore", ncpus=N_CPU)
booted_meanAbsCor_values<-as_tibble(boot_meanAbsCor$t)
colnames(booted_meanAbsCor_values)<-mean_original_mat$method
booted_meanAbsCor_values<-compare_scores(booted_meanAbsCor_values)
pValTable_AbsCor<-get_p_value_table(booted_meanAbsCor_values)
pValTable_AbsCor<-pValTable_AbsCor%>% 
  mutate(pval=1-num_true/total_num)
write_tsv(x=pValTable_AbsCor, "pValTable_AbsCor.tsv")


# bootstrap the choice of scores
score_ids<-as_tibble(unique(validation_dataset$scoreID))
mean_original_mat<-getMeanAbsCorr_scoreids(score_ids,1:13)
set.seed(1)
boot_meanAbsCor<-boot(data=score_ids, statistic=getMeanAbsCorr_scoreids_return_vector, R=BOOT_REPETITIONS, parallel="multicore", ncpus=N_CPU)
booted_meanAbsCor_values<-as_tibble(boot_meanAbsCor$t)
colnames(booted_meanAbsCor_values)<-c(as.character(mean_original_mat$method), "used_cols_vector")
booted_meanAbsCor_values<-compare_scores(booted_meanAbsCor_values)
pValTable_AbsCor<-get_p_value_table(booted_meanAbsCor_values)
pValTable_AbsCor<-pValTable_AbsCor%>% 
  mutate(pval=1-num_true/total_num)
write_tsv(x=pValTable_AbsCor, "pValTable_AbsCor_choice_of_experiments.tsv")
pValTable_AbsCor

rank_score_spearm<-function(spearm_joined_private){
  rank_scored_priv<-spearm_joined_private%>%group_by(scoreID)%>%
    mutate(minCor=min(abs_spear), maxCor=max(abs_spear))%>%
    ungroup()%>%
    mutate(norm_cor=(abs_spear-minCor)/(maxCor-minCor))%>%
    group_by(method,gene)%>%
    summarise(gene_level_cor=mean(norm_cor))%>%
    group_by(method)%>%
    summarise(rank_score=sum(gene_level_cor)/n() , sd_score=sd(gene_level_cor)) %>%
    mutate(sd_score_norm=sd_score/rank_score)%>%
    arrange(-method)
  return(rank_scored_priv)
}

getSpearmanRank<-function(data_priv, i){
  spearman_joined_priv<-get_correlations(data_priv[i,])# %>%
    #filter(method!="Alph_null")

  rank_scored<-rank_score_spearm(spearman_joined_priv)
  return(unlist(rank_scored[,2]))
}


# do bootstrapping for rankscores
rank_scored_original_mat<-rank_score_spearm(get_correlations(validation_dataset))
set.seed(1)
boot_rank_rankscore<-boot(data=validation_dataset, statistic=getSpearmanRank, R=BOOT_REPETITIONS, parallel="multicore", ncpus=N_CPU) # parallel="multicore", ncpus=3
booted_rankscores_values<-as_tibble(boot_rank_rankscore$t)
colnames(booted_rankscores_values)<-rank_scored_original_mat$method
booted_rankscores_values<-compare_scores(booted_rankscores_values)
vals<-booted_rankscores_values %>% pivot_longer(cols=1:length(unique(rank_scored_original_mat$method)))

vals<-vals%>%
  mutate(name=factor(name, levels=COL_ORDER))



rankscore_plot<-ggplot(vals)+
  geom_boxplot(aes(x=name,y=value))+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, color="black", size=10))

ggsave(filename = "rankscore_plot.pdf", plot = rankscore_plot, width=5, height=5.5)

pValTable_RankScore<-get_p_value_table(booted_rankscores_values)
pValTable_RankScore<-pValTable_RankScore%>% 
  mutate(pval=1-num_true/total_num)
pValTable_RankScore
write_tsv(x=pValTable_RankScore, "pValTable_RankScore.tsv")


ggplot(spearmans_joined, aes(x=scoreID, y=abs_spear, color=method, label=method))+
  geom_jitter(position = position_jitter(seed = 1))+
  #geom_text(position = position_jitter(seed = 1))+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, color="black", size=10),
        axis.text.y = element_text(color="black", size=10))

R ggplot2 tidyverse optparse gridExtra pROC boot From line 1 of scripts/analyse_DMS_score.R

library(tidyverse)
library(readxl)
library(optparse)

option_list = list(
  make_option(c("-c", "--csv_location"), type="character", default="C:/Users/Karo.PC-Karo/Master/Semester_3/Lab_rotation_Ludwig/for_Karola/SSH/variants_preprocessed_samp.csv", 
              help="location of csv.gz file"),
  make_option(c("-o", "--out_folder"), type="character", default="C:/Users/Karo.PC-Karo/Master/Semester_3/Lab_rotation_Ludwig/for_Karola/local_files", 
              help="name of folder to store output")

)

opt = parse_args(OptionParser(option_list=option_list))

variants<-read_csv(opt$csv_location, na=c(".","NA"))

dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)

variants_gnomad<- variants %>% filter(gnomadSet==1)
variants_cv<- variants %>% filter(gnomadSet==0)
variants_unique_genes<-unique(variants$var_id_genomic)
variants_unique_proteins<-unique(variants$var_id_prot)
gnomad_outc1<-variants%>%filter(gnomadSet==1 & outcome==1)
gnomad_outc0<-variants%>%filter(gnomadSet==1 & outcome==0)
cv_outc1<-variants%>%filter(gnomadSet==0 & outcome==1)
cv_outc0<-variants%>%filter(gnomadSet==0 & outcome==0)
cv18_to_21_CV<- variants%>%filter(clinvar_holdout_test==1)
gene_no<-unique(variants$gene)
hold_out_genes<- variants%>%filter(hold_out_genes==1)
train1<-variants%>%filter(!(hold_out_genes==TRUE) & gnomadSet==TRUE)
test1<-variants%>%filter(clinvar_interim_test==TRUE)
test2<-variants %>% filter(clinvar_holdout_test==TRUE)
rest_train1<-variants%>%filter((hold_out_genes==TRUE) & gnomadSet==TRUE)
ensemblID_gene<-unique(variants$Ensembl_geneid)
ensemblID_protein<-unique(variants$Ensembl_proteinid)
uniprot<-unique(variants$Uniprot_entry)
uniprot_acc<-unique(variants$Uniprot_acc)
ensemblID_in_genes<-ensemblID_gene %in% variants_unique_genes == TRUE

var_tibble<-tibble(total_var=nrow(variants),
                        total_gnomad=nrow(variants_gnomad),
                        total_cv=nrow(variants_cv),
                        total_genes=length(variants_unique_genes),
                        total_proteins=length(variants_unique_proteins),                        
                        gnomad_outcome1=nrow(gnomad_outc1),
                        gmoad_outcome0=nrow(gnomad_outc0),
                        cv_outcome1=nrow(cv_outc1),
                        cv_outcome0=nrow(cv_outc0),
                        cv18_to_21_CV=nrow(cv18_to_21_CV),
                        gene_no=length(gene_no),
                        train1=nrow(train1),
                        test1=nrow(test1),
                        test2=nrow(test2),
                        rest_train1= nrow(rest_train1),
                        ensemblIDgene=length(ensemblID_gene),
                        uniprot=length(uniprot),
                        uniprot_acc=length(uniprot_acc),
                        ensemblID_in_genes=length(ensemblID_in_genes))
write_tsv(var_tibble,"characteristics_variants.tsv")

R tidyverse optparse readxl From line 1 of scripts/char_variants.R

director_combined_files=$1
out_dir=$2
first_file="$director_combined_files/"$(ls $director_combined_files | head -n1)

echo "using header of: $first_file"

mkdir -p $out_dir
mkdir -p $out_dir"/tmp"
zcat $first_file | head -n1 > $out_dir"/header.csv"
zcat_sting=""

# loop over directory content and extract collect non empty files with TRUE in filename 
for file in $(ls $director_combined_files)
do
if [[ $file == *"TRUE"* ]]; then
f_size=$(stat --printf="%s" $director_combined_files"/"$file)
if [ $f_size -gt 100 ]
then
zcat_string+=" ${director_combined_files}/${file}"
fi
fi
done

#relevant_cols=$(zcat results/validation_set/validation_set_w_AlphScore.csv.gz | awk -v RS=',' '/^b_factor$/{print NR} /^SOLVENT_ACCESSIBILITY_core$/{print NR} /^Uniprot_acc_split$/{print NR} /^gnomad_train$/{print NR} /AlphScore/{print NR; exit}' | tr "\n" ",")"2,3,4,5,6,7,8,9,10"


echo $zcat_string

cat $out_dir"/header.csv" > $out_dir"/header_short.csv"

zcat $zcat_string  | grep -v "chr,pos(1-based)," | sort -V -t, -T $out_dir"/tmp/" -k1 -k2 - | \
cat $out_dir"/header_short.csv" - | tr "," "\t" | bgzip -c > $out_dir"/all_possible_values_concat.csv.gz"

tabix -s 1 -b 2 -e 2 $out_dir"/all_possible_values_concat.csv.gz"

rm -rf $out_dir"/tmp"

Shell tabix From line 4 of scripts/combine_all_proteins_to_one_file.sh

import sys
import pandas as pd
import numpy as np
from operator import itemgetter


### for debuging
class testclass:
	def __init__(self):
		self.wildcards=["AF-A0A087WUL8-F11-model_v1"]
		self.output=["results/AF-A0A087WUL8-F11-model_v1__combined_features.csv.gz"]
		self.params=["results/pdb_features/"]

#snakemake=testclass()
offset=0
offset=( int(snakemake.wildcards[0].split("-")[2][1:] ) - 1) * 200

# generate a list with the number of interactions
possible_interactions=["aroaro", "arosul", "cationpi", "disulphide", "hbond_main_side", "hbond_side_side", "hydrophobic", "ionic", "within_radius"]
interaction_matrix=np.zeros((40000,len(possible_interactions)),np.int) # 40000 - longer than Titin
i=0
edges=[]
for interaction in possible_interactions:
	interactions_pd=pd.read_csv(snakemake.params[0] + "/result_" + interaction + ".csv", low_memory=False)
	interactions_np=interactions_pd.to_numpy()
	interactions_np[:,1] = interactions_np[:,1] + offset
	interactions_np[:,3] = interactions_np[:,3] + offset
	interactions_joined=np.append(interactions_np[:,1], interactions_np[:,3])
	interactions_np[:,0]=np.array([s.strip() for s in interactions_np[:,0]])
	interactions_np[:,2]=np.array([s.strip() for s in interactions_np[:,2]])
	for ele in interactions_joined:
		interaction_matrix[ele,i]=interaction_matrix[ele,i]+1
	i=i+1


# Read features from FEATURIZE Framework
header=pd.read_csv(snakemake.params[1],skipinitialspace=True).columns.to_list()

to_remove=["RAUTE","coord1", "coord2", "coord3", "RAUTE2", "SECONDARY_STRUCTURE2_IS_UNKNOWN", "SECONDARY_STRUCTURE1_IS_UNKNOWN", "RESIDUE_CLASS2_IS_UNKNOWN","RESIDUE_CLASS1_IS_UNKNOWN", "RESIDUE_NAME_IS_OTHER", "ATOM_TYPE_IS_OTHER", "ELEMENT_IS_OTHER", "Atom_name"]

for suff in ["_core", "_3A", "_6A", "_9A", "_12A"]:
	header_surround=[ el + suff  if el not in ["RESN_RESI_AT"] + to_remove else el for el in header] 
	features=pd.read_csv(snakemake.params[0] + "/" + snakemake.wildcards[0] + suff + ".txt", delimiter="\t", names=header_surround,skipinitialspace=True , low_memory=False)
	features["RESN_RESI"]=features.RESN_RESI_AT.str.split(":",expand=True).iloc[:,0]
	features["RESN"]=features.RESN_RESI.str[:3]
	features["RESI"]=features.RESN_RESI.str[3:]
	features=features.astype({"RESI": int})
	features.RESI=features.RESI + offset
	features["RESN_RESI"]=features["RESN"]+features["RESI"].astype(str)
	features.index=features.RESI
	features=features.drop(to_remove, axis=1)
	print(suff)
	if suff!="_core":
		features_joined=features_joined.join(features.drop(['RESN_RESI_AT', 'RESN_RESI', 'RESI','RESN'], axis=1))
	else:
		features_joined=features

# join interactions and FEATURE based values
interactions_for_join=pd.DataFrame(interaction_matrix, columns=possible_interactions) # offset already inserted
interactions_for_join=interactions_for_join.astype(int)
interactions_for_join.index.name="RESI" 
features_joined2=features_joined.join(interactions_for_join)

# join the bfactors
b_factors=pd.read_csv(snakemake.params[0] + "/b_factors.csv", delimiter=" ", names=["residue_number","atom_name","b_factor"], skipinitialspace=True, low_memory=False)
b_factors["residue_number"] = pd.to_numeric(b_factors["residue_number"]) + offset
b_factors.index=b_factors["residue_number"] ########### +
features_joined3=features_joined2.join(b_factors)

# join the HSEs
HSE=pd.read_csv(snakemake.params[0] + "/HSEs.csv", delimiter=",", skipinitialspace=True, low_memory=False)
print(HSE.columns)
HSE["RESI"] = pd.to_numeric(HSE["RESI"]) + offset
HSE.index=HSE["RESI"]
HSE=HSE.drop("RESI", axis=1) ########### +
features_joined4=features_joined3.join(HSE)


features_joined4[["pdb_file"]]=snakemake.wildcards[0]
features_joined4.to_csv(snakemake.output[0])

Python Pandas numpy From line 1 of scripts/combine_features_pdb_level.py

library(tidyverse)
library(readxl)
library(data.table)
library(optparse)

option_list = list(
  make_option(c("-m", "--mapping"), type="character", default="resources/mapping_genename_uniprot.tsv", 
              help="location of file that maps gene name in assay to UniProt ID"),
  make_option(c("-p", "--prot_folder"), type="character", default="results/predicted_prots_eval", 
              help="Folder with protein files with AlphScore predictions"),
  make_option(c("-a", "--suffix"), type="character", default="_w_AlphScore_red_FALSE.csv.gz", 
              help="suffix that protein files with AlphScore predictions have")
)
opt = parse_args(OptionParser(option_list=option_list))

list_uniprot_ids<-read_tsv(opt$mapping)
PREDICTED_PROT_SUFFIX=opt$suffix
folder_of_prots=opt$prot_folder

# MAPK1	/ Uniprot P28482 not in mapping file, as this was not mapped between Uniprot and dbNSFP

sheets<-c("ADRB2","BRCA1","CALM1", "HRAS", "MAPK1", "P53",  "PTEN", "SUMO1", "TPK1", "TPMT", "UBE2I")
scores_conc<-tibble()

for (gene in sheets){
  scores <- read_excel("resources/Source.xlsx", sheet=gene) %>% 
    select(variant, starts_with("DMS"))%>%
    mutate(gene=gene)%>%
    gather(key="DMS", value="DMS_val", -variant, -gene)
  scores_conc=rbind(scores_conc, scores)
}

#MSH2
scores <- read_excel("resources/MSH2_Jia_2020/1-s2.0-S0002929720304390-mmc2.xlsx", sheet="TableS5_AllVariants")%>%
  rename(variant=Variant, DMS=`LOF score`)%>%
  select(variant, starts_with("DMS"))%>%
  mutate(gene="MSH2")%>%
  gather(key="DMS", value="DMS_val", -variant, -gene)
scores_conc=rbind(scores_conc, scores)

#Abeta
scores <- read_excel("resources/Abeta_Seuma_2020/elife-63364-supp4-v2.xlsx", sheet="1 aa change")%>%
  mutate(variant=paste0(WT_AA, Pos, Mut))%>%
    rename( DMS_nscore=`nscore`)%>%
    select(variant, starts_with("DMS"))%>%
    mutate(gene="Abeta")%>%
    gather(key="DMS", value="DMS_val", -variant, -gene)
scores_conc=rbind(scores_conc, scores)


#VKOR1
scores <- read_excel("resources/VKOR1_Chiasson_2020_activity/elife-58026-fig1-data1-v1.xlsx")%>%
  rename( DMS_abundance_score=`abundance_score`)%>%
  rename( DMS_activity_score=`activity_score`)%>%
  select(variant, starts_with("DMS"))%>%
  mutate(gene="VKOR1")%>%
  gather(key="DMS", value="DMS_val", -variant, -gene)
scores_conc=rbind(scores_conc, scores)

scores_conc<-scores_conc %>% 
  filter(!is.na(DMS_val))%>%
  left_join(list_uniprot_ids, by=c("gene"="gene_name"))%>%
  rename(gene_dms=gene)%>% 
  filter(!is.na(uniprot_id))


files<-paste0(folder_of_prots, list_uniprot_ids$uniprot_id, PREDICTED_PROT_SUFFIX) 

values_joined<-lapply(files, function(x) { #  mc.cores = 1,
  return(fread(x, na.strings = c("NA","."))  ) } )

values_joined<-rbindlist(values_joined)

values_joined<-values_joined%>%
  mutate(variant=paste0(
str_replace_all(RESN_RESI, c( 
 "ALA"="A", "ARG"="R", "ASN"="N", "ASP"="D",
 "CYS"="C", "GLU"="E", "GLN"="Q", "GLY"="G",
 "HIS"="H", "ILE"="I", "LEU"="L", "LYS"="K",
 "MET"="M", "PHE"="F", "PRO"="P", "SER"="S",
 "THR"="T", "TRP"="W", "TYR"="Y", "VAL"="V")),
str_replace_all(to_AS, c( 
  "ALA"="A", "ARG"="R", "ASN"="N", "ASP"="D",
  "CYS"="C", "GLU"="E", "GLN"="Q", "GLY"="G",
  "HIS"="H", "ILE"="I", "LEU"="L", "LYS"="K",
  "MET"="M", "PHE"="F", "PRO"="P", "SER"="S",
  "THR"="T", "TRP"="W", "TYR"="Y", "VAL"="V"))
))

values_joined<-values_joined %>% 
  left_join(scores_conc, by=c("variant"="variant", "Uniprot_acc_split"="uniprot_id"))%>%
  filter(!is.na(DMS_val))%>%
  mutate(DMS_val=as.double(DMS_val))

dir.create("results/validation_set")
setwd("results/validation_set")

fwrite(x=values_joined, file="validation_set_w_AlphScore.csv.gz")

R tidyverse data.table optparse readxl From line 1 of scripts/compile_scores.R

library(readr)
library(dplyr)
library(optparse)
library(stringr)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-t", "--training_dataset"), type="character", default="results/prediction_final/pre_final_model_regular_variants.csv.gz", 
              help="File containing the training dataset"),
  make_option(c("-c", "--combined_model"), type="character", default="results/prediction_final/combined_model.RData", 
              help="File where the final combined logistic regression model should be saved to"),
  make_option(c("-i", "--training_var_ids"), type="character", default="results/prediction_final/training_var_ids.tsv", 
              help="IDs of variants that were in the training / test set")
)

opt = parse_args(OptionParser(option_list=option_list))

variants<-read_csv(opt$training_dataset, na=c(".","NA", NA))
variants$AlphScore<-variants$predicted_Alph

index_col<-get_index_col(variants)

variants$DEOGEN2_score_med<- unlist_score(variants$DEOGEN2_score, index_col)

gnomad_vars<-variants %>% 
  filter(gnomadSet==1)

clinvar_vars_not_gnomad <-variants%>% 
  filter(gnomadSet==0) %>%
  filter(! var_id_genomic %in% gnomad_vars$var_id_genomic )

set_of_models<-fit_set_of_models(clinvar_vars_not_gnomad)

gnomad_set_position<-gnomad_vars$var_id_genomic
clinvar_set_position<-clinvar_vars_not_gnomad$var_id_genomic

training_vars<-rbind(
tibble(var_id_genomic=gnomad_set_position,
       gnomadSet=1),
tibble(var_id_genomic=clinvar_set_position,
       gnomadSet=0)
)
# write to disk
write_tsv(x=training_vars,
          file=opt$training_var_ids)

write_rds(x=set_of_models,
          file=opt$combined_model)

R dplyr stringr readr optparse From line 1 of scripts/create_final_combined_models.R

library(data.table)

files <- list.files(path=snakemake@params[["loaction_csvgzs"]], pattern=".csv.gz", full.names=TRUE, recursive=FALSE)
for (x_files in 1:length(files)){
  if (file.info(files[x_files])$size < 1000){
    files[x_files]<-NA
  }
}
print(files)
files<-files[!is.na(files)]

values_joined<-lapply(files, function(x) { #  mc.cores = 1,
  training_data<-fread(x, na.strings = c("NA","."))[!is.na(CADD_raw_sur), ] # load data

  singletons <- training_data[
    ((is.na(gnomAD_exomes_AC) | (gnomAD_exomes_AC<2 & (gnomAD_exomes_NFE_AC == gnomAD_exomes_AC) ) )  & 
    gnomAD_genomes_AN>90000 &
    (!gnomAD_genomes_flag %in% c("lcr","segdup"))&
    gnomAD_genomes_AC==1 & 
    gnomAD_genomes_NFE_AC==1) & 
    (is.na(`1000Gp3_AC`) | `1000Gp3_AC`==0)  & 
    (is.na(ESP6500_AA_AC) | ESP6500_AA_AC==0) & 
    (is.na(ESP6500_EA_AC) | ESP6500_AA_AC==0) 
    ]

  frequents<- training_data[
    gnomAD_genomes_AF> 0.001 & 
    gnomAD_genomes_AN>90000 & 
    (!gnomAD_genomes_flag %in% c("lcr","segdup"))
    ]

  clinvars<- training_data[
    clinvar_clnsig %in% c("Likely_pathogenic","Pathogenic","Benign", "Likely_benign")
    ]

  return(rbindlist(list(singletons[,`:=`(outcome=1, gnomadSet=1)],
                        frequents[,`:=` (outcome=0, gnomadSet=1)], 
                        clinvars[,`:=` (outcome=ifelse(clinvar_clnsig %in% c("Likely_pathogenic","Pathogenic"),1,0), gnomadSet=0)]), 
                   fill=TRUE, use.names=TRUE))
})

values_joined<-rbindlist(values_joined)

fwrite(values_joined, file=snakemake@output[["csv_gz_out"]])

R data.table From line 1 of scripts/extract_clinvar_gnomad_sets.R

library(tidyverse)
library(readxl)
library(dplyr)
library(optparse)

option_list = list(
  make_option(c("-t", "--tsv_location"), type="character", default="../results/prediction/pre_final_model_k_fold_results.tsv", 
              help="location of .tsv file"),
  make_option(c("-o", "--out_folder"), type="character", default="../results/prediction/plot_k/", 
              help="name of folder to store output")
)

opt = parse_args(OptionParser(option_list=option_list))

output_loop<-read.table(opt$tsv_location, sep="\t",header=TRUE)

dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)

# rename?
output_loop_num<-select_if(output_loop, is.numeric)


#output_loop_mean<-data.frame()
output_loop_mean<-output_loop %>%
  summarise_if(is.numeric,mean)%>%
  t()

output_loop_sd<-data.frame()
output_loop_sd<-output_loop%>%
  summarise_if(is.numeric,sd)%>%
  t()

df<-data.frame(colnames_cv=colnames(output_loop_num),mean=output_loop_mean,sd=output_loop_sd) 

df_sel<-df%>%
  filter(str_detect(colnames_cv, "^auc"))

df_sel$colnames_cv<-sub("auc_", "", df_sel$colnames_cv)

barplot_cv<-ggplot(df_sel, aes(x=reorder(colnames_cv, -mean), y=mean))+
  geom_bar(stat = "identity",width = 0.75, position="dodge")+
  geom_errorbar(aes(ymin=mean - sd,ymax=mean + sd), width=0.4)+
  geom_text(aes(x=reorder(colnames_cv, -mean), y=mean+0.014, label=round(mean, digits=3)), 
            position = position_dodge(0.9),
            vjust = -0.5, 
            size = 5, colour = "black", check_overlap = FALSE)+
  coord_cartesian(ylim=c(0,1))+
  theme_minimal()+
  labs(x="prediction method",y="mean AUC")

barplot_cv

ggsave(filename= "barplot_mean_auc_crossval.pdf",plot=barplot_cv, width=190, height=100, units="mm", dpi=300)

R tidyverse dplyr optparse readxl From line 1 of scripts/Fig_auc_cv.R

library(tidyverse)
library(pROC)
library(readxl)
library(ranger)
library(optparse)

option_list = list(
  make_option(c("-i", "--input_impurity"), type="character", default="results/prediction_final/final_regular_impurity_importance.tsv", 
              help="location of .tsv file when importance set to impurity"),
  make_option(c("-m", "--input_permutation"), type="character", default="results/prediction_final/final_regular_permutation_importance.tsv", 
              help="location of .tsv file when importance set to impurity"),
  make_option(c("-o", "--out_folder"), type="character", default="results/plot_k/", 
              help="name of folder to store output"),
  make_option(c("-p", "--prefix"), type="character", default="base_tree_2000_10", 
              help="Prefix for output")
  )

opt = parse_args(OptionParser(option_list=option_list))

importance_table_imp<-read.table(opt$input_impurity, sep="\t",header=TRUE)
importance_table_per<-read.table(opt$input_permutation, sep="\t",header=TRUE)

dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)


plot_variable_importance<-function(var_imp_ds){
var_importance_plot<-ggplot(var_imp_ds, aes(x=reorder(variable,-importance), y=importance))+ 
  geom_bar(stat="identity", position="dodge")+ 
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, color="black", size=10),
        axis.text.y = element_text(color="black", size=10))+
  labs(y = "Variable Importance", size=12)+
  labs(x = "")
return(var_importance_plot)
}

var_imp_imp<-importance_table_imp[order(-importance_table_imp$importance),]%>%
  slice(1:25)

var_importance_imp_plot<-plot_variable_importance(var_imp_imp)

ggsave(filename=paste0(opt$prefix,"_importance_impurity.pdf"), 
       plot=var_importance_imp_plot, width = 6, height = 6)


#Variable importance mode = permutation
var_imp_per<-importance_table_per[order(-importance_table_per$importance),]%>%
  slice(1:25)

var_importance_per_plot<-plot_variable_importance(var_imp_per)

ggsave(filename=paste0(opt$prefix,"_importance_permutation.pdf"), 
       plot=var_importance_per_plot, width = 6, height = 6)

R tidyverse optparse readxl pROC ranger From line 1 of scripts/Fig_feat_imp.R

library(tidyverse)
library(readxl)
library(optparse)

option_list = list(
  make_option(c("-p", "--input_preprocessed"), type="character", default="C:/Users/Karo.PC-Karo/Master/Semester_3/Lab_rotation_Ludwig/for_Karola/SSH/gnomad_extracted_prepro.csv.gz", 
              help="location of csv.gz file with preprocessed, uncalibrated variants"),
  make_option(c("-r", "--input_recalibrated"), type="character", default="C:/Users/Karo.PC-Karo/Master/Semester_3/Lab_rotation_Ludwig/for_Karola/SSH/gnomad_extracted_prepro_recal.csv.gz", 
              help="location of csv.gz file with preprocessed, recalibrated variants"),
  make_option(c("-o", "--out_folder"), type="character", default="C:/Users/Karo.PC-Karo/Master/Semester_3/Lab_rotation_Ludwig/for_Karola/local_files", 
              help="name of folder to store output")
)

opt = parse_args(OptionParser(option_list=option_list))

variants_pre<-read_csv(opt$input_preprocessed, na=c(".","NA"))
variants_recal<-read_csv(opt$input_recalibrated, na=c(".","NA"))

dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)

from_AS_pre<-variants_pre %>%
  filter(!(hold_out_genes==TRUE) & gnomadSet==TRUE)%>%
  group_by(from_AS) %>%
  summarise(frac=mean(outcome), num=n())

plot_pre<- ggplot(from_AS_pre,aes(x=reorder(from_AS, -frac), y=frac)) + 
  geom_bar(stat = "identity")+
  geom_text(aes(x=reorder(from_AS, -frac), y=frac, label=num),
            position = position_dodge(width = 1),
            vjust = -0.5, 
            size = 2.5, colour = "black", check_overlap = TRUE)+
  theme_minimal()+
  labs(x="reference AA",y="proportion of proxy-pathogenic AAs")

ggsave("barplot_preprocessed.pdf", width=200, height=100, units="mm", dpi=300)

order_AA_pre<- from_AS_pre[order(-from_AS_pre$frac),]
order_AA_pre<-order_AA_pre$from_AS

####recalibrated

from_AS_recal<-variants_recal %>% 
  filter(!(hold_out_genes==TRUE) & gnomadSet==TRUE)%>%
  group_by(from_AS) %>%
  summarise(frac=mean(outcome), num=n())

from_AS_recal<-from_AS_recal[match(order_AA_pre, from_AS_recal$from_AS),]
from_AS_recal$from_AS<-factor(from_AS_recal$from_AS, levels=from_AS_recal$from_AS)

plot_recal<-ggplot(from_AS_recal,aes(x=from_AS, y=frac)) + 
  geom_bar(stat = "identity")+
  geom_text(aes(x=from_AS, y=frac, label=num),
            position = position_dodge(width = 1),
            vjust = -0.5, 
            size = 2.5, colour = "black", check_overlap = TRUE)+
  theme_minimal()+
  labs(x="reference AA",y="proportion of proxy-pathogenic AAs")

# get stats of training / testing data set:


ggsave("barplot_recalibrated.pdf", width=200, height=100, units="mm", dpi=300)

R ggplot2 tidyverse optparse readxl From line 1 of scripts/Fig_recal.R

library(tidyverse)
library(optparse)
library(pROC)
library(PRROC)
library(VennDiagram)
library(boot)

source("workflow/scripts/existing_scores_glm_functions.R")
source("workflow/scripts/precision_recall_resource.R")

BOOT_REPETITIONS=1000
set.seed(1)

option_list = list(
  make_option(c("-b", "--clinvar_benign"), type="character", default="results/clinvar2022/clinvar_2022_benign.vcf.gz", 
              help="location of ClinVar benign missense variants in vcf like format"),
  make_option(c("-p", "--clinvar_pathogenic"), type="character", default="results/clinvar2022/clinvar_2022_pathogenic.vcf.gz", 
              help="location of ClinVar benign missense variants in vcf like format"),
  make_option(c("-a", "--AlphaFold_scores"), type="character", default="results/clinvar2022/values_of_clinvar_variants.tsv.gz", 
              help="pre calculated Alphafold scores"),
  make_option(c("-v", "--variants"), type="character", default="results/prediction_final/pre_final_model_regular_variants.csv.gz", 
              help="variants with calculated AlphScore"),
  make_option(c("-c", "--CV_training"), type="logical", default=FALSE, 
              help="Were ClinVar variants used as training set?"),
  make_option(c("-s", "--protein_subset"), type="character", default="resources/membrane_proteins.txt", 
              help="Subset of proteins to evaluate separately"),
  make_option(c("-o", "--out_folder"), type="character", default="results/clinvar2022_alphafold/", 
              help="name of folder to store output")
              )
opt = parse_args(OptionParser(option_list=option_list))

# load data, add variant ID (chrom:pos:ref:alt)
clinvar_benign<-read_tsv(opt$clinvar_benign, col_names=TRUE, col_types = cols(.default = "?",  CHROM = col_character()) ) %>%
  mutate(ID=paste(CHROM,POS,REF,ALT, sep=":"))
clinvar_pathogenic<-read_tsv(opt$clinvar_pathogenic, col_names=TRUE, col_types = cols(.default = "?",  CHROM = col_character()) ) %>%
  mutate(ID=paste(CHROM,POS,REF,ALT, sep=":"))
alphafold_pre_calculated<-read_tsv(opt$AlphaFold_scores, col_names=TRUE, 
                                   col_types = cols(.default = "?", REVEL_score = "d", "#chr" = col_character()))%>%
  mutate(ID=paste(`#chr`, `pos(1-based)`, ref, alt, sep=":"))
variants<-read_csv(opt$variants, 
                   col_types = cols(.default = "?", REVEL_score = "d", "#chr"= col_character() ))

protein_subset<-scan(file = opt$protein_subset, what=character())


dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)

head(alphafold_pre_calculated)

alphafold_pre_calculated$pos_in_VEP_and_Uniprot<-get_index_col(alphafold_pre_calculated)
alphafold_pre_calculated$DEOGEN2_score_med<-unlist_score(alphafold_pre_calculated$DEOGEN2_score, alphafold_pre_calculated$pos_in_VEP_and_Uniprot)


all_clinvar_ids<-c(clinvar_benign$ID, clinvar_pathogenic$ID)

alphafold_pre_calculated_w_CV2022<-alphafold_pre_calculated %>%
  filter(ID %in% all_clinvar_ids)%>%
  mutate(outcome=(ID %in% clinvar_pathogenic$ID))%>%
  filter(!is.na(AlphScore))


variants$AlphScore<-variants$predicted_Alph

variants$pos_in_VEP_and_Uniprot<-get_index_col(variants)
variants$DEOGEN2_score_med<-unlist_score(variants$DEOGEN2_score, variants$pos_in_VEP_and_Uniprot)

variants<-variants%>% 
  mutate(ID=paste(`#chr`, `pos(1-based)`,ref, sep=":"))




non_test_dataset<-variants %>% 
  filter(clinvar_holdout_test==FALSE)
test_dataset<-variants %>% 
  filter(clinvar_holdout_test==TRUE)

alphafold_pre_calculated_w_CV2022<-alphafold_pre_calculated_w_CV2022 %>%
  mutate(ID=paste(`#chr`, `pos(1-based)`,ref,  sep=":"))%>%
  mutate(prot_subset=Uniprot_acc_split %in% protein_subset)

### combine scores on interim dataset:
if (opt$CV_training){
  train_ds<-variants %>% 
    filter(clinvar_train==TRUE)
  interim_dataset<-variants %>% 
    filter(gnomad_interim_test==TRUE)

}else{
  interim_dataset<-variants %>% 
    filter(clinvar_interim_test==TRUE)
  train_ds<-variants %>% 
    filter(gnomad_train==TRUE)
}

alphafold_pre_calculated_w_CV2022<-alphafold_pre_calculated_w_CV2022 %>%
  filter(!ID %in% interim_dataset$ID)%>%
  filter(!ID %in% non_test_dataset$ID)%>%
  filter(!ID %in% train_ds$ID)

set_of_models<-fit_set_of_models(interim_dataset)
testSet<-predict_set_of_models(set_of_models, alphafold_pre_calculated_w_CV2022)

print("Number of protein-Variants in testSet:")
nrow(testSet)


# ensure that no variant is multiple times in the data set; if so, take the mean of the predictors, ensure, that the variant is rated consistently benign / pathogenic
testSet<-testSet %>% 
  group_by(ID) %>%
  summarise(AlphScore=mean(AlphScore), 
            REVEL_score=mean(REVEL_score), 
            glm_AlphRevel=mean(glm_AlphRevel), 
            Alph_null=mean(Alph_null),
            CADD_raw=mean(CADD_raw), 
            glm_AlphCadd=mean(glm_AlphCadd), 
            DEOGEN2_score_med=mean(DEOGEN2_score_med), 
            glm_AlphDeogen=mean(glm_AlphDeogen), 
            outcome=mean(outcome),
            prot_subset=max(prot_subset))%>%
  filter(outcome %in% c(0,1))%>%
  mutate(outcome=as.logical(outcome))

nrow(testSet)
sum(complete.cases(testSet))

sum(testSet$ID %in% test_dataset$ID)
sum(testSet$ID %in% variants$ID)

testSet<-testSet[complete.cases(testSet),]

table(testSet$outcome)



score_performance_tbl<-tibble()

pdf(file="ClinVar_val_REVEL.pdf")
plot(roc(as.integer(testSet$outcome), testSet$AlphScore), print.auc = TRUE, 
       col = "black", print.auc.y = .35)
  plot(roc(testSet$outcome, testSet$REVEL_score), print.auc = TRUE, 
       col = "blue", print.auc.y = .25, add = TRUE)
  plot(roc(testSet$outcome, testSet$glm_AlphRevel), print.auc = TRUE, 
       col = "red", print.auc.y = .15, add = TRUE)
  plot(roc(testSet$outcome, testSet$Alph_null), print.auc = TRUE, 
       col = "grey", print.auc.y = .05, add = TRUE)
  legend(0.2, 0.3, legend=c("AlphScore", "REVEL", "AlphScore + REVEL", "NullModel"),
         col=c("black", "blue","red", "grey"), lty=1, cex=0.8)
dev.off()

pdf(file="ClinVar_val_CADD.pdf")
  plot(roc(as.integer(testSet$outcome), testSet$AlphScore), print.auc = TRUE, 
       col = "black", print.auc.y = .35)
  plot(roc(testSet$outcome, testSet$CADD_raw), print.auc = TRUE, 
       col = "blue", print.auc.y = .25, add = TRUE)
  plot(roc(testSet$outcome, testSet$glm_AlphCadd), print.auc = TRUE, 
       col = "red", print.auc.y = .15, add = TRUE)
  plot(roc(testSet$outcome, testSet$Alph_null), print.auc = TRUE, 
       col = "grey", print.auc.y = .05, add = TRUE)
  legend(0.2, 0.3, legend=c("AlphScore", "CADD", "AlphScore + CADD", "NullModel"),
         col=c("black", "blue","red", "grey"), lty=1, cex=0.8)
dev.off()

pdf(file="ClinVar_val_DEOGEN2.pdf")
  plot(roc(as.integer(testSet$outcome), testSet$AlphScore), print.auc = TRUE, 
       col = "black", print.auc.y = .35)
  plot(roc(testSet$outcome, testSet$DEOGEN2_score_med), print.auc = TRUE, 
       col = "blue", print.auc.y = .25, add = TRUE)
  plot(roc(testSet$outcome, testSet$glm_AlphDeogen), print.auc = TRUE, 
       col = "red", print.auc.y = .15, add = TRUE)
  plot(roc(testSet$outcome, testSet$Alph_null), print.auc = TRUE, 
       col = "grey", print.auc.y = .05, add = TRUE)
  legend(0.2, 0.3, legend=c("AlphScore", "DEOGEN2", "AlphScore + DEOGEN2", "NullModel"),
         col=c("black", "blue","red", "grey"), lty=1, cex=0.8)
dev.off()





AlphNullPlot<-ggplot(testSet)+
  geom_density(aes(x=Alph_null, fill=outcome), alpha=0.5)+
  theme_bw()+
  labs(fill = "(likely)\npathogenic:")+
  scale_fill_manual(values=c("darkblue", "red"))
ggsave(filename="AlphNullPlot.pdf", 
       plot=AlphNullPlot,
       width=5, height=3)
alph_null_table<-generate_table("Alph_null")
write_tsv(x=alph_null_table,
          file="alph_null_table.tsv")

AlphScorePlot<-ggplot(testSet)+
  geom_density(aes(x=AlphScore, fill=outcome), alpha=0.5)+
  theme_bw()+
  labs(fill = "(likely)\npathogenic:")+
  scale_fill_manual(values=c("darkblue", "red"))
ggsave(filename="AlphScorePlot.pdf", 
       plot=AlphScorePlot,
       width=5, height=3)
AlphScore_table<-generate_table("AlphScore")
write_tsv(x=alph_null_table,
          file="alph_null_table.tsv")

glm_AlphRevel_plot<-ggplot(testSet)+
  geom_density(aes(x=glm_AlphRevel, fill=outcome), alpha=0.5)+
  theme_bw()+
  labs(fill = "(likely)\npathogenic:")+
  scale_fill_manual(values=c("darkblue", "red"))
ggsave(filename="glm_AlphRevel_plot.pdf", 
       plot=glm_AlphRevel_plot,
       width=5, height=3)
glm_AlphRevel_table<-generate_table("glm_AlphRevel")
write_tsv(x=glm_AlphRevel_table,
          file="glm_AlphRevel_table.tsv")

REVEL_score_plot<-ggplot(testSet)+
  geom_density(aes(x=REVEL_score, fill=outcome), alpha=0.5)+
  theme_bw()+
  labs(fill = "(likely)\npathogenic:")+
  scale_fill_manual(values=c("darkblue", "red"))
ggsave(filename="REVEL_score_plot.pdf", 
       plot=REVEL_score_plot,
       width=5, height=3)
REVEL_score_table<-generate_table("REVEL_score")
write_tsv(x=REVEL_score_table,
          file="REVEL_score_table.tsv")

CADD_raw_plot<-ggplot(testSet)+
  geom_density(aes(x=CADD_raw, fill=outcome), alpha=0.5)+
  theme_bw()+
  labs(fill = "(likely)\npathogenic:")+
  scale_fill_manual(values=c("darkblue", "red"))
ggsave(filename="CADD_raw_plot.pdf", 
       plot=CADD_raw_plot,
       width=6, height=4)
CADD_score_table<-generate_table("CADD_raw")
write_tsv(x=CADD_score_table,
          file="CADD_score_table.tsv")

glm_AlphCadd_plot<-ggplot(testSet)+
  geom_density(aes(x=glm_AlphCadd, fill=outcome), alpha=0.5)+
  theme_bw()+
  labs(fill = "(likely)\npathogenic:")+
  scale_fill_manual(values=c("darkblue", "red"))
ggsave(filename="glm_AlphCadd_plot.pdf", 
       plot=glm_AlphCadd_plot,
       width=5, height=3)
glm_AlphCadd_table<-generate_table("glm_AlphCadd")
write_tsv(x=glm_AlphCadd_table,
          file="glm_AlphCadd_table.tsv")


pr_cadd<-pr.curve(scores.class0=testSet$CADD_raw, weights.class0=testSet$outcome, curve=T)
pr_alphCadd<-pr.curve(scores.class0=testSet$glm_AlphCadd, weights.class0=testSet$outcome, curve=T)
pr_alph<-pr.curve(scores.class0=testSet$AlphScore, weights.class0=testSet$outcome, curve=T)
pr_alph_null<-pr.curve(scores.class0=testSet$Alph_null, weights.class0=testSet$outcome, curve=T)

pdf(file="proc_auc_CADD.pdf")
  plot(pr_alphCadd, color="red")
  plot(pr_cadd, add=TRUE, color="blue")
  plot(pr_alph, add=TRUE, color="black")
  plot(pr_alph_null, add=TRUE, color="grey")
  legend(0.2, 0.3, legend=c("AlphScore", "CADD", "AlphScore + CADD", "NullModel"),
         col=c("black", "blue","red", "grey"), lty=1, cex=0.8)
dev.off()


pr_revel<-pr.curve(scores.class0=testSet$REVEL_score, weights.class0=testSet$outcome, curve=T)
pr_alphRevel<-pr.curve(scores.class0=testSet$glm_AlphRevel, weights.class0=testSet$outcome, curve=T)

pdf(file="proc_auc_REVEL.pdf")
  plot(pr_alphRevel, color="red")
  plot(pr_revel, add=TRUE, color="blue")
  plot(pr_alph, add=TRUE, color="black")
  plot(pr_alph_null, add=TRUE, color="grey")
  legend(0.2, 0.3, legend=c("AlphScore", "REVEL", "AlphScore + REVEL", "NullModel"),
         col=c("black", "blue","red", "grey"), lty=1, cex=0.8)
dev.off()

pr_deogen<-pr.curve(scores.class0=testSet$DEOGEN2_score_med, weights.class0=testSet$outcome, curve=T)
pr_alphDeogen<-pr.curve(scores.class0=testSet$glm_AlphDeogen, weights.class0=testSet$outcome, curve=T)

pdf(file="proc_auc_DEOGEN2.pdf")
  plot(pr_alphDeogen, color="red")
  plot(pr_deogen, add=TRUE, color="blue")
  plot(pr_alph, add=TRUE, color="black")
  plot(pr_alph_null, add=TRUE, color="grey")
  legend(0.2, 0.3, legend=c("AlphScore", "DEOGEN2", "AlphScore + DEOGEN2", "NullModel"),
         col=c("black", "blue","red", "grey"), lty=1, cex=0.8)
dev.off()



# get aucs and do bootstrapping
get_score_performance_table<-function(testSet_private, i){
  testSet_private<-testSet_private[i,]
  score_performance_tbl_private <-
    tibble(
      Alph_ROC=roc(testSet_private$outcome, testSet_private$AlphScore)$auc, 
      Alph_null_ROC=roc(testSet_private$outcome, testSet_private$Alph_null)$auc, 
      CADD_ROC=roc(testSet_private$outcome, testSet_private$CADD_raw)$auc, 
      REVEL_ROC=roc(testSet_private$outcome, testSet_private$REVEL_score)$auc,
      DEOGEN2_ROC=roc(testSet_private$outcome, testSet_private$DEOGEN2_score_med)$auc, 
      AlphCadd_ROC=roc(testSet_private$outcome, testSet_private$glm_AlphCadd)$auc, 
      AlphDeogen_ROC=roc(testSet_private$outcome, testSet_private$glm_AlphDeogen)$auc, 
      AlphRevel_ROC=roc(testSet_private$outcome, testSet_private$glm_AlphRevel)$auc, 

      Alph_PROC=pr.curve(weights.class0=testSet_private$outcome, scores.class0=testSet_private$AlphScore)$auc.integral, 
      Alph_null_PROC=pr.curve(weights.class0=testSet_private$outcome, scores.class0=testSet_private$Alph_null)$auc.integral, 
      CADD_PROC=pr.curve(weights.class0=testSet_private$outcome, scores.class0=testSet_private$CADD_raw)$auc.integral, 
      REVEL_PROC=pr.curve(weights.class0=testSet_private$outcome, scores.class0=testSet_private$REVEL_score)$auc.integral,
      DEOGEN2_PROC=pr.curve(weights.class0=testSet_private$outcome, scores.class0=testSet_private$DEOGEN2_score_med)$auc.integral, 
      AlphCadd_PROC=pr.curve(weights.class0=testSet_private$outcome, scores.class0=testSet_private$glm_AlphCadd)$auc.integral, 
      AlphDeogen_PROC=pr.curve(weights.class0=testSet_private$outcome, scores.class0=testSet_private$glm_AlphDeogen)$auc.integral, 
      AlphRevel_PROC=pr.curve(weights.class0=testSet_private$outcome, scores.class0=testSet_private$glm_AlphRevel)$auc.integral,             

      num_test=nrow(testSet_private) )

  return(score_performance_tbl_private)
}


get_score_performance_table_boot<-function(testSet_private,i){
  return(unlist(get_score_performance_table(testSet_private,i)))
}

score_performance_tbl<-get_score_performance_table(testSet, TRUE)

write_tsv(x=score_performance_tbl, 
          file="score_performance_tbl.tsv")


score_performance_tbl_subset<-get_score_performance_table(testSet %>% filter(prot_subset==1), TRUE)
score_performance_tbl_NONsubset<-get_score_performance_table(testSet %>% filter(prot_subset==0), TRUE)

counts_test_set<-testSet %>% 
  group_by(outcome, prot_subset)%>%
  summarise(n=n())

write_tsv(x=counts_test_set,
          file="counts_test_set.tsv")

write_tsv(x=score_performance_tbl_subset, 
          file="score_performance_tbl_subset.tsv")

write_tsv(x=score_performance_tbl_NONsubset, 
          file="score_performance_tbl_NONsubset.tsv")



compare_scores<-function(booted_values_private){
  booted_values_private_summarised<-booted_values_private %>%
    rowwise%>%
    mutate(rowmax=max(across()))%>%
    mutate(AlphCadd_biggerThanCadd_ROC=AlphCadd_ROC>CADD_ROC)%>%
    mutate(AlphDeogen_biggerThanDeogen_ROC=AlphDeogen_ROC>DEOGEN2_ROC)%>%
    mutate(AlphRevel_biggerThanRevel_ROC=AlphRevel_ROC>REVEL_ROC)%>%
    mutate(AlphCadd_biggerThanCadd_PROC=AlphCadd_PROC>CADD_PROC)%>%
    mutate(AlphDeogen_biggerThanDeogen_PROC=AlphDeogen_PROC>DEOGEN2_PROC)%>%
    mutate(AlphRevel_biggerThanRevel_PROC=AlphRevel_PROC>REVEL_PROC)
  return(booted_values_private_summarised)
}

get_p_value_table<-function(booted_values_summarised_private){
  relevant_scores<-c("AlphDeogen_biggerThanDeogen_ROC","AlphRevel_biggerThanRevel_ROC","AlphCadd_biggerThanCadd_ROC",
                     "AlphDeogen_biggerThanDeogen_PROC","AlphRevel_biggerThanRevel_PROC","AlphCadd_biggerThanCadd_PROC")
  p_val_tabl<-tibble()
  for (columname in relevant_scores){
    p_val_tabl<- rbind(p_val_tabl, 
                       tibble(name=columname, num_true=sum(unlist(booted_values_summarised_private[,columname])), total_num=nrow(booted_values_summarised_private)))
  }
  return(p_val_tabl)
}

boot_aucs<-boot(data=testSet, statistic=get_score_performance_table_boot, R=BOOT_REPETITIONS, parallel="multicore", ncpus=3)
booted_aucs_table<-as_tibble(boot_aucs$t)
colnames(booted_aucs_table)<-colnames(score_performance_tbl)
booted_aucs_table<-compare_scores(booted_aucs_table)
pValTable_AbsCor<-get_p_value_table(booted_aucs_table)
pValTable_AbsCor<-pValTable_AbsCor%>% 
  mutate(pval=1-num_true/total_num)
pValTable_AbsCor

write_tsv(x = pValTable_AbsCor,
          file = "pValTable_AbsCor.tsv")



# Diagram with ROCs
plot_aucs_clinvar_bar<-function(perf_table, out_file){
  score_performance_tbl_spread<-perf_table%>%
    select(-num_test)%>%
    gather(key="method")%>%
    arrange(value)

  score_performance_tbl_spread<-score_performance_tbl_spread%>%
    mutate(method=factor(method, levels=c("Alph_null_ROC","Alph_ROC","CADD_ROC","AlphCadd_ROC","DEOGEN2_ROC","AlphDeogen_ROC","REVEL_ROC","AlphRevel_ROC")))%>%
    filter(!is.na(method))

  plot_aucs_ClinVar<-ggplot(score_performance_tbl_spread, aes(x=method, y=value))+
    stat_summary(fun.y = mean, geom = "bar") + 
    theme_minimal()+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, color="black", size=10),
          axis.text.y = element_text(color="black", size=10))+
    coord_cartesian(ylim=c(0.5,1)) + 
    labs(x = "")+
    labs(y = "AUC (ClinVar test set)", size=12)

  plot_aucs_ClinVar
  ggsave(filename= out_file, plot=plot_aucs_ClinVar, height=5, width=4)
}

plot_aucs_clinvar_bar(score_performance_tbl,"plot_aucs_ClinVar.pdf")
plot_aucs_clinvar_bar(score_performance_tbl_subset,"plot_aucs_ClinVar_subset.pdf")
plot_aucs_clinvar_bar(score_performance_tbl_NONsubset,"plot_aucs_ClinVar_NOsubset.pdf")



datasets_venn<-venn.diagram(x=list(train_ds$ID, interim_dataset$ID, testSet$ID), 
             category.names = c("gnomAD_train" , "ClinVar_validation", "ClinVar_test/hold-out"), 
             filename = NULL)

ggsave(datasets_venn, file="datasets_venn.pdf", device = "pdf", width=6, height=6)

R tidyverse optparse pROC VennDiagram boot PRROC From line 1 of scripts/get_performance_clinvar_2022.R

import sys
import pandas as pd
import numpy as np
from operator import itemgetter
import networkx as nx
import biographs as bg
import os

def exit_procedure(text_to_log):
	os.system('touch ' + snakemake.output[0])
	outF = open(snakemake.output[0]+".log", "w")
	outF.write(text_to_log)
	outF.close()
	print(text_to_log)
	os._exit(0)

#results/{pdb_name}_combined_features.csv.gz
class testclass:
	def __init__(self):
		self.wildcards=["AF-A0A087WUL8-F11-model_v1"]
		self.output=["results/AF-A0A087WUL8-F11-model_v1_features.tsvneighbour_vals.csv"]
		self.input=["results/AF-A0A087WUL8-F11-model_v1__combined_features.csv.gz"]
		self.params=["/media/axel/Dateien/Arbeit_Gen/alphafold2/dbnsfp_results/"]

#snakemake=testclass()

offset=0
offset=( int(snakemake.wildcards[0].split("-")[2][1:] ) - 1) * 200

#print(snakemake.wildcards, snakemake.output, snakemake.params)


pdb_path=snakemake.params[1]
os.system('gunzip -c "' + pdb_path + '".gz > ' +pdb_path)
molecule = bg.Pmolecule(pdb_path)
os.system('rm -f "' + pdb_path+ '"')

# biopython molecule structural model
mol_model = molecule.model
# networkx graph, by default 5 angstrom
network = molecule.network()
network = molecule.network(cutoff=4, weight=True)

start_resi=int(list(network.nodes)[0][1:])
network_relabeled=nx.convert_node_labels_to_integers(network, start_resi + offset)



#### read combined features

features_joined3=pd.read_csv(snakemake.input[0] ,low_memory=False)

#### read dbNSFP

dbnsfp=pd.read_csv(snakemake.params[0] + snakemake.wildcards[0].split("-")[1] + ".csv.gz", na_values=".", low_memory=False)


#### aggregate features of nodes
cons_scores=['CADD_raw','CADD_phred','BayesDel_noAF_score','REVEL_score', 'GERP++_NR', 'GERP++_RS', 'GERP++_RS_rankscore', 'phyloP100way_vertebrate', 'phyloP100way_vertebrate_rankscore', 'phyloP30way_mammalian', 'phyloP30way_mammalian_rankscore', 'phyloP17way_primate', 'phyloP17way_primate_rankscore', 'phastCons100way_vertebrate', 'phastCons100way_vertebrate_rankscore', 'phastCons30way_mammalian', 'phastCons30way_mammalian_rankscore', 'phastCons17way_primate', 'phastCons17way_primate_rankscore'] # removed:  'Polyphen2_HDIV_score', 'Polyphen2_HDIV_rankscore', 'Polyphen2_HDIV_pred', 'Polyphen2_HVAR_score', 'SIFT_score', 'SIFT_converted_rankscore', 'SIFT_pred', 'SiPhy_29way_pi'

def convert_int(num_str):
	try:
		num_int=int(num_str)
		return(num_int)
	except:
		return(0)

dbnsfp.HGVSp_VEP_split=dbnsfp.HGVSp_VEP_split.replace("p.Met1?","p.Met1???")
num_string=dbnsfp.HGVSp_VEP_split.str[5:-3]
dbnsfp["RESI"]=[convert_int(item) for item in num_string.to_list()]
dbnsfp["RESN"]=dbnsfp.HGVSp_VEP_split.str[2:5]
dbnsfp.RESN=dbnsfp.RESN.str.upper()
dbnsfp["RESN_RESI"]=dbnsfp["RESN"]  + dbnsfp["RESI"].astype(str)
dbnsfp=dbnsfp[dbnsfp.RESI!=0]
dbnsfp["to_AS"]=dbnsfp.HGVSp_VEP_split.str[-3:]
dbnsfp=dbnsfp[dbnsfp.to_AS!="Ter"]
dbnsfp_inf=dbnsfp.groupby('RESI')[cons_scores].mean()


protein_mean_CADD=dbnsfp_inf.mean()["CADD_raw"]
protein_mean_b_factor=features_joined3["b_factor"].mean()

dbnsfp_resn_resi=set(dbnsfp.RESN_RESI)

mismatch_count=0
nodes=[]

resis=list(set(features_joined3["RESI"].tolist()))

for i in resis:
	try:
		node_dict=features_joined3[features_joined3['RESI']==i].to_dict('records')[0]
		#if i in dbnsfp_inf.index:
		if not (node_dict["RESN_RESI"] in dbnsfp_resn_resi):
			print(node_dict["RESN_RESI"], "not present in dbnsfp!")
			mismatch_count=mismatch_count+1
			#import pdb; pdb.set_trace()
		node_dict={**node_dict, **dbnsfp_inf[dbnsfp_inf.index==i].to_dict('records')[0]}
		nodes.append([i, node_dict])
		#import pdb; pdb.set_trace()
	except:
		exit_procedure("1 Node construction failed")


if mismatch_count>i/2:
	exit_procedure("2 mismatched amino acids: " + str(mismatch_count))


network_relabeled.add_nodes_from(nodes)
print("Network okay!")

def create_pd_dict(diction, name_):
	return(pd.DataFrame.from_dict(diction, orient="index", columns=[name_]))

#score="HYDROPHOBICITY_3A"
physicochemical_scores=["CHARGE_3A","HYDROPHOBICITY_3A","MOBILITY_3A","SOLVENT_ACCESSIBILITY_core",'HSE1','HSE2']
dict_vals={}
list_vals=pd.DataFrame()
for score in physicochemical_scores + cons_scores:
	print(score + " ok.")
	for x in range(int(list(network_relabeled.nodes)[0]),int(list(network_relabeled.nodes)[-1])+1):
		sum_weight=0
		sum_value=0
		for neighbor in network_relabeled.neighbors(x):
			values=network_relabeled.nodes[neighbor][score]
			weight=network_relabeled[x][neighbor]["weight"]
			sum_weight=sum_weight+weight
			sum_value=sum_value+weight*values
			#print(values, weight, neighbor)
		#print(dict_vals)
		dict_vals={**dict_vals, **{x :  sum_value/sum_weight}}
	if list_vals.size==0:
		df_join=create_pd_dict(dict_vals, score+"_sur")
		list_vals=df_join.fillna(df_join.median())
	else:
		df_join=create_pd_dict(dict_vals, score+"_sur")
		list_vals=list_vals.join(df_join.fillna(df_join.median()))



deg=create_pd_dict(dict(nx.degree(network_relabeled)), "degree")
betw_cent=create_pd_dict(nx.betweenness_centrality(network_relabeled), "betweenness_centrality")
av_neigh=create_pd_dict(nx.average_neighbor_degree(network_relabeled), "av_neighbor_degree")
ev_cent=create_pd_dict(nx.eigenvector_centrality(network_relabeled, max_iter=100000, tol=0.0001),"eigenvec_centrality")

list_vals=list_vals.join([av_neigh, ev_cent, betw_cent, deg])
features_joined3.index = features_joined3.RESI

dbnsfp=dbnsfp.join(on="RESI", other=features_joined3, rsuffix="_features")
dbnsfp=dbnsfp.join(on="RESI", other=list_vals)
dbnsfp=dbnsfp[dbnsfp.residue_number.notnull()]

dbnsfp["protein_mean_CADD"]=protein_mean_CADD
dbnsfp["protein_mean_b_factor"]=protein_mean_b_factor

dbnsfp.to_csv(snakemake.output[0])

Python Pandas numpy Biopython networkx biographs From line 1 of scripts/graph_based_analysis_pdb_level.py

library(tidyverse)
library(optparse)
library(pROC)

source("workflow/scripts/existing_scores_glm_functions.R")
source("workflow/scripts/precision_recall_resource.R")

option_list = list(
  make_option(c("-b", "--clinvar_benign"), type="character", default="results/clinvar2022/clinvar_2022_benign.vcf.gz", 
              help="location of ClinVar benign missense variants in vcf like format"),
  make_option(c("-p", "--clinvar_pathogenic"), type="character", default="results/clinvar2022/clinvar_2022_pathogenic.vcf.gz", 
              help="location of ClinVar benign missense variants in vcf like format"),
  make_option(c("-a", "--AlphaFold_scores"), type="character", default="results/clinvar2022/values_of_clinvar_variants_FINAL.tsv.gz", 
              help="pre calculated Alphafold scores"),
  make_option(c("-v", "--variants"), type="character", default="results/prediction_final/final_regular_variants.csv.gz", 
              help="variants with calculated AlphScore"),
  make_option(c("-o", "--out_folder"), type="character", default="results/final_model_curves/", 
              help="name of folder to store output")
              )
opt = parse_args(OptionParser(option_list=option_list))

# load data, add variant ID (chrom:pos:ref:alt)
clinvar_benign<-read_tsv(opt$clinvar_benign, col_names=TRUE) %>%
  mutate(ID=paste(CHROM,POS,REF,ALT, sep=":"))
clinvar_pathogenic<-read_tsv(opt$clinvar_pathogenic, col_names=TRUE) %>%
  mutate(ID=paste(CHROM,POS,REF,ALT, sep=":"))
alphafold_pre_calculated<-read_tsv(opt$AlphaFold_scores, col_names=TRUE, 
                                   col_types = cols(.default = "?", REVEL_score = "d"))%>%
  mutate(ID=paste(`#chr`, `pos(1-based)`, ref, alt, sep=":"))
variants<-read_csv(opt$variants, 
                   col_types = cols(.default = "?", REVEL_score = "d"))


dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)

head(alphafold_pre_calculated)


all_clinvar_ids<-c(clinvar_benign$ID, clinvar_pathogenic$ID)

alphafold_pre_calculated_w_CV2022<-alphafold_pre_calculated %>%
  filter(ID %in% all_clinvar_ids)%>%
  mutate(outcome=(ID %in% clinvar_pathogenic$ID))%>%
  filter(!is.na(AlphScore))

variants<-variants%>% 
  mutate(ID=paste(`#chr`, `pos(1-based)`,ref, sep=":"))

alphafold_pre_calculated_w_CV2022$pos_in_VEP_and_Uniprot<-get_index_col(alphafold_pre_calculated_w_CV2022)
alphafold_pre_calculated_w_CV2022$DEOGEN2_score_med<-unlist_score(alphafold_pre_calculated_w_CV2022$DEOGEN2_score,
                                                                  alphafold_pre_calculated_w_CV2022$pos_in_VEP_and_Uniprot)

alphafold_pre_calculated_w_CV2022<-alphafold_pre_calculated_w_CV2022 %>%
  mutate(ID=paste(`#chr`, `pos(1-based)`,ref,  sep=":"))%>%
  filter(!ID %in% variants$ID)

# ensure that no variant is multiple times in the data set; if so, take the mean of the predictors, ensure, that the variant is rated consistently benign / pathogenic
testSet<-alphafold_pre_calculated_w_CV2022 %>% 
  group_by(ID) %>%
  summarise(AlphScore=mean(AlphScore), 
            REVEL_score=mean(REVEL_score), 
            CADD_raw=mean(CADD_raw), 
            DEOGEN2_score_med=mean(DEOGEN2_score_med), 
            glm_AlphRevel=mean(glm_AlphRevel),
            glm_AlphCadd=mean(glm_AlphCadd), 
            glm_AlphDeogen=mean(glm_AlphDeogen), 
            glm_AlphDeogenRevel=mean(glm_AlphDeogenRevel),
            glm_AlphCaddDeogen=mean(glm_AlphCaddDeogen),
            glm_CaddDeogenRevel=mean(glm_CaddDeogenRevel),
            outcome=mean(outcome))%>%
  filter(outcome %in% c(0,1))%>%
  mutate(outcome=as.logical(outcome))

nrow(testSet)
sum(complete.cases(testSet))

sum(testSet$ID %in% variants$ID)

testSet<-testSet[complete.cases(testSet),]
table(testSet$outcome)

dens_plot<-function(score, testSet_f=testSet){
  testSet_f_mod<-testSet_f
  testSet_f_mod$cur_score<-unlist(testSet_f[,score], use.names=FALSE)

AlphScorePlot<-ggplot(testSet_f_mod)+
  geom_density(aes(x=cur_score, fill=outcome), alpha=0.5)+
  theme_bw()+
  labs(fill = "(likely)\npathogenic:")+
  scale_fill_manual(values=c("darkblue", "red"))
ggsave(filename=paste0(score, "_FINAL.pdf"), 
       plot=AlphScorePlot,
       width=5, height=3)
AlphScore_table<-generate_table(score)
write_tsv(x=AlphScore_table,
          file=paste0(score, "_table_FINAL.tsv"))
}

dens_plot("AlphScore")
dens_plot("CADD_raw")
dens_plot("REVEL_score")
dens_plot("DEOGEN2_score_med")

dens_plot("glm_AlphRevel")
dens_plot("glm_AlphCadd")
dens_plot("glm_AlphDeogen")

dens_plot("glm_AlphDeogenRevel")
dens_plot("glm_AlphCaddDeogen")
dens_plot("glm_CaddDeogenRevel")


pdf(file="ClinVar_val_REVEL.pdf")
plot(roc(as.integer(testSet$outcome), testSet$AlphScore), print.auc = TRUE, 
     col = "black", print.auc.y = .35)
plot(roc(testSet$outcome, testSet$REVEL_score), print.auc = TRUE, 
     col = "blue", print.auc.y = .25, add = TRUE)
plot(roc(testSet$outcome, testSet$glm_AlphRevel), print.auc = TRUE, 
     col = "red", print.auc.y = .15, add = TRUE)
legend(0.2, 0.3, legend=c("AlphScore", "REVEL", "AlphScore + REVEL"),
       col=c("black", "blue","red"), lty=1, cex=0.8)
dev.off()

pdf(file="ClinVar_val_CADD.pdf")
plot(roc(as.integer(testSet$outcome), testSet$AlphScore), print.auc = TRUE, 
     col = "black", print.auc.y = .35)
plot(roc(testSet$outcome, testSet$CADD_raw), print.auc = TRUE, 
     col = "blue", print.auc.y = .25, add = TRUE)
plot(roc(testSet$outcome, testSet$glm_AlphCadd), print.auc = TRUE, 
     col = "red", print.auc.y = .15, add = TRUE)
legend(0.2, 0.3, legend=c("AlphScore", "CADD", "AlphScore + CADD"),
       col=c("black", "blue","red"), lty=1, cex=0.8)
dev.off()

pdf(file="ClinVar_val_DEOGEN2.pdf")
plot(roc(as.integer(testSet$outcome), testSet$AlphScore), print.auc = TRUE, 
     col = "black", print.auc.y = .35)
plot(roc(testSet$outcome, testSet$DEOGEN2_score_med), print.auc = TRUE, 
     col = "blue", print.auc.y = .25, add = TRUE)
plot(roc(testSet$outcome, testSet$glm_AlphDeogen), print.auc = TRUE, 
     col = "red", print.auc.y = .15, add = TRUE)
legend(0.2, 0.3, legend=c("AlphScore", "DEOGEN2", "AlphScore + DEOGEN2"),
       col=c("black", "blue","red"), lty=1, cex=0.8)
dev.off()

R tidyverse optparse pROC From line 1 of scripts/histogramsFinalModel.R

library(tidyverse)
library(gridExtra)
library(optparse)

option_list = list(
  make_option(c("-t", "--variants"), type="character", default="results/prediction_final/pre_final_model_regular_variants.csv.gz", 
              help="csv.gz file, test dataset"),
  make_option(c("-o", "--out_folder"), type="character", default="results/pLDDT", 
              help="Output folder"))

opt = parse_args(OptionParser(option_list=option_list))

variants<-read_csv(opt$variants)

dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)

gnomAD_vars<-variants %>% 
  filter(gnomadSet==1)%>%
  mutate(outcome_f=ifelse(outcome==1, "proxy pathogenic", "proxy benign"))%>%
  mutate(outcome_f=as.factor(outcome_f))%>%
  mutate(pLDDT_above_70=b_factor>70)

ClinVar_vars<-variants %>% 
  filter(gnomadSet==0)%>%
  mutate(outcome_f=ifelse(outcome==1, "(likely) pathogenic", "(likely) benign"))%>%
  mutate(outcome_f=as.factor(outcome_f))%>%
  mutate(pLDDT_above_70=b_factor>70)


hist_gnomad<-ggplot(gnomAD_vars)+
  geom_histogram(aes(x=b_factor, fill = outcome_f))+
  theme_bw()+
  geom_vline(xintercept=70, color="grey")+
  xlim(15,100)

hist_CV<-ggplot(ClinVar_vars)+
  geom_histogram(aes(x=b_factor, fill = outcome_f))+
  theme_bw()+
  geom_vline(xintercept=70, color="grey")+
  xlim(15,100)

bar_gnomad<-ggplot(gnomAD_vars)+
  geom_bar(aes(fill=outcome_f, x=pLDDT_above_70), position="fill",   width = 0.6)+
  theme_bw()

bar_CV<-ggplot(ClinVar_vars)+
  geom_bar(aes(fill=outcome_f, x=pLDDT_above_70), position="fill", width = 0.6)+
  theme_bw()

arranged_plddt_plots<-grid.arrange(hist_gnomad, hist_CV,bar_gnomad, bar_CV, nrow=2, ncol=2)

ggsave(filename = "arranged_plddt_plots.pdf",  plot=arranged_plddt_plots)

R tidyverse optparse gridExtra From line 1 of scripts/plot_pLDDT_vs_CV_gnomAD.R

library(tidyverse)
library(pROC)
library(readxl)
library(caret)
library(optparse)
source("workflow/scripts/existing_scores_glm_functions.R")

set.seed(1)

option_list = list(
  make_option(c("-c", "--csv_location"), type="character", default="/media/axel/Dateien/Arbeit_Gen/alphafold2/git_version/AlphScore/results/train_testset1/subsampled_vars.csv", 
              help="csv.gz file"),
  make_option(c("-e", "--excel_location"), type="character", default="resources/available_colnames_regular.xlsx", 
              help="Excel file listing columns to use"),
  make_option(c("-p", "--prefix"), type="character", default="base_model", 
              help="Prefix for output"),
  make_option(c("-m", "--method_pred"), type="character", default="randomforest", 
              help="Prediction method [randomforest, xgboost, extratree]"),
  make_option(c("-n", "--num_trees_param"), type="integer", default=200, 
              help="Number of trees to use / or number of rounds"),
  make_option(c("-x", "--max_depth_param"), type="integer", default=6, 
              help="Maximal depth of trees"),
  make_option(c("-l", "--lambda_param"), type="double", default=0, 
              help="Lambda value for xgboost"),
  make_option(c("-t", "--eta_param"), type="double", default=0.05, 
              help="Eta - learning rate, xgboost"),
  make_option(c("-s", "--subsample_param"), type="double", default=0.8, 
              help="Subsample parameter, xgboost"),
  make_option(c("-i", "--min_node_param"), type="integer", default=50, 
              help="randomforest: min.node.size, xgboost: min node parameter, extratree: nodesize"),
  make_option(c("-r", "--cor_param"), type="double", default=0.999999, 
              help="Just keep columns that have a lower correlation than this value, general"),
  make_option(c("-b", "--b_factor_param"), type="integer", default=20, 
              help="pLDDT value to filter positions that will be used to generate the average values of amino acids, general"),
  make_option(c("-d", "--min_child_weight_param"), type="integer", default=0, 
              help="min child weight, xgboost"),
  make_option(c("-g", "--gamma_param"), type="integer", default=0, 
              help="gamma value, xgboost"),
  make_option(c("-o", "--out_folder"), type="character", default="results/prediction", 
              help="name of folder to store output"),
  make_option(c("-w", "--write_dataset"), type="logical", default=FALSE, 
              help="Write predictions of test-dataset to file"),
  make_option(c("-f", "--full_model"), type="logical", default=FALSE, #results/preprocess/validation_set.csv.gzpreprocessed.csv.gz
              help="generate a full model"),
  make_option(c("-a", "--importance"), type="character", default="impurity", 
              help="variable importance mode of the ranger model"),
  make_option(c("-y", "--write_model"), type="logical", default=FALSE, #results/preprocess/validation_set.csv.gzpreprocessed.csv.gz
              help="write the model parameters to a file"),
  make_option(c("-k", "--k_fold_cross_val"), type="logical", default=FALSE,
              help="activate k-fold cross validation of the training and test data set")

)

opt = parse_args(OptionParser(option_list=option_list))

save_tibble<-data.frame()

variants<-read_csv(opt$csv_location, na=c(".","NA", NA))
colnames_usage <- read_excel(opt$excel_location, col_types="text")

dir.create(opt$out_folder, recursive=TRUE)
setwd(opt$out_folder)

pdf(file=paste0(opt$prefix, "_RPlots.pdf"))

# calculate average values of add_to_AS column for amino acids of training set
sel_vars_to<-(colnames_usage %>% filter(!is.na(add_to_AS)))$value
toAS_properties<-variants  %>%
  filter(gnomadSet == 1, b_factor>opt$b_factor_param)%>%
  dplyr::select(all_of(sel_vars_to), from_AS)%>%
  filter(complete.cases(.))%>%
  group_by(from_AS) %>%
  summarize_all(mean)
colnames(toAS_properties)<-paste0(colnames(toAS_properties), "_toAS")

# add these toAS values to the data set
variants<-variants%>%
  left_join(toAS_properties, by=c("from_AS"="from_AS_toAS"))
colnames_toAS<-colnames(toAS_properties[,names(toAS_properties) != "from_AS_toAS"])
variants[, colnames_toAS]<-  variants[, sel_vars_to] - 
  variants[, colnames_toAS]

# remove correlated columns in the data set
colnames_prediction<-c(colnames_toAS, 
                       (colnames_usage %>% 
                          filter(!is.na(for_prediction)))$value)

variants_pred<-variants %>% 
  dplyr::select(c(colnames_prediction, "outcome"))

#https://machinelearningmastery.com/feature-selection-with-the-caret-r-package/
zv <- apply(variants_pred, 2, function(x) length(unique(x)) == 1)
dfr <- variants_pred[, !zv]
n=length(colnames(dfr))
correlationMatrix <- cor(dfr,use="complete.obs")

set.seed(1)
# prune correltated variables 
df_colsCorrRemoved<-dfr[, -(findCorrelation(correlationMatrix, cutoff=opt$cor_param))]
colsCorrRemoved<-colnames(df_colsCorrRemoved)
rm(df_colsCorrRemoved)
rm(dfr)

variants<-variants[complete.cases(variants[,c(colsCorrRemoved, "outcome", "CADD_raw")]), ]
gc()


ranger_fit<-function(xtrain_dataset){
  library(ranger)
  model1<-ranger(outcome ~ ., 
                 data=xtrain_dataset, 
                 importance=opt$importance, 
                 max.depth=opt$max_depth_param,
                 num.trees = opt$num_trees_param,
                 min.node.size = opt$min_node_param
  ) 
  return(model1)
}
ranger_predict<-function(dataset, modelx){
  return(predict(modelx, dataset)$predictions) }

extraT_fit<-function(xtrain_dataset){
  library(ranger)
  model1<-ranger(outcome ~ ., 
                 data=xtrain_dataset, 
                 importance=opt$importance, 
                 splitrule="extratrees",
                 max.depth=opt$max_depth_param,
                 num.trees = opt$num_trees_param,
                 min.node.size = opt$min_node_param
  ) 
  return(model1)
}
ranger_save<-function(modelx, filename){
  saveRDS(modelx, file=filename)
}

xgboost_fit<-function(xtrain_dataset){
  library(xgboost)
  set.seed(1)
  gnomad_train<-xgb.DMatrix(data=as.matrix(xtrain_dataset %>% dplyr::select(-outcome)), label=xtrain_dataset$outcome )
  test_ds_tibble<-variants %>% 
    filter(clinvar_holdout_test==TRUE) %>% 
    dplyr::select(any_of(colnames(xtrain_dataset)))
  test_ds<-xgb.DMatrix(data=as.matrix(test_ds_tibble %>% dplyr::select(-outcome)), label=test_ds_tibble$outcome)
  watchlist=list(train=gnomad_train, test=test_ds)

  params <- list(max_depth = opt$max_depth_param, 
                 subsample=opt$subsample_param, 
                 eta=opt$eta_param, 
                 lambda=opt$lambda_param, 
                 alpha=0, 
                 gamma=opt$gamma_param, 
                 min_child_weight=opt$min_child_weight_param)
  model1<-xgb.train(
    data=gnomad_train,
    watchlist = watchlist,
    params=params,
    nrounds = opt$num_trees_param,
    early_stopping_rounds = 10,
    #eval_metric="auc",
    objective = "binary:logistic",  # for regression models
    verbose = 1)
  return(model1)
}

xgboost_predict<-function(dataset, modelx){
  data_x<-xgb.DMatrix(data=as.matrix(dataset %>% dplyr::select(-outcome)))
  return(
    predict(modelx, data_x) 
  )
}
xgboost_save<-function(modelx, filename){
  xgb.save(model=modelx, fname=filename)
}

if (opt$method_pred=="xgboost"){
  fit_model<-xgboost_fit
  predict_model<-xgboost_predict
  save_model<-xgboost_save
} else if (opt$method_pred=="randomforest") {
  fit_model<-ranger_fit
  predict_model<-ranger_predict
  save_model<-ranger_save
}else {
  fit_model<-extraT_fit
  predict_model<-ranger_predict
  save_model<-ranger_save
}


if (opt$k_fold_cross_val == TRUE){
  set.seed(1)

  # splitKFold function from existing_scores_glm_functions.R
  variants<-splitKFold(variants, 5)

  glm_cv<-data.frame()
  for (h in unique(variants$kfold_index)) {
    variants<-setTrainTestSet(variants, h)

    gnomad_train<-variants %>% filter(gnomad_train)

    gnomad_model_Alph<-fit_model(gnomad_train %>% dplyr::select(all_of(colsCorrRemoved)))
    variants$predicted_Alph<-predict_model(variants, gnomad_model_Alph) 

    glm_AlphCadd<- glm(outcome ~ . , family=binomial(link='logit'),
                            data=variants %>% filter(clinvar_holdout_test) %>% 
                            dplyr::select(outcome, predicted_Alph, CADD_raw) %>%
                            filter(complete.cases(.)))

    variants$glm_AlphCadd <-predict(glm_AlphCadd, variants)#prediction with model = non_h_cv_model_glm, test = : predicted__non_h_cv_glm_h_cv_test

    clinvar_holdout_test<-variants %>% filter(clinvar_holdout_test)
    clinvar_interim_test<-variants %>% filter(clinvar_interim_test)
    gnomad_train<-variants %>% filter(gnomad_train)

    save_tibble <- rbind(save_tibble, 
                         tibble(auc_Alph_train_gnomAD=roc(gnomad_train$outcome, gnomad_train$predicted_Alph)$auc, 
                                Alph_OOB=ifelse((opt$method_pred=="randomforest"), gnomad_model_Alph$prediction.error, NA),
                                auc_CADD_interim_CV=roc(clinvar_holdout_test$outcome, clinvar_holdout_test$CADD_raw)$auc, 
                                auc_Alph_interim_CV=roc(clinvar_holdout_test$outcome, clinvar_holdout_test$predicted_Alph)$auc,
                                auc_CADD_test_CV=roc(clinvar_interim_test$outcome, clinvar_interim_test$CADD_raw)$auc, 
                                auc_Alph_test_CV=roc(clinvar_interim_test$outcome, clinvar_interim_test$predicted_Alph)$auc, 
                                auc_glm_test_CV=roc(clinvar_interim_test$outcome, clinvar_interim_test$glm_AlphCadd)$auc, 
                                sample_num=h,
                                gnomad_train_nrow=nrow(gnomad_train),
                                interim_cv_nrow=nrow(clinvar_holdout_test),
                                test_cv_nrow=nrow(clinvar_interim_test),
                                condition=opt$prefix ))


   plot(roc(clinvar_holdout_test$outcome, clinvar_holdout_test$CADD_raw), print.auc = TRUE, col = "red")
   plot(roc(clinvar_holdout_test$outcome, clinvar_holdout_test$predicted_Alph), print.auc = TRUE, 
                               col = "green", print.auc.y = .2, add = TRUE)

   plot(roc(clinvar_interim_test$outcome, clinvar_interim_test$CADD_raw), print.auc = TRUE, col = "red", add=FALSE)
   plot(roc(clinvar_interim_test$outcome, clinvar_interim_test$glm_AlphCadd), print.auc = TRUE, 
                           col = "blue", print.auc.y = .2, add = TRUE)
   plot(roc(clinvar_interim_test$outcome, clinvar_interim_test$predicted_Alph), print.auc = TRUE, 
                           col = "green", print.auc.y = .4, add = TRUE)
  }

}else if (opt$full_model==TRUE){
  var_full_model<-variants %>% 
    filter(gnomadSet==1)%>%
    dplyr::select(all_of(colsCorrRemoved), "outcome") 

  gnomad_model_Alph<-fit_model(var_full_model)
  var_full_model$predicted_Alph<-predict_model(var_full_model, gnomad_model_Alph)
  variants$predicted_Alph<-predict_model(variants, gnomad_model_Alph)

  interim_dataset<-variants %>% 
    filter(clinvar_interim_test)

  save_tibble <- tibble(auc_Alph_train_gnomAD=roc(var_full_model$outcome, var_full_model$predicted_Alph)$auc,
                        Alph_OOB=ifelse((opt$method_pred %in% c("randomforest","extratree")), gnomad_model_Alph$prediction.error, NA),
                        gnomad_train_nrow=nrow(var_full_model),
                        gnomad_train_nProteins=train_dataset<-length(unique((variants %>% filter(gnomadSet==1))$Uniprot_acc_split)),
                        auc_CADD_interim_CV=roc(interim_dataset$outcome, interim_dataset$CADD_raw)$auc, 
                        auc_Alph_interim_CV=roc(interim_dataset$outcome, interim_dataset$predicted_Alph)$auc,
                        interim_nrow=nrow(interim_dataset),
                        interim_nProteins=length(unique(interim_dataset$Uniprot_acc_split))
                        )

}else{
  train_dataset<-variants %>% 
    filter(gnomad_train)%>%
    dplyr::select(all_of(colsCorrRemoved), "outcome")

  gnomad_model_Alph<-fit_model(train_dataset)

  train_dataset$predicted_Alph<-predict_model(train_dataset, gnomad_model_Alph)
  variants$predicted_Alph<-predict_model(variants %>% dplyr::select(all_of(colsCorrRemoved), "outcome"), gnomad_model_Alph)

  interim_dataset<-variants %>% 
    filter(clinvar_interim_test)

  test_dataset<-variants %>% 
    filter(clinvar_holdout_test)

  test_dataset_gnomad<-variants %>% 
    filter(gnomad_holdout_test) 

  roc_rose <- plot(roc(train_dataset$outcome, train_dataset$predicted_Alph), print.auc = TRUE, col = "blue")
  legend(0.3, 0.3, legend=c("Alph"),
         col=c("blue"), lty=1, cex=0.8)
  title(main = "train set")

  roc_rose <- plot(roc(interim_dataset$outcome, interim_dataset$CADD_raw), print.auc = TRUE, col = "red")
  roc_rose <- plot(roc(interim_dataset$outcome, interim_dataset$predicted_Alph), print.auc = TRUE, 
                   col = "blue", print.auc.y = .4, add = TRUE)
  legend(0.3, 0.3, legend=c("CADD", "Alph"),
         col=c("red", "blue"), lty=1, cex=0.8)
  title(main = "interim set")

  roc_rose <- plot(roc(test_dataset$outcome, test_dataset$CADD_raw), print.auc = TRUE, col = "red")
  roc_rose <- plot(roc(test_dataset$outcome, test_dataset$predicted_Alph), print.auc = TRUE, 
                   col = "blue", print.auc.y = .4, add = TRUE)
  legend(0.3, 0.3, legend=c("CADD", "Alph"),
         col=c("red", "blue"), lty=1, cex=0.8)
  title(main = "test set")

  roc_rose <- plot(roc(test_dataset_gnomad$outcome, test_dataset_gnomad$CADD_raw), print.auc = TRUE, col = "red")
  roc_rose <- plot(roc(test_dataset_gnomad$outcome, test_dataset_gnomad$predicted_Alph), print.auc = TRUE, 
                   col = "blue", print.auc.y = .4, add = TRUE)
  legend(0.3, 0.3, legend=c("CADD", "Alph"),
         col=c("red", "blue"), lty=1, cex=0.8)
  title(main = "test set genes, gnomad variants")


  save_tibble <- tibble(auc_Alph_train_gnomAD=roc(train_dataset$outcome, train_dataset$predicted_Alph)$auc, 
                        Alph_OOB=ifelse((opt$method_pred!="xgboost"), gnomad_model_Alph$prediction.error, NA),
                        auc_CADD_interim_CV=roc(interim_dataset$outcome, interim_dataset$CADD_raw)$auc, 
                        auc_Alph_interim_CV=roc(interim_dataset$outcome, interim_dataset$predicted_Alph)$auc,
                        auc_CADD_test_CV=roc(test_dataset$outcome, test_dataset$CADD_raw)$auc,
                        auc_Alph_test_CV=roc(test_dataset$outcome, test_dataset$predicted_Alph)$auc, 
                        auc_CADD_test_gnomad=roc(test_dataset_gnomad$outcome, test_dataset_gnomad$CADD_raw)$auc, 
                        auc_Alph_test_gnomad=roc(test_dataset_gnomad$outcome, test_dataset_gnomad$predicted_Alph)$auc, 
                        gnomad_train_nrow=nrow(train_dataset),
                        interim_nrow=nrow(interim_dataset),
                        test_nrow=nrow(test_dataset),
                        test_gnomad_nrow=nrow(test_dataset_gnomad),
                        gnomad_train_nProteins=train_dataset<-length(unique((variants %>% 
                                                                               filter(gnomad_train))$Uniprot_acc_split)),
                        interim_nProteins=length(unique(interim_dataset$Uniprot_acc_split)),
                        test_nProteins=length(unique(test_dataset$Uniprot_acc_split)),
                        test_gnomad_nProteins=length(unique(test_dataset_gnomad$Uniprot_acc_split)),
                        condition=opt$prefix )
}

if (opt$write_dataset){
  write_csv(x=variants, file=paste0(opt$prefix,"_variants.csv.gz"))
}

if (opt$method_pred %in% c("randomforest", "extratree")){
  var_imp<-tibble(importance=as.vector(gnomad_model_Alph$variable.importance), 
                  variable=names(gnomad_model_Alph$variable.importance))
  write_tsv(x=var_imp,
            file = paste0(opt$prefix,"_", opt$importance, "_importance.tsv"))
}

if (opt$write_model){
  save_model(gnomad_model_Alph, file=paste0(opt$prefix,"_written_full_model.RData"))
  saveRDS(colsCorrRemoved, file=paste0(opt$prefix,"_colnames_to_use.RData"))
  saveRDS(toAS_properties,file=paste0(opt$prefix,"_toAS_properties.RData") )
}

write_tsv(x=save_tibble, file=paste0(opt$prefix, "_results.tsv"))

R tidyverse optparse readxl pROC caret ranger From line 1 of scripts/prediction.R

library(readr)
library(dplyr)
library(stringr)
library(optparse)
library(ranger)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-c", "--csv_location"), type="character", default="results/combine2_protein/A0A1B0GUS4_w_AFfeatures.csv.gz", 
              help="csv.gz file"),
  make_option(c("-m", "--model_location"), type="character", default="results/prediction/final_written_full_model.RData", 
              help="File containing the full model for prediction"),
  make_option(c("-u", "--use_cols_file"), type="character", default="results/prediction/final_colnames_to_use.RData", 
              help="File containing the colnames to use in the final model"),
  make_option(c("-t", "--toAS_properties"), type="character", default="results/prediction/final_toAS_properties.RData", 
              help="File containing the properties of the alternative amino acids"),
  make_option(c("-b", "--combined_model"), type="character", default="results/prediction_final/combined_model.RData", 
              help="File where the final combined logistic regression model should be saved to"),
  make_option(c("-v", "--training_var_ids"), type="character", default="results/prediction_final/training_var_ids.tsv", 
              help="IDs of variants that were in the training / test set"),
  make_option(c("-o", "--output_file"), type="character", default="results/prediction/predicted.csv.gz", 
              help="Excel file listing columns to use"),
  make_option(c("-r", "--reduced"), type="logical", default="FALSE", 
              help="just save essential columns")
)

opt = parse_args(OptionParser(option_list=option_list))

to_AS_table<-read_tsv("resources/to_AS_table.txt")
variants<-read_csv(opt$csv_location, na=c(".","NA", NA), col_types = cols(.default = "?", REVEL_score = "d"))

# if variant file is empty, stop
if (nrow(variants)==0){
  system(paste("touch", opt$output_file))
}else{

model_to_use<-readRDS(opt$model_location)
colnames_to_use<-readRDS(opt$use_cols_file)
toAS_properties<-readRDS(opt$toAS_properties)

set_of_models<-readRDS(opt$combined_model)

training_var_ids<-read_tsv(opt$training_var_ids)
gnomad_var_ids<-(training_var_ids %>% filter(gnomadSet==1))$var_id_genomic
ClinVar_var_ids<-(training_var_ids %>% filter(gnomadSet==0))$var_id_genomic

# function to add to AS properties
addToAS<-function(variants_par, toAsProp, colToUse){
  variants_mod<-variants
  variants_mod<-variants_mod%>%
    left_join(toAsProp, by=c("from_AS"="from_AS_toAS"))

  colnames_toAS<-colnames(toAsProp)
  colnames_toAS<-colnames_toAS[colnames_toAS!="from_AS_toAS"]

  sel_vars_to<-str_replace(colnames_toAS, fixed("_toAS"),"")

  variants_mod[, colnames_toAS]<-variants_mod[, sel_vars_to] - variants_mod[, colnames_toAS]
  colToUse_mod<-colToUse[colToUse!="outcome"]

  return(variants_mod[, colToUse_mod])
}


# external function that is also used by preprocess
variants<-prepareVariantsForPrediction(variants, to_AS_table)

#add to AS properties
variants_alph<-addToAS(variants, toAS_properties, colnames_to_use)

# just keep complete cases
variants<-variants[complete.cases(variants_alph),]
variants_alph<-variants_alph[complete.cases(variants_alph),]

# predict AlphScore
variants$AlphScore<-predict(model_to_use, variants_alph)$predictions
rm(variants_alph)

index_col<-get_index_col(variants)

variants$DEOGEN2_score_med<- unlist_score(variants$DEOGEN2_score, index_col)

variants<-predict_set_of_models(set_of_models = set_of_models, variants_to_predict = variants)
colNamesCombinedModels<-unlist(set_of_models[2])

# flag variants that are in the training data set or that are in dbNSFP ClinVar
variants<-variants %>% 
  mutate(in_gnomad_train=var_id_genomic %in% gnomad_var_ids)%>%
  mutate(in_clinvar_ds=var_id_genomic %in% ClinVar_var_ids)


# if wanted, reduce the output to have smaller files
if (opt$reduced==TRUE){
 variants<-variants %>% 
  mutate(ID=paste(`#chr`, `pos(1-based)`, ref, alt, sep=":"))%>%
  select(any_of(colnames(variants)[2:10]), 
         ID, genename, Uniprot_acc_split,Uniprot_acc,HGVSp_VEP_split, HGVSp_VEP, CADD_raw, REVEL_score, DEOGEN2_score, 
         b_factor, SOLVENT_ACCESSIBILITY_core, in_gnomad_train, in_clinvar_ds, AlphScore, any_of(colNamesCombinedModels))
}

# write to disk
write_csv(x=variants,
          file=opt$output_file)
}

R dplyr stringr readr optparse ranger From line 1 of scripts/predict_w_ranger_model_Final.R

library(readr)
library(dplyr)
library(stringr)
library(optparse)
library(ranger)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-c", "--csv_location"), type="character", default="results/combine2_protein/P38398_w_AFfeatures.csv.gz", 
              help="csv.gz file"),
  make_option(c("-m", "--model_location"), type="character", default="results/prediction/final_written_full_model.RData", 
              help="File containing the full model for prediction"),
  make_option(c("-u", "--use_cols_file"), type="character", default="results/prediction/final_colnames_to_use.RData", 
              help="File containing the colnames to use in the final model"),
  make_option(c("-t", "--toAS_properties"), type="character", default="results/prediction/final_toAS_properties.RData", 
              help="File containing the properties of the alternative amino acids"),
  make_option(c("-n", "--model_location_null"), type="character", default="results/prediction/final_written_full_model.RData", 
              help="File containing the full model for prediction, null model"),
  make_option(c("-x", "--use_cols_file_null"), type="character", default="results/prediction/final_colnames_to_use.RData", 
              help="File containing the colnames to use in the final model, null model"),
  make_option(c("-v", "--toAS_properties_null"), type="character", default="results/prediction/final_toAS_properties.RData", 
              help="File containing the properties of the alternative amino acids, null model"),
  make_option(c("-r", "--reduced"), type="logical", default="FALSE", 
              help="just save essential columns"),
  make_option(c("-o", "--output_file"), type="character", default="results/prediction/predicted.csv.gz", 
              help="Excel file listing columns to use")
)

opt = parse_args(OptionParser(option_list=option_list))

to_AS_table<-read_tsv("resources/to_AS_table.txt")
variants<-read_csv(opt$csv_location, na=c(".","NA", NA))

# if variant file is empty, stop
if (nrow(variants)==0){
  system(paste("touch", opt$output_file))
}else{

model_to_use<-readRDS(opt$model_location)
colnames_to_use<-readRDS(opt$use_cols_file)
toAS_properties<-readRDS(opt$toAS_properties)

model_to_use_null<-readRDS(opt$model_location_null)
colnames_to_use_null<-readRDS(opt$use_cols_file_null)
toAS_properties_null<-readRDS(opt$toAS_properties_null)

# function to add to AS properties
addToAS<-function(variants_par, toAsProp, colToUse){
  variants_mod<-variants
  variants_mod<-variants_mod%>%
    left_join(toAsProp, by=c("from_AS"="from_AS_toAS"))

  colnames_toAS<-colnames(toAsProp)
  colnames_toAS<-colnames_toAS[colnames_toAS!="from_AS_toAS"]

  sel_vars_to<-str_replace(colnames_toAS, fixed("_toAS"),"")

  variants_mod[, colnames_toAS]<-variants_mod[, sel_vars_to] - variants_mod[, colnames_toAS]
  colToUse_mod<-colToUse[colToUse!="outcome"]

  return(variants_mod[, colToUse_mod])
}

# external function that is also used by preprocess
variants<-prepareVariantsForPrediction(variants, to_AS_table)

#add to AS properties
variants_alph<-addToAS(variants, toAS_properties, colnames_to_use)

# just keep complete cases
variants<-variants[complete.cases(variants_alph),]
variants_alph<-variants_alph[complete.cases(variants_alph),]

# predict AlphScore
variants$AlphScore<-predict(model_to_use, variants_alph)$predictions
rm(variants_alph)

# predict null model
variants_alph_null<-addToAS(variants, toAS_properties_null, colnames_to_use_null)
variants$Alph_null<-predict(model_to_use_null, variants_alph_null)$predictions

# if wanted, reduce the output to have smaller files
if (opt$reduced==TRUE){
 variants<-variants %>% 
  mutate(ID=paste(`#chr`, `pos(1-based)`, ref, alt, sep=":"))%>%
  select(any_of(colnames(variants)[2:10]), 
         ID, genename, Uniprot_acc_split,Uniprot_acc,HGVSp_VEP_split, HGVSp_VEP, CADD_raw, REVEL_score, DEOGEN2_score, 
         b_factor, SOLVENT_ACCESSIBILITY_core, Alph_null, AlphScore)
}

# write to disk
write_csv(x=variants,
          file=opt$output_file)
}

R dplyr stringr readr optparse ranger From line 1 of scripts/predict_w_ranger_model_for_Evaluation.R

library(tidyverse)
library(VennDiagram)
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger") # prevent VennDiagram to write lots of log messages
library(data.table)
library(optparse)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-i", "--input"), type="character", default="/media/axel/Dateien/Arbeit_Gen/alphafold2/data_from_xcat_v2/gnomad_extracted_v2.csv.gz", 
              help="csv.gz file"),
  make_option(c("-o", "--output"), type="character", default="results/train_testset1/gnomad_extracted_prepro.csv.gz", 
              help="csv.gz file for output")
)
opt = parse_args(OptionParser(option_list=option_list))

variants<-read_csv(opt$input)
to_AS_table<-read_tsv("resources/to_AS_table.txt")

dir.create(dirname(opt$output))
setwd(dirname(opt$output))

# preprocess variants (see sourced file)
variants<-prepareVariantsForPrediction(variants, to_AS_table)
variants<-splitKFold(variants, 5)
variants<-setTrainTestSet(variants, 1)

# check for overlaps
gnomad_train<-variants %>% filter(gnomad_train)
clinvar_interim_test<-variants %>% filter(clinvar_interim_test)
clinvar_holdout_test<-variants %>% filter(clinvar_holdout_test)

venn.diagram(x=list(gnomad_train$Uniprot_acc_split, clinvar_interim_test$Uniprot_acc_split, clinvar_holdout_test$Uniprot_acc_split),category.names = c("gnomAD train" , "ClinVar interim", "ClinVar hold out"), filename = 'Protein_id_level.png')
venn.diagram(x=list(gnomad_train$var_id_prot, clinvar_interim_test$var_id_prot, clinvar_holdout_test$var_id_prot),category.names = c("gnomAD train" , "ClinVar interim", "ClinVar hold out"), filename = 'Protein_variant_id_level.png')
venn.diagram(x=list(gnomad_train$var_id_genomic, clinvar_interim_test$var_id_genomic, clinvar_holdout_test$var_id_genomic),category.names = c("gnomAD train" , "ClinVar interim", "ClinVar hold out"), filename = 'Variant_id_level.png')

# check for duplicates
n_occur <- data.frame(table(paste(variants$var_id_genomic, variants$gnomadSet)))
double_ids<-str_split(n_occur[n_occur$Freq > 1,]$Var1, " ", simplify=TRUE)[,1]
double_variants<-variants %>% filter(var_id_genomic %in% double_ids) %>% select(var_id_genomic, Uniprot_acc_split, RESN, RESN_RESI, outcome, gnomadSet, alt)
head(double_variants)
# duplicates are different alternative alleles at one position

write_csv(variants, file=basename(opt$output))

R tidyverse data.table optparse VennDiagram From line 1 of scripts/preprocess.R

library(tidyverse)
library(optparse)
set.seed(1)

option_list = list(
  make_option(c("-i", "--input"), type="character", default="results/train_testset1/gnomad_extracted_prepro.csv.gz", 
              help="csv.gz file"),
  make_option(c("-o", "--output"), type="character", default="results/preprocess/gnomad_extracted_prepro_rec.csv.gz", 
              help="csv.gz file for output"),
  make_option(c("-u", "--undersample"), type="logical", default="TRUE", 
              help="Undersample to 50% - 50 % patho / benign")
)
opt = parse_args(OptionParser(option_list=option_list))

base_file_name=tools::file_path_sans_ext(basename(opt$input))
variants_org<-read_csv(opt$input)

dir.create(dirname(opt$output))
setwd(dirname(opt$output))


proportion_patho<-variants_org %>% 
  filter(gnomadSet==TRUE)%>%
  group_by(from_AS) %>%
  summarize(mean_outcome=mean(outcome), anz=sum(outcome))%>%
  arrange(mean_outcome)
print(proportion_patho)

# set the ratio patho / beningn
if (opt$undersample){
  PROP_PATHO_FACTOR=0.5
}else{
  PROP_PATHO_FACTOR=min(proportion_patho$mean_outcome)
}

amino_acids<-unique(variants_org$from_AS)

new_variants<-data.frame()
for (amino_acid in amino_acids){
  temp_patho<-variants_org %>% 
    filter(outcome==1, from_AS==amino_acid)%>%
    filter(gnomadSet==1)

  temp_benign<-variants_org %>% 
    filter(outcome==0, from_AS==amino_acid) %>%
    filter(gnomadSet==1)

  non_training_variants<-variants_org%>%
    filter(from_AS==amino_acid) %>%
    filter(gnomadSet==0)

  number_ben<-nrow(temp_benign)
  number_patho_to_sample<-as.integer((PROP_PATHO_FACTOR/(1-PROP_PATHO_FACTOR))*number_ben)

  new_temp_patho<-sample_n(temp_patho,number_patho_to_sample, replace = FALSE)

  new_variants<-rbind(new_variants, temp_benign, new_temp_patho, non_training_variants)

}

print(new_variants %>%
  filter(gnomadSet==1)%>%
  group_by(from_AS) %>%
  summarize(mean_outcome=mean(outcome))%>%
  arrange(mean_outcome))

print("total number of variants:")
print(nrow(new_variants))



print("Number of variants in gnomAD set after recalibration:")
print(nrow(new_variants %>%
             filter(gnomadSet==1)))

print("Number of variants in gnomAD set before recalibration:")
print(nrow(variants_org%>%
             filter(gnomadSet==1)))

print("Number of variants in ClinVar set:")
print(nrow(new_variants %>%
             filter(gnomadSet==0)))

write_csv(new_variants, file=basename(opt$output))

R tidyverse optparse From line 1 of scripts/recalibrate_variants.R

library(tidyverse)
library(optparse)

option_list = list(
  make_option(c("-v", "--dup_vars"), type="character", default="all_possible_values_dups.csv.gz", 
              help="list of duplicated variants, final set"),
  make_option(c("-o", "--out_file"), type="character", default="deduplicated_vars_bind.tsv", 
              help="Output file name"))

opt = parse_args(OptionParser(option_list=option_list))


duplicted_vars<-read_tsv(opt$dup_vars)

collapse_as_text<-c("Uniprot_acc_split", "HGVSp_VEP_split", "b_factor", "SOLVENT_ACCESSIBILITY_core","in_gnomad_train","in_clinvar_ds")
scoreVals_to_combine<-c("CADD_raw", "REVEL_score", "AlphScore", "glm_AlphCadd", "glm_AlphRevel", "glm_RevelCadd", "glm_AlphRevelCadd",
                        "glm_AlphDeogen", "glm_CaddDeogen", "glm_DeogenRevel", "glm_AlphDeogenRevel", "glm_AlphCaddDeogen", "glm_CaddDeogenRevel", "glm_AlphCaddRevel")
any_action_required<-c(collapse_as_text, scoreVals_to_combine)

cols_vars<-colnames(duplicted_vars)
constant_cols<-cols_vars[!cols_vars %in% any_action_required]

deduplicate_vars<-function(varID_of_duplicate){
  duplicated_rows<-duplicted_vars %>% 
    mutate(temp_var_ID=paste(ID, genename))%>%
    filter(temp_var_ID == varID_of_duplicate)%>%
    select(-temp_var_ID)

  merged_as_text<-duplicated_rows %>% 
    select(all_of(collapse_as_text)) %>%
    summarise_all(paste ,collapse=",")

  merged_as_numeric_val<-duplicated_rows %>% 
    select(all_of(scoreVals_to_combine)) %>%
    summarise_all(mean)

  constant_vals<-duplicated_rows %>% 
    select(all_of(constant_cols)) %>% distinct()

  deduplicated<-cbind(
        constant_vals,
        merged_as_text,
        merged_as_numeric_val
        )%>%
    relocate(any_of(cols_vars))
  if (nrow(deduplicated)>1){
    print("Deduplication not successful!")
    print(constant_vals)
  }
  return(deduplicated)
}

var_ids<-unique(paste(duplicted_vars$ID, duplicted_vars$genename))
#var_ids<-var_ids[1:50]

deduplicated_vars<-lapply(var_ids, deduplicate_vars)

deduplicated_vars_bind<-do.call("rbind", deduplicated_vars)

if (sum(!colnames(deduplicated_vars_bind)==cols_vars)>0) {
  print("Order of columns does not match the requirements!")
}

write_tsv(x=deduplicated_vars_bind,
          file=opt$out_file,
          col_names = FALSE)

R tidyverse optparse From line 1 of scripts/remove_duplicates_from_final_score.R

shell:
	"""
	homedir=$(pwd)

	mkdir -p "{params.alphafold}"

	cd "{params.alphafold}"
	rm -f UP000005640_9606_HUMAN_v2.tar
	wget {config[alphafold_download_adress]}
	tar -xf UP000005640_9606_HUMAN_v2.tar
	rm UP000005640_9606_HUMAN_v2.tar
	"""

SnakeMake From line 58 of workflow/Snakefile

shell:
	"""
	homedir=$(pwd)
	mkdir -p "{params.dbNSFP}"
	cd {params.dbNSFP}

	rm -f dbNSFP4.2a.zip
	wget {config[dbNSFP_download_adress]}
	unzip dbNSFP4.2a.zip
	rm dbNSFP4.2a.zip

	cd $homedir
	touch {output}
	"""

SnakeMake From line 78 of workflow/Snakefile

shell:
	"""
	homedir=$(pwd)
	mkdir -p "{params.tmp}"
	mkdir -p "{params.outdir}"
	cd "{params.tmp}"
	mkdir -p header

	if (( $(zcat $homedir"/{params.db_file}" | head -n1 > header/header.txt) )) # pipe / head combination produces non-0-exit
	then
	echo "not ok"
	fi

	zcat $homedir"/{params.db_file}" | tail -n+2 | split -l 20000 - {wildcards.chr}

	for file in $(find * -maxdepth 0 -type f)
	do
	python $homedir"/workflow/scripts/split_dbNSFP_unnest.py" $homedir"/config/pdb_ids.txt" $(pwd)"/header/header.txt" $file $homedir"/"{params.outdir}
	done

	cd $homedir
	touch "{output}"
	"""

SnakeMake From line 105 of workflow/Snakefile

shell:
	"""
	feature="$(pwd)/"{config[feature_folder]}"bin/featurize"
	export FEATURE_DIR="$(pwd)/"{config[feature_folder]}"data/"
	work_dir="results/pdb_features/{wildcards.pdb_name}/"
	temp_pdb=$work_dir"1A1A.pdb"

	export DSSP_DIR=$work_dir
	export PDB_DIR=$work_dir

	mkdir -p $work_dir
	gunzip -c "{input}" > $temp_pdb

	#run dssp
	cat $temp_pdb | {config[dssp_bin]} -i /dev/stdin > $work_dir"1A1A.dssp"
	echo 1A1A > $work_dir"pdb_id"

	$feature $temp_pdb -n1 -w0.2 | grep ":A@CA$" > $work_dir"{wildcards.pdb_name}_core.txt"
	$feature $temp_pdb -n1 -w3 | grep ":A@CA$" > $work_dir"{wildcards.pdb_name}_3A.txt"
	$feature $temp_pdb -n1 -w6 | grep ":A@CA$" > $work_dir"{wildcards.pdb_name}_6A.txt"
	$feature $temp_pdb -n1 -w9 | grep ":A@CA$" > $work_dir"{wildcards.pdb_name}_9A.txt"
	$feature $temp_pdb -n1 -w12 | grep ":A@CA$" > $work_dir"{wildcards.pdb_name}_12A.txt"

	rm $temp_pdb*
	"""

SnakeMake From line 138 of workflow/Snakefile

shell:
	"""
	protinter="$(pwd)/"{config[protinter_bin]}
	work_dir="results/pdb_features/{wildcards.pdb_name}/"
	mkdir -p $work_dir

	gunzip -c "{input}" > $work_dir"/pdbfile.pdb"
	cd $work_dir

	python3 $protinter pdbfile.pdb -hydrophobic -csv
	python3 $protinter pdbfile.pdb -disulphide -csv
	python3 $protinter pdbfile.pdb -ionic -csv
	python3 $protinter pdbfile.pdb -aroaro -csv
	python3 $protinter pdbfile.pdb -arosul -csv
	python3 $protinter pdbfile.pdb -catpi -csv
	#python3 $protinter pdbfile.pdb -hb1 -csv
	python3 $protinter pdbfile.pdb -hb2 -csv
	python3 $protinter pdbfile.pdb -hb3 -csv
	python3 $protinter pdbfile.pdb -within_radius -csv

	sed -i "s/ //g" result_*
	rm pdbfile.pdb
	"""

SnakeMake From line 172 of workflow/Snakefile

run:
	import os
	from Bio.PDB.PDBParser import PDBParser
	import Bio.PDB as bpd
	import pandas as pd
	pdb_path=params[0]
	os.system('gunzip -c "' + pdb_path + '.gz" > "' + pdb_path + 'X"' )

	parser = PDBParser()
	structure = parser.get_structure("test", pdb_path + 'X')
	hse = bpd.HSExposure
	exp_ca = hse.HSExposureCB(structure)
	os.system('rm "' + pdb_path + 'X"' )
	resi_count=len(exp_ca.keys())
	list_RESI_HSE=[ [exp_ca.keys()[i][1][1], exp_ca.property_list[i][1][0], exp_ca.property_list[i][1][1] ] for i in range(0, resi_count)]
	df_HSE=pd.DataFrame(list_RESI_HSE)
	# manual check: HSE1=upper sphere, HSE2=lower sphere
	df_HSE.columns =['RESI','HSE1','HSE2']
	df_HSE.index=df_HSE.RESI
	df_HSE.to_csv(output[0], index=False)

SnakeMake Pandas Biopython pdbparser From line 205 of workflow/Snakefile

run:
	import pandas as pd 
	from biopandas.pdb import PandasPdb
	import numpy as np
	ppdb_load = PandasPdb()
	ppdb_load.read_pdb(input[0])
	pd_df=ppdb_load.df["ATOM"][["residue_number","atom_name","b_factor"]].to_numpy()
	np.savetxt(output[0], pd_df[pd_df[:,1]=="CA"], fmt='%s')

SnakeMake Pandas numpy biopandas From line 235 of workflow/Snakefile

script:
	"scripts/combine_features_pdb_level.py"

SnakeMake From line 258 of workflow/Snakefile

script:
	"scripts/graph_based_analysis_pdb_level.py"

SnakeMake From line 274 of workflow/Snakefile

run:
	import pandas as pd
	import os
	li =[]
	for filename in input:
		try:
			df = pd.read_csv(filename, low_memory=False)
			li.append(df)
		except:
			print(filename + " error")
	try:
		frame = pd.concat(li, axis=0, ignore_index=True)
	except:
		print("empty df")
		os.system('touch ' + output[0])

SnakeMake Pandas From line 287 of workflow/Snakefile

script:
	"scripts/extract_clinvar_gnomad_sets.R"

SnakeMake From line 326 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/preprocess.R -i {input} -o {output.preprocessed}
	Rscript workflow/scripts/recalibrate_variants.R -i {output.preprocessed} -o {output.recalibrated} \
	--undersample FALSE
	"""

SnakeMake From line 338 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/Fig_recal.R \
	--input_preprocessed {input.preprocessed} \
	--input_recalibrated {input.recalibrated} \
	--out_folder {params.out_folder} 

	Rscript workflow/scripts/AA_freq.R \
	--input_recalibrated {input.recalibrated} \
	--excel_location {input.excel} \
	--out_folder {params.out_folder} 

	Rscript workflow/scripts/char_variants.R \
	--csv_location {input.recalibrated} \
	--out_folder {params.out_folder} 
	"""

SnakeMake From line 359 of workflow/Snakefile

run:
	import os
	parameters=grid_search_table[grid_search_table["prefix"]==wildcards[0]]
	os.system('Rscript workflow/scripts/prediction.R --prefix ' + 
		wildcards[0] + " --excel_location " + input[0] + 
		" --csv_location " + input[1] + 
		" --out_folder " + params[1] +
		" " + " ".join(parameters.iloc[0,1:].to_list()))

SnakeMake From line 386 of workflow/Snakefile

shell:
	"""
	mkdir -p "results/joined_grid"
	cat {input} | tail -n3 | grep "auc_Alph_train_gnomAD" > {output}
	cat {input} | grep -v "auc_Alph_train_gnomAD" >> {output}
	"""

SnakeMake From line 403 of workflow/Snakefile

run:
	import pandas as pd
	import os

	# uncomment for automatic selection:
	grid_results=pd.read_csv("results/joined_grid/joined_grid.tsv", sep="\t")
	grid_results=grid_results.sort_values(["auc_Alph_interim_CV"])
	best_model=grid_results.iloc[-1]
	parameters=grid_search_table[grid_search_table["prefix"]==best_model.condition]

	#manual selection:
	#parameters=grid_search_table[grid_search_table["prefix"]=="randomforest_4000_12_500_0.999999_0_0_90_0_0_regCols"] 

	base_command=('Rscript workflow/scripts/prediction.R ' +
		' --csv_location ' + input[1] + 
		' --out_folder ' + params[1] +
		' ' + ' '.join(parameters.iloc[0,1:].to_list()) )

	# overwrite excel location if NullModel should be fit:
	if wildcards[0] == "_NullModel":
		base_command = base_command + ' --excel_location resources/available_colnames_NullModel.xlsx'
	if wildcards[0] == "_nopLDDT":
		base_command = base_command + ' --excel_location resources/available_colnames_nopLDDT.xlsx'

	print("base command:")
	print(base_command)

	pre_final_k_fold_com=base_command + ' --prefix pre_final_model_k_fold'  + wildcards[0] +' --k_fold_cross_val TRUE '

	pre_final_write_dataset_imp=base_command + ' --prefix pre_final_model' + wildcards[0] +' --write_model TRUE --write_dataset TRUE --importance impurity '

	pre_final_permutation=base_command + ' --prefix pre_final_model' + wildcards[0] +' --importance permutation '

	final_write_model=base_command + ' --prefix final' + wildcards[0] + ' --full_model TRUE --write_dataset TRUE --write_model TRUE '

	print("run k-fold cross validation, command:")
	os.system("echo " + pre_final_k_fold_com)
	os.system(pre_final_k_fold_com)

	print("run pre final model importance, command:")
	os.system("echo " + pre_final_write_dataset_imp)
	os.system(pre_final_write_dataset_imp)

	print("run pre final model importance, command:")
	os.system("echo " + pre_final_permutation)
	os.system(pre_final_permutation)

	print("run final model, command:")
	os.system("echo " + final_write_model)
	os.system(final_write_model)

SnakeMake Pandas From line 432 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/predict_w_ranger_model_for_Evaluation.R \
	--csv_location {input.csv} \
	--model_location {input.model} \
	--use_cols_file {input.colnames} \
	--toAS_properties {input.to_AS} \
	--model_location_null {input.model_null} \
	--use_cols_file_null {input.colnames_null} \
	--toAS_properties_null {input.to_AS_null} \
	--output_file {output} \
	--reduced {wildcards.reduced}
	"""

SnakeMake From line 499 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/predict_w_ranger_model_Final.R \
	--csv_location {input.csv} \
	--model_location {input.model_Full} \
	--use_cols_file {input.colnames_Full} \
	--toAS_properties {input.to_AS_Full} \
	--combined_model {input.combined_model} \
	--training_var_ids {input.training_var_ids} \
	--output_file {output} \
	--reduced {wildcards.reduced}
	"""

SnakeMake From line 526 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/create_final_combined_models.R \
	--training_dataset {input.training_dataset} \
	--combined_model {output.combined_model} \
	--training_var_ids {output.training_var_ids}
	"""

SnakeMake From line 548 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/compile_scores.R \
	--prot_folder="results/predicted_prots_eval/" \
	--suffix="_w_AlphScore_red_FALSE.csv.gz" \
	--mapping={input.map_file}
	"""

SnakeMake From line 565 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/analyse_DMS_score.R \
	--variants {input.test_dataset} \
	--validation_set {input.compiled} \
	--out_folder {params.out_folder}
	"""

SnakeMake From line 583 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/Fig_feat_imp.R \
	--input_impurity {input.impurity} \
	--input_permutation {input.permutation} \
	--prefix {params.prefix}  \
	--out_folder {params.out_folder} 

	Rscript workflow/scripts/Fig_auc_cv.R \
	--tsv_location {input.tsv_location} \
	--out_folder {params.out_folder} 
	"""

SnakeMake From line 606 of workflow/Snakefile

shell:
	"workflow/scripts/combine_all_proteins_to_one_file.sh {params.in_folder} {params.out_folder}"

SnakeMake From line 631 of workflow/Snakefile

shell:
	"""
	old_wd=$(pwd)
	mkdir -p {params.out_folder}
	cd {params.out_folder}

	#get clinvar2022
	wget -nc {params.clinvar_adress}

	# filter for benign / pathogenic missense variants
	zcat clinvar_20220109.vcf.gz | grep "^#CHR" | sed "s/#//g" > header_clinvar_vcf.txt
	zcat clinvar_20220109.vcf.gz | grep missense_variant | egrep "CLNSIG=Pathogenic|CLNSIG=Likely_pathogenic" | cat header_clinvar_vcf.txt - | gzip > clinvar_2022_pathogenic.vcf.gz
	zcat clinvar_20220109.vcf.gz | grep missense_variant | egrep "CLNSIG=Benign|CLNSIG=Likely_benign" | cat header_clinvar_vcf.txt - | gzip > clinvar_2022_benign.vcf.gz

	# create list of these clinvar variants, use to extract predicted scores
	zcat clinvar_2022_pathogenic.vcf.gz clinvar_2022_benign.vcf.gz | grep -v "^CHROM" | awk '{{print $1, $2 }}'  >  varlist_clinvar_2022.txt

	cat $old_wd"/"{input.header} | tr "," "\t" > headerPostTabix.txt

	tabix $old_wd"/"{input.values} -R varlist_clinvar_2022.txt | cat headerPostTabix.txt - | gzip > values_of_clinvar_variants.tsv.gz
	"""

SnakeMake tabix From line 649 of workflow/Snakefile

	shell:
		"""
		mkdir -p {params.temp}

		zcat {input.values} | cut -f10 | \
		 uniq -d | uniq > {output.duplicate_list}

		  cat {output.duplicate_list}
		cat {input.header} | tr "," "\t" > {output.tab_header}

 		lines_dupl=$(wc -l {output.duplicate_list} | cut -f1 -d" " )

		if [ $lines_dupl -gt 0 ]
		then

		echo "duplicated variants present"

		zcat {input.values} | grep -f {output.duplicate_list} | \
		 cat {output.tab_header} - | bgzip > {output.duplicated_vars}

		# deduplicate variants
		Rscript workflow/scripts/remove_duplicates_from_final_score.R \
		 --dup_vars {output.duplicated_vars} \
		 --out_file {output.deduplicated_duplicates}

		# merge unique and deduplicated vars
		zcat {input.values} | tail -n +2 | \
		 grep -v -f {output.duplicate_list} | \
		 cat - {output.deduplicated_duplicates} | \
		 sort -V -t, -T {params.temp} -k1 -k2 - | \
		 cat  {output.tab_header} - | \
		 bgzip > {output.all_deduplicated}

		else

		echo "no duplicated variants present"

		touch {output.duplicated_vars}
		touch {output.deduplicated_duplicates}

		zcat {input.values} | tail -n +2 | \
		 cat  {output.tab_header} - | \
		 bgzip > {output.all_deduplicated}

		fi

		tabix -s 1 -b 2 -e 2 {output.all_deduplicated}
		rm -rf {params.temp}				

		"""

SnakeMake tabix From line 686 of workflow/Snakefile

shell:
	"""
	tabix {input.values} -R {input.varlist} | cat {input.tab_header} - | gzip > {output.cv_vars}
	"""

SnakeMake tabix From line 749 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/get_performance_clinvar_2022.R \
	--clinvar_benign {input.cv_ben} \
	--clinvar_pathogenic {input.cv_patho} \
	--AlphaFold_scores {input.cv_scores} \
	--variants {input.variants} \
	--protein_subset {input.membrane_proteins} \
	--out_folder {params.out_folder} 
	"""

SnakeMake From line 769 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/histogramsFinalModel.R \
	--clinvar_benign {input.cv_ben} \
	--clinvar_pathogenic {input.cv_patho} \
	--AlphaFold_scores {input.cv_scores} \
	--variants {input.variants} \
	--out_folder {params.out_folder} 
	"""

SnakeMake From line 793 of workflow/Snakefile

shell:
	"""
	Rscript workflow/scripts/plot_pLDDT_vs_CV_gnomAD.R \
	--variants {input.variants} \
	--out_folder {params.out_folder}
	"""

SnakeMake From line 813 of workflow/Snakefile

Improving pathogenicity prediction of missense variants by using AlphaFold-derived features

Help improve this workflow!

Workflow

Install the requiered dependencies

Run the snakemake pipeline

Code Snippets

Comments

Support

Free

Related Workflows

public

public

public

public

public

public