Genome-to-genome analysis reveals associations between human and mycobacterial genetic variation in tuberculosis patients from Tanzania

public public 1yr ago 0 bookmarks

Preprint: https://doi.org/10.1101/2023.05.11.23289848

Summary Statistics

Full summary statistics: https://doi.org/10.5281/zenodo.6373034

Reported associations: summary_stats

Code

Snak

Code Snippets

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
library(ape)
library(parallel)
GetGeneCoverage <- function(VCF_Files,ref_annot,n_cores){
  pbmclapply(1:length(VCF_Files),function(i){
    cur_vcf <- VCF_Files[i]
    cur_out <- gsub(x=cur_vcf,pattern = '.vcf.gz','.coverage')
    system(glue::glue("bcftools query -f '%POS [ %SDP]\n' {cur_vcf} > {cur_out}"))
  },mc.cores = n_cores)

  gff_file <- read.gff(ref_annot)
  gff_file_genes <- gff_file[gff_file$type=='gene',]

  gene_df <- data.frame(Gene = sapply(gff_file_genes$attributes,function(x) strsplit(x=strsplit(x=x,split = ';')[[1]][3],split = '=')[[1]][2]),start = gff_file_genes$start, end = gff_file_genes$end,length = gff_file_genes$end - gff_file_genes$start + 1 ,stringsAsFactors = F)

  cov_matrix <- matrix(NA,nrow = length(VCF_Files),ncol = nrow(gene_df))
  sample_names <- sapply(VCF_Files,function(x) strsplit(x=strsplit(x=x,split = '/')[[1]][length(strsplit(x=x,split = '/')[[1]])],split = '\\.')[[1]][1])
  rownames(cov_matrix) <- sample_names
  colnames(cov_matrix) <- gene_df$Gene

  for(i in 1:length(VCF_Files)){
    print(i)
    cur_vcf <- VCF_Files[i]
    cur_out <- gsub(x=cur_vcf,pattern = '.vcf.gz','.coverage')

    cur_coverage <- data.table::fread(cur_out)
    gene_cov <- unlist(pbmclapply(1:nrow(gene_df),function(k) {
      start <- gene_df$start[k]
      end = gene_df$end[k]
      return(mean(cur_coverage$V2[cur_coverage$V1 >= start & cur_coverage$V1 <= end]))
    },mc.cores = n_cores))
    cov_matrix[i,] <- gene_cov
    system(glue::glue("rm {cur_out}"))
  }
  return(list(cov_matrix = cov_matrix,gene_df=gene_df))
}

#Get positions for each samples which contain missing call
GetMissingMatrix <- function(missing_files){
  #Get data frame for each sample, containing missing positions
  missing_tbl <- lapply(missing_files,function(x) data.table::fread(x))
  names(missing_tbl) <- sapply(missing_files,function(x) {
    split_path <- strsplit(x=x,split = '/')[[1]]
    file_name <- split_path[length(split_path)]
    ID <- strsplit(file_name,split = '\\.')[[1]][1]
    return(ID)
  })

  uniq_pos <- sort(unique(do.call(rbind, missing_tbl)$V1))
  missing_mat <- matrix(F,nrow = length(uniq_pos),ncol = length(missing_tbl))
  rownames(missing_mat) <- as.character(uniq_pos)
  colnames(missing_mat) <- names(missing_tbl)

  for(i in 1:ncol(missing_mat)){
    missing_mat[as.character(missing_tbl[[i]]$V1),i] <- T
  }
  return(missing_mat)
}
args <- commandArgs(trailingOnly = TRUE) 
vcf_files <- args[grepl(args,pattern = '.vcf.gz')]
missing_files <- args[grepl(args,pattern = '.missing')]
params <- args[!grepl(args,pattern = '.vcf.gz') & !grepl(args,pattern = '.missing')]
ref_annot <- params[[1]]
out_dir <- params[[2]]
n_cores <- as.numeric(params[[3]])
system(glue::glue("mkdir -p {out_dir}"))

#Get average coverage for each gene
# Gene_Result <- GetGeneCoverage(vcf_files,ref_annot,n_cores)
# Gene_Cov_Matrix <- Gene_Resul$cov_matrix
# saveRDS(Gene_Cov_Matrix,glue::glue("{out_dir}/Mtb_gene_coverage.rds"))

#Get missing positions for each sample
missing_mat <- GetMissingMatrix(missing_files)
saveRDS(missing_mat,glue::glue("{out_dir}missing_mat.rds"))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
library(glue)
library(dplyr)
#Convert VCF file to AA list for each sample with non-synonymous variants
WriteAATable <- function(cur_file,locus_to_excl,snpEff,cur_id,out_path){
  DATA_DIR <- paste0(strsplit(cur_file,split = '/')[[1]][1:(length(strsplit(cur_file,split = '/')[[1]]) - 1)],collapse = '/')
  cur_prefix <- strsplit(cur_file,split = '/')[[1]][length(strsplit(cur_file,split = '/')[[1]])]
  cur_file_unzip <- gsub(pattern = '.gz',replacement = '',cur_prefix)
  system(glue::glue("mkdir -p {DATA_DIR}/rep_filt/"))

  out_dir <- gsub(x=out_path,pattern = paste0(cur_id,'.txt'),replacement = '')
  system(glue::glue("mkdir -p {out_dir}"))

  #Loop through all possible reading frames for each nucleotide variant
  cur_annot_table <- list()
  loop_tbl = T
  cnt <- 0
  #Loop through all frames
  while(loop_tbl){
    #Remove SNPs in repetitive regions 
    system(glue::glue("bedtools subtract -header -A -a {cur_file} -b {locus_to_excl} > {DATA_DIR}/rep_filt/{cur_file_unzip}"))
    #Get SNP consequences 
    cur_table <- system(glue::glue("java -jar {snpEff} extractFields {DATA_DIR}/rep_filt/{cur_file_unzip} POS REF ALT ANN[{cnt}].EFFECT ANN[{cnt}].GENE EFF[{cnt}].AA GEN[0].GT GEN[0].FREQ> {DATA_DIR}{cur_file_unzip}.tbl"),intern = T)
    cur_table_parsed <- data.table::fread(glue::glue("{DATA_DIR}{cur_file_unzip}.tbl"),sep = '\t',fill = F)
    colnames(cur_table_parsed) <- c('POS','REF','ALT','EFFECT','GENE','AA_Change','GENOTYPE','FREQ')
    cur_table_parsed$FREQ <- sapply(cur_table_parsed$FREQ,function(x) as.numeric(gsub(x=x,pattern = '%',replacement = '')))
    #End loop if there are no longer have any frames to search on. 
    if(!all(is.na(cur_table_parsed$EFFECT))){
      cnt <- cnt + 1
      cur_annot_table <- c(cur_annot_table,list(cur_table_parsed))
    }else{
      loop_tbl <- F
    }
  }
  #Cleanup
  system(glue::glue("rm {DATA_DIR}{cur_file_unzip}.tbl"))
  if(length(cur_annot_table) == 0){
    system(glue::glue("touch {out_path}"))
    return(NA)
  }
  cur_annot_table_full <- do.call(rbind,cur_annot_table) %>% dplyr::filter(EFFECT != 'synonymous_variant' & AA_Change != '')
  data.table::fwrite(cur_annot_table_full,file = out_path)
}

#Convert VCF file to Nucleotide list for each sample with synonymous variants
WriteNucTableSyn <- function(cur_file,locus_to_excl,snpEff,cur_id,out_path){
  DATA_DIR <- paste0(strsplit(cur_file,split = '/')[[1]][1:(length(strsplit(cur_file,split = '/')[[1]]) - 1)],collapse = '/')
  cur_prefix <- strsplit(cur_file,split = '/')[[1]][length(strsplit(cur_file,split = '/')[[1]])]
  cur_file_unzip <- gsub(pattern = '.gz',replacement = '',cur_prefix)

  system(glue::glue("mkdir -p {DATA_DIR}/rep_filt/"))

  out_dir <- gsub(x=out_path,pattern = paste0(cur_id,'.txt'),replacement = '')
  system(glue::glue("mkdir -p {out_dir}"))

  #Loop through all possible reading frames for each nucleotide variant
  cur_annot_table <- list()
  loop_tbl = T
  cnt <- 0
  #Loop through all frames
  while(loop_tbl){
    #Remove SNPs in repetitive regions 
    system(glue::glue("bedtools subtract -header -A -a {cur_file} -b {locus_to_excl} > {DATA_DIR}/rep_filt/{cur_file_unzip}"))
    #Get SNP consequences 
    cur_table <- system(glue::glue("java -jar {snpEff} extractFields {DATA_DIR}/rep_filt/{cur_file_unzip} POS REF ALT ANN[{cnt}].EFFECT ANN[{cnt}].GENE GEN[0].GT GEN[0].FREQ > {DATA_DIR}{cur_file_unzip}.tbl"),intern = T)
    cur_table_parsed <- data.table::fread(glue::glue("{DATA_DIR}{cur_file_unzip}.tbl"),sep = '\t',fill = F)
    colnames(cur_table_parsed) <- c('POS','REF','ALT','EFFECT','GENE','GENOTYPE','FREQ')
    cur_table_parsed$FREQ <- sapply(cur_table_parsed$FREQ,function(x) as.numeric(gsub(x=x,pattern = '%',replacement = '')))

    #End loop if there are no longer have any frames to search on. 
    if(!all(is.na(cur_table_parsed$EFFECT))){
      cnt <- cnt + 1
      cur_annot_table <- c(cur_annot_table,list(cur_table_parsed))
    }else{
      loop_tbl <- F
    }
  }
  #Cleanup
  system(glue::glue("rm {DATA_DIR}{cur_file_unzip}.tbl"))
  if(length(cur_annot_table) == 0){
    system(glue::glue("touch {out_path}"))
    return(NA)
  }
  cur_annot_table_full <- do.call(rbind,cur_annot_table)
  syn_variants <- dplyr::filter(cur_annot_table_full,EFFECT == 'synonymous_variant')
  data.table::fwrite(syn_variants,file = out_path)
}


args <- commandArgs(trailingOnly = TRUE) 
vcf_file <- args[[1]]
locus_to_excl <- args[[2]]
snpEff <- args[[3]]
cur_id <- args[[4]]
out_path <- args[[5]]
out_path_syn <- args[[6]]

#Get VCF Files and write into AA tables
WriteAATable(vcf_file,locus_to_excl,snpEff,cur_id,out_path)
WriteNucTableSyn(vcf_file,locus_to_excl,snpEff,cur_id,out_path_syn)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
library(glue)
library(stringr)
library(ape)
library(parallel)
library(pbmcapply)
library(filematrix)
library(data.table)

#Filter variants which only appear in one lineage, and show no variability within the lineage.
RemoveStratVariants <- function(AA_Matrix,Lineage_Df){
  samples_to_keep <- intersect(Lineage_Df$G_NUMBER,rownames(AA_Matrix))
  Lineage_Df <- Lineage_Df[match(samples_to_keep,Lineage_Df$G_NUMBER),]
  #Keep samples where Host data exists
  AA_Matrix_filt <- AA_Matrix[samples_to_keep,]
  #Remove Missing Variants/Genes
  AA_Matrix_filt <- AA_Matrix_filt[,!apply(AA_Matrix_filt,2,function(x) all(is.na(x)))]

  #Assign variants to lineages
  strat_table <- lapply(1:ncol(AA_Matrix_filt),function(i) table(ifelse(AA_Matrix_filt[,i]==0,0,1),Lineage_Df$Lineage))
  strat_assng <- vector(mode = 'logical',length = length(strat_table))
  for(i in 1:length(strat_table)){
    cur_strat <- strat_table[[i]]
    MAC_by_lineage <- apply(cur_strat,2,function(x) min(x,na.rm = T))
    #If a variant is perfectly stratified by lineage and there are no variability within a lineage, exclude from analysis
    if(sum(MAC_by_lineage > 0) == 0){
      strat_assng[i] <- F
    }else
      strat_assng[i] <- T
  }

  return(AA_Matrix_filt[,strat_assng])
}

#Merge AA table of each Mtb sequence into a matrix
AATblToMatrix <- function(AA_Tbl_Files,phylo_snps = NULL,sift_table = NULL,sift_thresh = 0.05,het_thresh = het_thresh,n_cores = 10,missing_matrix){
  if(!is.null(sift_table)){
    #Include SNPs with SIFT score < threshold (Deleterious)
    sift_incl <- dplyr::filter(sift_table,SIFT_score > sift_thresh) %>% dplyr::select(POS = `#Position`,REF=Ref_allele,ALT=New_allele) %>% dplyr::filter(REF != ALT)
  }
  #Parse all sample IDs
  all_sample_ids <- sapply(AA_Tbl_Files,function(x) strsplit(x=x,split = '/')[[1]][length(strsplit(x=x,split = '/')[[1]])])
  all_sample_ids <- sapply(all_sample_ids,function(x) gsub(pattern = '.txt',replacement = '',x=x))

  #Order missing matrix
  missing_matrix <- missing_matrix[,all_sample_ids]

  #Get all non-syn variants for each Mtb sample
  all_variants <- pbmclapply(AA_Tbl_Files,function(x){
    tbl <- data.table::fread(x)
    if(nrow(tbl) == 0){
      return(data.frame(ID = character(),Genotype = numeric()))
    }
    if(!is.null(phylo_snps)){
      tbl <- dplyr::anti_join(tbl,phylo_snps,by=c('POS'='POS','REF'='REF','ALT'='ALT'))
    }
    if(!is.null(sift_table)){
      tbl <- dplyr::inner_join(tbl,sift_incl,by=c('POS'='POS','REF'='REF','ALT'='ALT'))
    }
    return(data.frame(ID = paste0(tbl$GENE,':',tbl$POS,':',tbl$AA_Change),Genotype = ifelse(tbl$GENOTYPE=='1/1',2,ifelse(tbl$GENOTYPE!='1/1' & tbl$FREQ > het_thresh,1,0)))) #Homozygotes as 2, heterzygotes (mixed calls) as 1
  },mc.cores = n_cores)
  names(all_variants) <- all_sample_ids

  #Construct AA dosage matrix (0,1,or s)
  uniq_variant_ids <- unique(unlist(lapply(all_variants,function(x) x$ID)))
  uniq_variant_genes <- sapply(uniq_variant_ids,function(x) strsplit(x=x,split = ':')[[1]][1])
  uniq_variant_nuc_pos <- as.numeric(sapply(uniq_variant_ids,function(x) strsplit(x=x,split = ':')[[1]][2]))
  uniq_variant_pos <- sapply(uniq_variant_ids,function(x) str_extract(strsplit(x=x,split = 'p\\.')[[1]][2],"[[:digit:]]+"))
  uniq_variant_df <- data.frame(ID = uniq_variant_ids,Gene = uniq_variant_genes, Pos = as.numeric(uniq_variant_pos),Nuc_pos = uniq_variant_nuc_pos)
  uniq_variant_df_sorted <- dplyr::group_by(uniq_variant_df,Gene) %>% dplyr::arrange(Pos,.by_group = T)
  sorted_variant_ids <- uniq_variant_df_sorted$ID

  #Initialize AA Matrix
  AA_Matrix <- matrix(0,nrow = length(all_variants),ncol = length(sorted_variant_ids))
  rownames(AA_Matrix) <- names(all_variants)
  colnames(AA_Matrix) <- sorted_variant_ids

  #Fill in AA Matrix
  for(i in 1:nrow(AA_Matrix)){
    AA_Matrix[i,all_variants[[i]]$ID] <- all_variants[[i]]$Genotype
    #Set missing positions
    missing_pos <- rownames(missing_matrix)[missing_matrix[,i]==T]
    missing_variants <- dplyr::filter(uniq_variant_df_sorted,Nuc_pos %in% missing_pos)
    AA_Matrix[i,missing_variants$ID] <- NA
  }
  return(AA_Matrix)
}

#Merge Synonymous Nuc table of each Mtb sequence into a matrix
NucSynTblToMatrix <- function(AA_Tbl_Files,phylo_snps = NULL,missing_matrix,het_thresh=het_thresh,n_cores = n_cores){
  #Parse all sample IDs
  all_sample_ids <- sapply(AA_Tbl_Files,function(x) strsplit(x=x,split = '/')[[1]][length(strsplit(x=x,split = '/')[[1]])])
  all_sample_ids <- sapply(all_sample_ids,function(x) gsub(pattern = '.txt',replacement = '',x=x))

  #Get all non-syn variants for each Mtb sample
  all_variants <- pbmclapply(AA_Tbl_Files,function(x){
    tbl <- data.table::fread(x)
    if(nrow(tbl) == 0){
      return(data.frame(ID = character(),Genotype = numeric()))
    }

    if(!is.null(phylo_snps)){
      tbl <- dplyr::anti_join(tbl,phylo_snps,by=c('POS'='POS','REF'='REF','ALT'='ALT'))
    }

    return(data.frame(ID = paste0(tbl$GENE,':',tbl$POS,':',tbl$REF,'>',tbl$ALT),Genotype = ifelse(tbl$GENOTYPE=='1/1',2,ifelse(tbl$GENOTYPE!='1/1' & tbl$FREQ > het_thresh,1,0))))
  },mc.cores = n_cores)
  names(all_variants) <- all_sample_ids

  #Construct AA dosage matrix (0s or 1s)
  uniq_variant_ids <- unique(unlist(lapply(all_variants,function(x) x$ID)))
  uniq_variant_genes <- sapply(uniq_variant_ids,function(x) strsplit(x=x,split = ':')[[1]][1])
  uniq_variant_pos <- sapply(uniq_variant_ids,function(x) strsplit(x=x,split = ':')[[1]][2])
  uniq_variant_df <- data.frame(ID = uniq_variant_ids,Gene = uniq_variant_genes, Pos = as.numeric(uniq_variant_pos))
  uniq_variant_df_sorted <- dplyr::group_by(uniq_variant_df,Gene) %>% dplyr::arrange(Pos,.by_group = T)
  sorted_variant_ids <- uniq_variant_df_sorted$ID

  #Initialize AA Matrix
  AA_Matrix <- matrix(0,nrow = length(all_variants),ncol = length(sorted_variant_ids))
  rownames(AA_Matrix) <- names(all_variants)
  colnames(AA_Matrix) <- sorted_variant_ids


  #Fill in AA Matrix
  for(i in 1:nrow(AA_Matrix)){
    AA_Matrix[i,all_variants[[i]]$ID] <- all_variants[[i]]$Genotype
    #Set missing positions
    missing_pos <- rownames(missing_matrix)[missing_matrix[,i]==T]
    missing_variants <- dplyr::filter(uniq_variant_df_sorted,Pos %in% missing_pos)
    AA_Matrix[i,missing_variants$ID] <- NA
  }
  return(AA_Matrix)
}

GetGeneBurden <- function(AA_Matrix,AA_Matrix_Syn,metadata){
  #Remove SNPs that are lineage markers
  AA_Matrix <- RemoveStratVariants(AA_Matrix,metadata)

  #Get Variant-Gene mapping for all non-syn variants
  non_syn_variants <- colnames(AA_Matrix)
  non_syn_genes <- sapply(non_syn_variants,function(x) strsplit(x=x,split = ':')[[1]][1])
  non_syn_variant_df <- data.frame(SNP = non_syn_variants,Gene = non_syn_genes,stringsAsFactors = F)

  #Get non-syn burden per Gene
  non_syn_genes_uniq <- sort(unique(non_syn_variant_df$Gene))
  non_syn_burden <- matrix(data = NA,nrow = nrow(AA_Matrix),ncol = length(non_syn_genes_uniq))
  rownames(non_syn_burden) <- rownames(AA_Matrix)
  colnames(non_syn_burden) <- non_syn_genes_uniq
  non_syn_burden_homo <- non_syn_burden

  for(i in 1:length(non_syn_genes_uniq)){
    cur_gene <- non_syn_genes_uniq[i]
    cur_burden <- apply(AA_Matrix[,dplyr::filter(non_syn_variant_df,Gene == cur_gene)$SNP,drop = F],1,function(x) sum(x != 0,na.rm = T))
    non_syn_burden[,i] <- cur_burden
  }

  #Get Variant-Gene mapping for all syn variants, where gene contains syn variants
  syn_variants <- colnames(AA_Matrix_Syn)
  syn_genes <- sapply(syn_variants,function(x) strsplit(x=x,split = ':')[[1]][1])
  syn_variant_df <- data.frame(SNP = syn_variants,Gene = syn_genes) %>% dplyr::filter(Gene %in% non_syn_genes_uniq)

  #Get syn burden per Gene
  syn_genes_uniq <- sort(unique(syn_variant_df$Gene))
  syn_burden <- matrix(data = NA,nrow = nrow(AA_Matrix_Syn),ncol = length(syn_genes_uniq))
  rownames(syn_burden) <- rownames(AA_Matrix_Syn)
  colnames(syn_burden) <- syn_genes_uniq
  syn_burden_homo <- syn_burden

  for(i in 1:length(syn_genes_uniq)){
    cur_gene <- syn_genes_uniq[i]
    cur_burden <- apply(AA_Matrix_Syn[,dplyr::filter(syn_variant_df,Gene == cur_gene)$SNP,drop = F],1,function(x) sum(x!=0,na.rm = T))
    syn_burden[,i] <- cur_burden

  }

  return(list(non_syn_burden=non_syn_burden,syn_burden=syn_burden,non_syn_variant_df=non_syn_variant_df,syn_variant_df=syn_variant_df))
}

args <- commandArgs(trailingOnly = TRUE)
AA_variant_files <- args[grepl(args,pattern = 'AA_')]
Syn_variant_files <- args[grepl(args,pattern = 'Syn_')]
params <- args[!grepl(args,pattern = 'AA_') & !grepl(args,pattern = 'Syn_')]

missing_mat <- readRDS(params[[1]])
phylo_snps <- params[[2]]
del_tbl <- data.table::fread(params[[3]])
sift_path <- params[[4]]
IS_SIFT <- as.logical(params[[5]])
IS_Burden <- as.logical(params[[6]])
IS_Deletion <- as.logical(params[[7]])
out_path <- params[[8]]
metadata <- data.table::fread(params[[9]])
het_thresh = as.numeric(params[[10]])
n_cores <- as.numeric(params[[11]])

# missing_mat <- readRDS('../scratch/Mtb_Coverage/HomoOnly_True/missing_mat.rds')
# phylo_snps <- '../data/Mtb/PositionsPhylogeneticSNPs_20171004.txt'
# del_tbl <- data.table::fread('../data/Mtb/binary_table_genes_Sinergia_final_dataset_human_bac_genome_available.txt')
# sift_path <- '../data/Mtb/SIFT/GCA_000195955.2.22/Chromosome.gz'
# IS_SIFT <- T
# IS_Burden <- T
# IS_Deletion <- F
# out_path <- '../scratch//Mtb_Var_Tbl.rds'
# metadata <- data.table::fread('../data/pheno/metadata_Sinergia_final_dataset_human_bac_genome_available_QCed.txt')
# n_cores <- 1
# het_thresh <- 10
# AA_variant_files <- paste0('../scratch/AA_HomoOnly_True/',metadata$G_NUMBER,'.txt')
# Syn_variant_files <- paste0('../scratch/Syn_HomoOnly_True/',metadata$G_NUMBER,'.txt')


if(is.na(IS_SIFT) | is.na(IS_Burden) | is.na(IS_Deletion)){
  stop('Please specify options')
}
phylo_snps <- data.table::fread(phylo_snps) %>%
  dplyr::filter(PhylogeneticSNP %in% c('PhyloMarkerANCL1','PhyloMarkerANCL234','PhyloMarkerL1','PhyloMarkerL2','PhyloMarkerL3','PhyloMarkerL4')) %>%
  dplyr::select(POS = Position_ref,REF = anc, ALT = derived)

if(!IS_Burden){
  #Convert AA Tables into a Matrix
  AA_Matrix <- AATblToMatrix(AA_Tbl_Files = AA_variant_files,phylo_snps=phylo_snps,missing_matrix=missing_mat,het_thresh = het_thresh,n_cores = n_cores)
  #Add deleted genes as variants
  if(IS_Deletion){
    del_matrix <- as.matrix(del_tbl[,-c('GNUMBER','LINEAGE','NUMBER_OF_DELETED_GENES')])
    del_matrix[del_matrix==1] <- 2 #Set all deletions as Homo calls
    colnames(del_matrix) <- paste0(colnames(del_matrix),':NA:del')
    rownames(del_matrix) <- del_tbl$GNUMBER

    matrix_to_append <- matrix(NA,nrow = nrow(AA_Matrix),ncol = ncol(del_matrix))
    rownames(matrix_to_append) <- rownames(AA_Matrix)
    colnames(matrix_to_append) <- colnames(del_matrix)

    matrix_to_append <- del_matrix[rownames(AA_Matrix),]
    AA_Matrix <- cbind(AA_Matrix,matrix_to_append)
  }
}else if(IS_Burden){
  if(IS_SIFT){
    sift_table <- data.table::fread(cmd = glue::glue("zcat {sift_path}"))
    AA_Matrix_NonSyn <- AATblToMatrix(AA_Tbl_Files = AA_variant_files,phylo_snps=phylo_snps,sift_table = sift_table,missing_matrix=missing_mat,het_thresh = het_thresh,n_cores = n_cores)
  }else{
    AA_Matrix_NonSyn <- AATblToMatrix(AA_Tbl_Files = AA_variant_files,phylo_snps=phylo_snps,missing_matrix=missing_mat,het_thresh = het_thresh,n_cores = n_cores)
  }
  AA_Matrix_Syn <- NucSynTblToMatrix(AA_Tbl_Files = Syn_variant_files,phylo_snps=phylo_snps,missing_matrix = missing_mat,het_thresh = het_thresh,n_cores = n_cores)
  if(IS_Deletion){
    del_matrix <- as.matrix(del_tbl[,-c('GNUMBER','LINEAGE','NUMBER_OF_DELETED_GENES')])
    del_matrix[del_matrix==1] <- 2 #Set all deletions as Homo calls

    colnames(del_matrix) <- paste0(colnames(del_matrix),':NA:del')
    rownames(del_matrix) <- del_tbl$GNUMBER

    matrix_to_append <- matrix(NA,nrow = nrow(AA_Matrix_NonSyn),ncol = ncol(del_matrix))
    rownames(matrix_to_append) <- rownames(AA_Matrix_NonSyn)
    colnames(matrix_to_append) <- colnames(del_matrix)

    matrix_to_append <- del_matrix[rownames(AA_Matrix_NonSyn),]
    AA_Matrix_NonSyn <- cbind(AA_Matrix_NonSyn,matrix_to_append)
  }
  Gene_Burden <- GetGeneBurden(AA_Matrix = AA_Matrix_NonSyn,AA_Matrix_Syn = AA_Matrix_Syn,metadata=metadata)
  AA_Matrix <- Gene_Burden

}
saveRDS(AA_Matrix,file = out_path)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
library(pbmcapply)
library(dplyr)
library(data.table)
library(glue)
library(stringr)
set.seed(12345)

GetResults <- function(OUT_DIR,suffix = 'glm.logistic.hybrid',p_thresh=5e-8,n_cores=5,is_interaction = F,is_ordinal = F,tool = 'PLINK'){
  all_files <- dir(OUT_DIR)
  result_files <- all_files[sapply(all_files,function(x) grepl(pattern = suffix,x=x))]
  if(!is_interaction){
    if((tool == 'PLINK' | tool == 'PLINK-FIRTH') & !grepl(x=suffix,pattern = 'gz')){
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("awk '{ if (NR == 1 || $13 <= ",p_thresh,") {print} }' ",OUT_DIR,x)),mc.cores = n_cores)
    }else if ((tool == 'PLINK' | tool == 'PLINK-FIRTH') & grepl(x=suffix,pattern = 'gz')){
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x," | awk '{ if (NR == 1 || $13 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores)
    }else if(tool == 'HLA-PLINK' ){
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x)),mc.cores = n_cores)
    }else if(tool == 'HLA-PLINK-PERM' ){
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x),select = c('ID','P')),mc.cores = n_cores)
    }
  }else{
    if(is_ordinal){
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("zcat ",OUT_DIR,x," | awk -F ',' '{ if ($6 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores)
    }else{
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("zcat ",OUT_DIR,x," | grep 'ADDx' | awk '{ if ($12 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores)
    }
  }
  names(results) <- result_files
  return(results)
}

RunG2G <- function(G2G_Obj,SOFTWARE_DIR,OUT_DIR,tool,n_PC = 5,n_pPC = 6,n_cores = 20,covars_to_incl = c(),
                   lineage = c(),stratified = T,debug=F,chr = c(),test_snp = c(),model = NA,var_type = 'both'){
  system(glue::glue("mkdir -p {OUT_DIR}"))
  OUT_DIR <- glue::glue("{OUT_DIR}/{tool}/")
  system(glue::glue("mkdir -p {OUT_DIR}"))
  OUT_DIR <- glue::glue("{OUT_DIR}/PC_{n_PC}_pPC_{n_pPC}/")
  system(glue::glue("mkdir -p {OUT_DIR}"))
  # if(length(covars_to_incl) > 0){
  #   OUT_DIR <- glue::glue("{OUT_DIR}/PC_{n_PC}_pPC_{n_pPC}_cov_{paste0(sapply(covars_to_incl,function(x) gsub(x=x,pattern='_',replacement='')),collapse = '-')}/")
  # }else{
  #   OUT_DIR <- glue::glue("{OUT_DIR}/PC_{n_PC}_pPC_{n_pPC}/")
  # }
  OUT_DIR <- glue::glue("{OUT_DIR}/Stratified_{str_to_title(as.character(stratified))}/")
  system(glue::glue("mkdir -p {OUT_DIR}"))

  #Initialize results object
  saveRDS(list(),glue::glue("{OUT_DIR}/G2G_results.rds"))

  if (tool == 'PLINK' | tool == 'PLINK-FIRTH' | tool == 'HLA-PLINK' | tool == 'HLA-PLINK-PERM'){
    if(length(lineage)==0 & stratified){
      lineages_to_run <- unique(names(G2G_Obj$aa_matrix_filt))
    }else if(length(lineage) > 0 & stratified){
      lineages_to_run <- lineage
    }else if(!stratified){
      lineages_to_run <- 'ALL'
    }
    for(i in 1:length(lineages_to_run)){
      cur_lineage <- lineages_to_run[i]
      OUT_PATH_Lineage <- glue::glue("{OUT_DIR}/LINEAGE_{cur_lineage}/")
      system(glue::glue("mkdir -p {OUT_PATH_Lineage}"))
      system(glue::glue("mkdir -p {OUT_PATH_Lineage}/tmp"))

      #Get IDD and FID, ensure they're in correct order
      fam_file <- data.table::fread(glue::glue("{G2G_Obj$host_path[cur_lineage]}.fam"))
      colnames(fam_file)[1:2] <- c('FID','IID')
      if(!all(fam_file$IID == G2G_Obj$both_IDs_to_keep[[cur_lineage]]$FAM_ID)){
        stop('Sample order incorrect')
      }

      #Extract AA matrix for current lineage
      if(stratified){
        cur_aa_matrix_filt <- G2G_Obj$aa_matrix_filt[[cur_lineage]]
      }else{
        cur_aa_matrix_filt <- G2G_Obj$aa_matrix_full
      }

      #Extract/convert dosage
      #If burden, > 1 (more than one variant in gene) also as present
      #Var_Type = Both, treat homo and hetero calls as present
      #Var_Type = Homo, treat homo calls as present, hetero calls as missing (encoded as NA)
      if(grepl(pattern = 'Burden_True',x=OUT_DIR)){
        cur_aa_matrix_filt[cur_aa_matrix_filt > 1] <- 1
      }
      else if(var_type == 'both' | var_type == 'homo'){
        cur_aa_matrix_filt[cur_aa_matrix_filt==2] <- 1
      }else{
        stop('Invalid Var Type')
      }

      #Append IDs to matrix
      cur_aa_Matrix_mat <- cbind(fam_file[,c(1,2)],as.matrix(cur_aa_matrix_filt))
      colnames(cur_aa_Matrix_mat) <- c('FID','IID',colnames(cur_aa_matrix_filt))

      data.table::fwrite(cur_aa_Matrix_mat,col.names = T,row.names = F,sep = ' ',file = glue::glue("{OUT_PATH_Lineage}/tmp/AA_outcome.txt"),na = 'NA',quote = F)

      #Store PCs and covars
      host_PCs <- G2G_Obj$host_PCs[[cur_lineage]]
      host_PCs <- dplyr::select(host_PCs,'IID',paste0('PC',1:n_PC))

      if(n_pPC != 0){
        pPCs <- G2G_Obj$vir_pPCs[[cur_lineage]] %>% dplyr::inner_join(G2G_Obj$both_IDs_to_keep[[cur_lineage]]) %>% dplyr::rename(IID=FAM_ID)
        pPCs <-dplyr::select(pPCs,IID,paste0('PC',1:n_pPC))
        colnames(pPCs) <- c('IID',paste0('pPC',1:n_pPC))
      }
      cur_covars_to_incl <- covars_to_incl
      #Don't include lineage if only running in single lineage.
      if((nrow(str_locate_all(pattern = 'L',cur_lineage)[[1]]) == 1) & cur_lineage != 'ALL'){
        cur_covars_to_incl <- setdiff(covars_to_incl,'LINEAGE')
      }

      if(length(cur_covars_to_incl) > 0){
        covars_num_discrete <- G2G_Obj$covars[[cur_lineage]]
        #Add lineage as covariate if more than one lineage in current group
        if(stratified){
          if('LINEAGE' %in% cur_covars_to_incl & (nrow(str_locate_all(pattern = 'L',cur_lineage)[[1]]) > 1 | cur_lineage == 'ALL')){
            lineage_df <- G2G_Obj$both_IDs_to_keep[[cur_lineage]] %>% dplyr::select(IID=FAM_ID,LINEAGE)
            lineage_df$LINEAGE <- as.factor(lineage_df$LINEAGE)
            lineage_df <- one_hot(as.data.table(lineage_df),cols = 'LINEAGE')
            lineage_df <- lineage_df %>% dplyr::select(-colnames(lineage_df)[ncol(lineage_df)])

            covars_num_discrete <- covars_num_discrete %>% dplyr::left_join(lineage_df)
          }
        }

        covars_num_discrete <- dplyr::select(covars_num_discrete,'IID',contains(cur_covars_to_incl))
        covars_num_discrete[is.na(covars_num_discrete)] <- 'NONE'

        if(n_pPC != 0){
          covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% 
            dplyr::left_join(pPCs,by=c('IID'='IID')) %>% 
            dplyr::left_join(covars_num_discrete,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID)
        }else{
          covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% 
            dplyr::left_join(covars_num_discrete,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID)
        }
      }else{
        if(n_pPC != 0){
          covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% 
            dplyr::left_join(pPCs,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID)
        }else{
          covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID)
        }
      }
      covars[covars==''] <- NA
      data.table::fwrite(covars,col.names = F,row.names = F,sep = '\t',file = glue::glue("{OUT_PATH_Lineage}/tmp/plink-covars.txt"),na = 'NA',quote = F)

      #Run association study for each AA variant in the current lineage
      AA_Matrix_No_ID <- cur_aa_Matrix_mat[,-c(1,2)]
      if(debug){
        end_index <- min(c(ncol(AA_Matrix_No_ID),5))
      }else{
        end_index <- ncol(AA_Matrix_No_ID)
      }

      #Generate Permutations if specified
      if(tool == 'HLA-PLINK-PERM'){
        N_Perm = 100

        Alleles_Path <- gsub(x=G2G_Obj$host_path[cur_lineage],pattern = 'G2G',replacement = 'HLA_Alleles_G2G')
        AA_Path <- gsub(x=G2G_Obj$host_path[cur_lineage],pattern = 'G2G',replacement = 'HLA_AA_G2G')
        system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_Allele/"))
        system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_AA/"))

        true_fam_file <- data.table::fread(glue::glue("{Alleles_Path}.fam"))

        perm_fam_file <- lapply(1:N_Perm,function(x) true_fam_file[sample(1:nrow(true_fam_file),size = nrow(true_fam_file),replace = F),])

        lapply(1:N_Perm,function(x) data.table::fwrite(perm_fam_file[[x]],glue::glue("{OUT_PATH_Lineage}/perm_{x}.fam"),sep = '\t',col.names = F,row.names = F))
      }


      pbmclapply(1:end_index,function(k){
        cur_pathogen_variant <- colnames(AA_Matrix_No_ID)[k]
        if(tool == 'PLINK'){
          if(is.na(model)){
              tryCatch(system(
                glue::glue(
                  "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {G2G_Obj$host_path[cur_lineage]} --glm cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}{cur_pathogen_variant}"
                )
              ))
          }else{
            tryCatch(system(
              glue::glue(
                "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {G2G_Obj$host_path[cur_lineage]} --glm {model} cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}{cur_pathogen_variant}.{model}"
              )
            ))

          }
          system(glue::glue('pigz --fast {OUT_PATH_Lineage}{cur_pathogen_variant}*.hybrid'))

        }else if(tool == 'HLA-PLINK'){
          Alleles_Path <- gsub(x=G2G_Obj$host_path[cur_lineage],pattern = 'G2G',replacement = 'HLA_Alleles_G2G')
          AA_Path <- gsub(x=G2G_Obj$host_path[cur_lineage],pattern = 'G2G',replacement = 'HLA_AA_G2G')
          system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_Allele/"))
          system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_AA/"))

          tryCatch(system(
            glue::glue(
              "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {Alleles_Path} --glm dominant cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}/HLA_Allele/{cur_pathogen_variant}"
            )
          ))

          tryCatch(system(
            glue::glue(
              "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {AA_Path} --glm dominant cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}/HLA_AA/{cur_pathogen_variant}"
            )
          ))
          system(glue::glue('pigz --fast {OUT_PATH_Lineage}/HLA_Allele/{cur_pathogen_variant}*.hybrid'))
          system(glue::glue('pigz --fast {OUT_PATH_Lineage}/HLA_AA/{cur_pathogen_variant}*.hybrid'))

        }else if(tool == 'HLA-PLINK-PERM'){
          for (q in 1:N_Perm){
            system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_Allele/Perm_{q}/"))
            system(glue::glue("mkdir -p {OUT_PATH_Lineage}/HLA_AA/Perm_{q}/"))


            tryCatch(system(
              glue::glue(
                "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {Alleles_Path} --fam {OUT_PATH_Lineage}/perm_{q}.fam --glm dominant cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}/HLA_Allele/Perm_{q}/{cur_pathogen_variant}"
              )
            ))

            tryCatch(system(
              glue::glue(
                "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {AA_Path} --fam {OUT_PATH_Lineage}/perm_{q}.fam --glm dominant cc-residualize hide-covar no-x-sex --covar-variance-standardize --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}/HLA_AA/Perm_{q}/{cur_pathogen_variant}"
              )
            ))
            system(glue::glue('pigz --fast {OUT_PATH_Lineage}/HLA_Allele/Perm_{q}/{cur_pathogen_variant}*.hybrid'))
            system(glue::glue('pigz --fast {OUT_PATH_Lineage}/HLA_AA/Perm_{q}/{cur_pathogen_variant}*.hybrid'))
          }
        }else if(tool == 'PLINK-FIRTH'){
          tryCatch(system(
            glue::glue(
              "~/Software/plink2 --threads {min(c(n_cores,22))} --bfile {G2G_Obj$host_path[cur_lineage]} --no-sex --glm firth hide-covar --1 --pheno {OUT_PATH_Lineage}/tmp/AA_outcome.txt --pheno-col-nums {k+2} --covar {OUT_PATH_Lineage}/tmp/plink-covars.txt --out {OUT_PATH_Lineage}{cur_pathogen_variant}"
            )
          ))
        }
      },mc.cores = max(floor(n_cores / 22),1))

      if(tool == 'PLINK'){
        results <- list(GetResults(OUT_PATH_Lineage,suffix = 'glm.logistic.hybrid.gz',p_thresh=5e-8,n_cores=n_cores,is_interaction = F,is_ordinal = F,tool = tool))
        names(results) <- cur_lineage
        all_res <- readRDS(glue::glue("{OUT_DIR}/G2G_results.rds"))
        all_res <- c(all_res,results)
        saveRDS(all_res,glue::glue("{OUT_DIR}/G2G_results.rds"))
      }else if(tool == 'HLA-PLINK'){
        results_allele <- list(GetResults(paste0(OUT_PATH_Lineage,'HLA_Allele/'),suffix = 'glm.logistic.hybrid.gz',p_thresh=1,n_cores=n_cores,is_interaction = F,is_ordinal = F,tool = tool))
        names(results_allele) <- cur_lineage

        results_AA <- list(GetResults(paste0(OUT_PATH_Lineage,'HLA_AA/'),suffix = 'glm.logistic.hybrid.gz',p_thresh=1,n_cores=n_cores,is_interaction = F,is_ordinal = F,tool = tool))
        names(results_AA) <- cur_lineage

        all_res <- readRDS(glue::glue("{OUT_DIR}/G2G_results.rds"))
        all_res <- c(all_res,list(HLA_Allele = results_allele,HLA_AA = results_AA))
        saveRDS(all_res,glue::glue("{OUT_DIR}/G2G_results.rds"))
      }
    }
  }
}

args <- commandArgs(trailingOnly = TRUE)
G2G_Obj <- readRDS(args[[1]])
OUT_DIR <- args[[2]]
tool <- args[[3]]
n_PC <- as.numeric(args[[4]])
n_pPC <- as.numeric(args[[5]])
covars_to_incl <- args[[6]]
if(covars_to_incl == 'NA'){
  covars_to_incl <- c()
}else{
  covars_to_incl <- strsplit(covars_to_incl,split = ',')[[1]]
}
stratified <- as.logical(args[[7]])
homo_only <- as.logical(args[[8]])
n_cores <- as.numeric(args[[9]])

# G2G_Obj <- readRDS('../scratch/Burden_False_SIFT_False_Del_False_HomoOnly_True/G2G_Obj.rds')
# OUT_DIR <- '../results/Burden_False_SIFT_False_Del_True_HomoOnly_True/'
# tool <- 'PLINK'
# n_PC <- 3
# n_pPC <- 0
# covars_to_incl <- c('patient_sex')
# if(covars_to_incl == 'NA'){
#   covars_to_incl <- c()
# }else{
#   covars_to_incl <- strsplit(covars_to_incl,split = ',')[[1]]
# }
# stratified <- F
# homo_only <- T
# n_cores <- 10

RunG2G(G2G_Obj,SOFTWARE_DIR,OUT_DIR,tool = tool,n_cores = n_cores,n_PC = n_PC,n_pPC = n_pPC,debug = F,covars_to_incl = covars_to_incl,
       model = NA,stratified = stratified,var_type = ifelse(homo_only,'homo','both'))
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
library(pbmcapply)
library(dplyr)
library(data.table)
library(glue)
library(stringr)

GetResults <- function(OUT_DIR,suffix = 'glm.logistic.hybrid',p_thresh=5e-8,n_cores=5,is_interaction = F,is_ordinal = F,tool = 'PLINK'){
  all_files <- dir(OUT_DIR)
  result_files <- all_files[sapply(all_files,function(x) grepl(pattern = suffix,x=x))]
  if(!is_interaction){
    if((tool == 'PLINK' | tool == 'PLINK-FIRTH') & !grepl(x=suffix,pattern = 'gz')){
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("awk '{ if (NR == 1 || $13 <= ",p_thresh,") {print} }' ",OUT_DIR,x)),mc.cores = n_cores)
    }else if ((tool == 'PLINK' | tool == 'PLINK-FIRTH') & grepl(x=suffix,pattern = 'gz')){
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x," | awk '{ if (NR == 1 || $13 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores)
    }else if(tool == 'GMMAT-SCORE'){
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0('zcat ',OUT_DIR,x," | awk '{ if (NR == 1 || $11 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores)
    }
  }else{
    if(is_ordinal){
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("zcat ",OUT_DIR,x," | awk -F ',' '{ if ($6 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores)
    }else{
      results <- pbmclapply(result_files,function(x) data.table::fread(cmd = paste0("zcat ",OUT_DIR,x," | grep 'ADDx' | awk '{ if ($12 <= ",p_thresh,") {print} }' ")),mc.cores = n_cores)
    }
  }
  names(results) <- result_files
  return(results)
}

RunInteraction <- function(G2G_Obj,OUT_DIR,tool = 'PLINK',n_PC = 3,n_pPC = 0,n_cores = 20,covars_to_incl = c(),lineage = c(),by_lineage = F,stratified = T,debug=F,model = NA,var_type = 'both',pheno = 'TB_score'){
  OUT_DIR <- glue::glue("{OUT_DIR}/{tool}/")
  system(glue::glue("mkdir -p {OUT_DIR}"))
  OUT_DIR <- glue::glue("{OUT_DIR}/PC_{n_PC}_pPC_{n_pPC}/")
  system(glue::glue("mkdir -p {OUT_DIR}"))
  OUT_DIR <- glue::glue("{OUT_DIR}/Stratified_{str_to_title(as.character(stratified))}/")
  system(glue::glue("mkdir -p {OUT_DIR}"))

  saveRDS(list(),glue::glue("{OUT_DIR}/{pheno}_int_results.rds"))

  if(length(lineage)==0 & stratified){
    lineages_to_run <- unique(names(G2G_Obj$aa_matrix_filt))
  }else if(length(lineage) > 0 & stratified){
    lineages_to_run <- lineage
  }else if(!stratified){
    lineages_to_run <- 'ALL'
  }

  for(i in 1:length(lineages_to_run)){
    cur_lineage <- lineages_to_run[i]
    OUT_DIR_Lineage <- glue::glue("{OUT_DIR}/LINEAGE_{cur_lineage}/")
    system(glue::glue("mkdir -p {OUT_DIR_Lineage}"))
    system(glue::glue("mkdir -p {OUT_DIR_Lineage}/tmp"))

    #Get IDD and FID, ensure they're in correct order
    fam_file <- data.table::fread(glue::glue("{G2G_Obj$host_path[cur_lineage]}.fam"))
    colnames(fam_file)[1:2] <- c('FID','IID')
    if(!all(fam_file$IID == G2G_Obj$both_IDs_to_keep[[cur_lineage]]$FAM_ID)){
      stop('Sample order incorrect')
    }

    #Store AA matrix
    if(!by_lineage & stratified){
      cur_aa_matrix_filt <- G2G_Obj$aa_matrix_filt[[cur_lineage]]
    }else if(!by_lineage & !stratified){
      cur_aa_matrix_filt <- G2G_Obj$aa_matrix_full
    }
    else if(by_lineage){
      #SNP x LINEAGE interaction
      cur_aa_matrix_filt <- as.matrix(one_hot(as.data.table(data.frame(LINEAGE = G2G_Obj$vir_pPCs$ALL$LINEAGE))))
    }

    #Extract/convert dosage
    #Var_Type = Both, treat homo and hetero calls as present
    #Var_Type = Homo, treat homo calls as present, hetero calls as absent
    if(var_type == 'both'){
      cur_aa_matrix_filt[cur_aa_matrix_filt==2] <- 1
    }else if (var_type == 'homo'){
      cur_aa_matrix_filt[cur_aa_matrix_filt==1] <- 0
      cur_aa_matrix_filt[cur_aa_matrix_filt==2] <- 1
    }else{
      stop('Invalid Var Type')
    }

    #Store PCs and covars
    host_PCs <- G2G_Obj$host_PCs[[cur_lineage]]
    host_PCs <- dplyr::select(host_PCs,'IID',paste0('PC',1:n_PC))

    if(n_pPC != 0){
      pPCs <- G2G_Obj$vir_pPCs[[cur_lineage]]
      pPCs <-dplyr::select(pPCs,IID,paste0('PC',1:n_pPC))
      colnames(pPCs) <- c('IID',paste0('pPC',1:n_pPC))
    }
    if(length(covars_to_incl) > 0){
      covars_num_discrete <- G2G_Obj$covars[[cur_lineage]]
      covars_num_discrete <- dplyr::select(covars_num_discrete,'IID',covars_to_incl)
      if(n_pPC != 0){
        covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% 
          dplyr::left_join(pPCs,by=c('IID'='IID')) %>% 
          dplyr::left_join(covars_num_discrete,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID)
      }else{
        covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% 
          dplyr::left_join(covars_num_discrete,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID)
      }
    }else{
      if(n_pPC != 0){
        covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% 
          dplyr::left_join(pPCs,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID)
      }else{
        covars <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),host_PCs,by=c('IID'='IID')) %>% dplyr::relocate(FID,IID)
      }
    }
    #Change categorial binary covariates to numeric (0 and 1)
    cat_covars <- which(sapply(3:ncol(covars),function(q) any(is.character(as.vector(t(covars[,..q]))))))
    if(length(cat_covars) > 0){
      for(col in (cat_covars+2)){
        covars[,col] <- as.integer(as.factor(t(covars[,..col]))) - 1
      }
    }

    #Write out Phenotype
    if(pheno == 'TB_score'){
      tb_score <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),G2G_Obj$tb_score[[cur_lineage]] %>% dplyr::select(IID,TB_score),by=c('IID'='IID')) %>% dplyr::relocate(FID,IID,TB_score)
      data.table::fwrite(tb_score,col.names = T,quote = F,row.names = F,sep = ' ',file = glue::glue("{OUT_DIR_Lineage}/tmp/TB_score.txt"),na = 'NA')
    }else if(pheno == 'Xray_score'){
      xray_score <- dplyr::left_join(fam_file %>% dplyr::select(FID,IID),G2G_Obj$xray_score[[cur_lineage]] %>% dplyr::select(IID,Xray_score),by=c('IID'='IID')) %>% dplyr::relocate(FID,IID,Xray_score)
      data.table::fwrite(xray_score,col.names = T,quote = F,row.names = F,sep = ' ',file = glue::glue("{OUT_DIR_Lineage}/tmp/Xray_score.txt"),na = 'NA')

    }else{
      stop('Invalid Pheno')
    }


    #Run GWAS with lineage as covariate
    if(by_lineage){
      cur_covar <- cbind(covars[,1:2],data.frame(LINEAGE = G2G_Obj$vir_pPCs$ALL$LINEAGE),covars[,-c(1,2)])
      data.table::fwrite(cur_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/lineage_covar.txt"))

      system(
        glue::glue(
          "plink2 --threads {n_cores} --bfile {G2G_Obj$host_path[cur_lineage]} --covar-variance-standardize --linear no-x-sex --pheno {OUT_DIR_Lineage}/tmp/{pheno}.txt --covar {OUT_DIR_Lineage}/tmp/lineage_covar.txt --out {OUT_DIR_Lineage}Lineage_GWAS"
        )
      )

      discrete_covar <- dplyr::select(cur_covar,c('FID','IID',intersect(colnames(cur_covar),c('Patient_Sex','HIV_Status','LINEAGE'))))
      num_covar <- dplyr::select(cur_covar,c('FID','IID',setdiff(colnames(cur_covar),colnames(discrete_covar))))
      data.table::fwrite(discrete_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/lineage_discrete_covar.txt"))
      data.table::fwrite(num_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/lineage_num_covar.txt"))
      system(
        glue::glue(
          "gcta64 --bfile {G2G_Obj$host_path[cur_lineage]} --make-grm --out {G2G_Obj$host_path[cur_lineage]} --thread-num {n_cores}"
        )
      )

      system(
        glue::glue(
          "gcta64 --mlma --bfile {G2G_Obj$host_path[cur_lineage]} --grm {G2G_Obj$host_path[cur_lineage]} --autosome --pheno {OUT_DIR_Lineage}/tmp/{pheno}.txt --covar {OUT_DIR_Lineage}/tmp/lineage_discrete_covar.txt --qcovar {OUT_DIR_Lineage}/tmp/lineage_num_covar.txt --thread-num {n_cores} --out {OUT_DIR_Lineage}Lineage_GWAS"
        )
      )
    }
    if(debug){
      end_index <- min(c(ncol(cur_aa_matrix_filt),2))
    }else{
      end_index <- ncol(cur_aa_matrix_filt)
    }

    for(k in 1:end_index){
      tryCatch({
        if(by_lineage){
          cur_pathogen_variant <- colnames(cur_aa_matrix_filt)[k]
          cur_covar <- cbind(covars[,1:2],cur_aa_matrix_filt[,k,drop=FALSE],cur_aa_matrix_filt[,-k][,-1],covars[,-c(1,2)])
          data.table::fwrite(cur_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/{cur_pathogen_variant}.txt"))

          system(
            glue::glue(
              "plink2 --threads {n_cores} --bfile {G2G_Obj$host_path[cur_lineage]} --covar-variance-standardize --linear no-x-sex interaction --parameters 1-{ncol(cur_covar)} --pheno {OUT_DIR_Lineage}/tmp/{pheno}.txt --covar {OUT_DIR_Lineage}/tmp/{cur_pathogen_variant}.txt --out {OUT_DIR_Lineage}{cur_pathogen_variant}"
            )
          )
        }else{
          cur_pathogen_variant <- colnames(cur_aa_matrix_filt)[k]
          cur_covar <- cbind(covars[,1:2],cur_aa_matrix_filt[,k,drop=FALSE],covars[,-c(1,2)])
          data.table::fwrite(cur_covar,col.names = T,quote = F,row.names = F,sep = ' ',na = 'NA',file = glue::glue("{OUT_DIR_Lineage}/tmp/{cur_pathogen_variant}.txt"))

          system(
            glue::glue(
              "plink2 --threads {n_cores} --bfile {G2G_Obj$host_path[cur_lineage]} --covar-variance-standardize --linear no-x-sex interaction --parameters 1-{ncol(cur_covar)} --pheno {OUT_DIR_Lineage}/tmp/{pheno}.txt --covar {OUT_DIR_Lineage}/tmp/{cur_pathogen_variant}.txt --out {OUT_DIR_Lineage}{cur_pathogen_variant}"
            )
          )


          system(glue::glue("grep 'ADD' {OUT_DIR_Lineage}{cur_pathogen_variant}.{pheno}.glm.linear > {OUT_DIR_Lineage}{cur_pathogen_variant}.{pheno}.glm.linear.add"))
          system(glue::glue("pigz --fast {OUT_DIR_Lineage}{cur_pathogen_variant}.{pheno}.glm.linear.add"))
          system(glue::glue("rm {OUT_DIR_Lineage}{cur_pathogen_variant}.{pheno}.glm.linear"))
        }
      }
      )
    }
    results <- list(GetResults(OUT_DIR_Lineage,suffix = glue::glue('{pheno}.glm.linear.add.gz'),p_thresh = 5e-8,n_cores = n_cores,is_interaction = T,is_ordinal = F,tool = tool))
    names(results) <- cur_lineage
    all_res <- readRDS(glue::glue("{OUT_DIR}/{pheno}_int_results.rds"))
    all_res <- c(all_res,results)
    saveRDS(all_res,glue::glue("{OUT_DIR}/{pheno}_int_results.rds"))

  }

}

args <- commandArgs(trailingOnly = TRUE)
G2G_Obj <- readRDS(args[[1]])
OUT_DIR <- args[[2]]
tool <- args[[3]]
n_PC <- as.numeric(args[[4]])
n_pPC <- as.numeric(args[[5]])
covars_to_incl <- args[[6]]
if(covars_to_incl == 'NA'){
  covars_to_incl <- c()
}else{
  covars_to_incl <- strsplit(covars_to_incl,split = ',')[[1]]
}
stratified <- as.logical(args[[7]])
homo_only <- as.logical(args[[8]])
pheno <- args[[9]]
n_cores <- as.numeric(args[[10]])

# G2G_Obj <- readRDS('../scratch/Burden_False_SIFT_False_Del_False_HomoOnly_True/G2G_Obj.rds')
# OUT_DIR <- '../results/Burden_False_SIFT_False_Del_False_HomoOnly_True/'
# tool <- 'PLINK'
# n_PC <- 3
# n_pPC <- 0
# covars_to_incl <- c('patient_sex')
# if(covars_to_incl == 'NA'){
#   covars_to_incl <- c()
# }else{
#   covars_to_incl <- strsplit(covars_to_incl,split = ',')[[1]]
# }
# stratified <- F
# homo_only <- T
# n_cores <- 10
# pheno <- 'TB_score'

RunInteraction(G2G_Obj,OUT_DIR,
               tool = tool,
               n_cores = n_cores,n_PC = n_PC,n_pPC = n_pPC,
               covars_to_incl = covars_to_incl,
               debug = F,by_lineage = F,stratified=stratified,var_type = ifelse(homo_only,'homo','both'),pheno = pheno)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
library(dplyr)
library(parallel)

Map1KG <- function(bim_1KG,cur_lineage,scratch_dir){
  cur_lineage <- paste0('LINEAGE_',cur_lineage)

  if(!file.exists(glue::glue("{scratch_dir}{cur_lineage}/mapped_ids.txt"))){
    bim_1KG <- data.table::fread(bim_1KG,header = F)
    bim_1KG$V1 <- as.character(bim_1KG$V1)
    cur_host_bim <- data.table::fread(glue::glue("{scratch_dir}{cur_lineage}/TB_DAR_G2G.bim"),header = F)
    jned_file <- dplyr::left_join(cur_host_bim,bim_1KG,by=c('V1'='V1','V4'='V4','V5'='V5','V6'='V6'))

    data.table::fwrite(jned_file %>% dplyr::select(ID_Host=V2.x,ID_1KG=V2.y),sep = '\t',
                       file = glue::glue("{scratch_dir}{cur_lineage}/mapped_ids.txt"))

    cur_mapped_ids <- data.table::fread(glue::glue("{scratch_dir}{cur_lineage}/mapped_ids.txt"))

    ID_1KG <- cur_mapped_ids$ID_1KG
    ID_1KG[ID_1KG == ""] <- NA
    write(ID_1KG,glue::glue("{scratch_dir}{cur_lineage}/mapped_1KG_ids.txt"))
    remove(cur_mapped_ids)
    gc()

  }
}
RunPASCAL <- function(GWAS_file,variant,scratch_dir,results_dir,PASCAL_path,filt = T,gene_scoring = 'max',cur_lineage='ALL'){
  system(glue::glue("mkdir -p {results_dir}"))
  cur_lineage <- paste0('LINEAGE_',cur_lineage)

  #Write summary stats
  if(grepl(pattern = 'glm.logistic.hybrid.gz',x=GWAS_file)){
    system(glue::glue("zcat {GWAS_file} | awk '{{if(NR > 1) print $13}}' > {results_dir}/{variant}.tmp"))
  }else{
    system(glue::glue("zcat {GWAS_file} | grep 'ADDx' | awk '{{print $12}}' > {results_dir}/{variant}.tmp"))
  }
  system(glue::glue("paste {scratch_dir}{cur_lineage}/mapped_1KG_ids.txt {results_dir}/{variant}.tmp | grep -v 'NA' > {results_dir}/{variant}.pascal"))
  system(glue::glue("pigz --fast {results_dir}/{variant}.pascal"))
  system(glue::glue("rm {results_dir}/{variant}.tmp"))

  #Run PASCAL
  cur_wd = getwd()
  setwd(PASCAL_path)
  if(gene_scoring == 'max'){
    system(glue::glue("./Pascal --pval {results_dir}{variant}.pascal.gz --outdir={results_dir} --customdir=./resources/1kg_AFR/ --custom=AFR --runpathway=on --genescoring=max > {results_dir}{variant}.log"))
  }else if(gene_scoring == 'sum'){
    system(glue::glue("./Pascal --pval {results_dir}{variant}.pascal.gz --outdir={results_dir} --customdir=./resources/1kg_AFR/ --custom=AFR --runpathway=on --genescoring=sum  > {results_dir}{variant}.log"))
  }else{
    stop('gene_scoring misspecified')
  }
  setwd(cur_wd)

}

args <- commandArgs(trailingOnly = T)
gwas_file <- args[[1]]
variant <- args[[2]]
bim_1KG <- args[[3]]
scratch_dir <- args[[4]]
Map1KG(bim_1KG,'ALL',scratch_dir)
results_dir <- args[[5]]
pascal_path <- args[[6]]
RunPASCAL(gwas_file,variant,scratch_dir,results_dir,pascal_path,filt = T)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
library(dplyr)
library(ape)
library(phylobase)
library(adephylo)

#Minor allele count of a TB variant
GetMAC <- function(AA_Matrix_filt){
  MAC <- apply(AA_Matrix_filt,2,function(x) {
    no_na <- x[!is.na(x)]
    if(length(no_na) == 0){
      return(0)
    }
    no_na_binary <- ifelse(no_na == 0,0,1) #Set to binary matrix (0 for absent, 1 more present - either homo or hetero call or burden)
    return(ifelse(length(unique(no_na_binary)) != 1,min(table(no_na_binary)),length(no_na_binary) - min(table(no_na_binary))))
  })
  return(MAC)
}
#Phylogenetic PC 
ConstructpPCA <- function(AA_Table,Phylo_Tree_Path,ID_Mapping_df,pPCA = T,mac_thresh = 5){
  tree <- ape::read.tree(Phylo_Tree_Path)
  ID_Mapping_df_filt <- dplyr::filter(ID_Mapping_df,G_NUMBER %in% tree$tip.label & !is.na(LINEAGE))

  #Separate based on LINEAGES
  unique_lineages <- sort(unique(ID_Mapping_df_filt$LINEAGE))
  unique_lineages <- c(unique_lineages,unlist(sapply(2:(length(unique_lineages)-1),function(x) apply(combn(unique_lineages,x),2,function(y) paste0(sort(y),collapse = '_')))))
  lineage_samples <- lapply(unique_lineages,function(x) dplyr::filter(ID_Mapping_df_filt,LINEAGE %in% strsplit(x,split = '_')[[1]])$G_NUMBER)
  #Add in index which cover all LINEAGES
  lineage_samples <- c(list(ID_Mapping_df_filt$G_NUMBER),lineage_samples)
  names(lineage_samples) <- c('ALL',unique_lineages)

  vir_pca <- vector(mode = 'list',length = length(lineage_samples))
  names(vir_pca) <- c('ALL',unique_lineages)
  for(i in 1:length(lineage_samples)){
    tree_subset <-ape::keep.tip(tree, lineage_samples[[i]])
    AA_Table_filt <- AA_Table[tree_subset$tip.label,]
    MAC <- GetMAC(AA_Table_filt)

    if(pPCA){
      vir_4d <- phylobase::phylo4d(tree_subset, AA_Table_filt[,MAC > mac_thresh])
      vir_pca[[i]] <- adephylo::ppca(
        vir_4d,
        scale = FALSE,
        center = T,
        scannf = F,
        nfposi = 10,
        method = "Abouheif"
      )
    }else{
      vir_pca[[i]] <- prcomp(AA_Table_filt[,MAC > mac_thresh],center = T,scale = F)
    }
  }
  return(vir_pca)
}
#Filter AA variants according to MAC and Group variants according to lineage
#If a variant appears in multiple lineages, do unstratified analysis.
#If a variant appears in only 1 lineage, do stratified analysis.
FilterAAMatrix <- function(AA_Matrix,Lineage_Df,MAC_Thresh){
  #Keep samples where Host data exists
  AA_Matrix_filt <- AA_Matrix[Lineage_Df$ALL$G_NUMBER,,drop=FALSE]
  #Keep variants which pass unstratified MAC threshold
  MAC_AA_Matrix_filt <- GetMAC(AA_Matrix_filt)
  AA_Matrix_filt <- AA_Matrix_filt[,MAC_AA_Matrix_filt > MAC_Thresh,drop=FALSE]
  #Remove Missing Variants/Genes
  AA_Matrix_filt <- AA_Matrix_filt[,!apply(AA_Matrix_filt,2,function(x) all(is.na(x))),drop=FALSE]

  #Assign variants to lineages
  strat_table <- lapply(1:ncol(AA_Matrix_filt),function(i) table(ifelse(AA_Matrix_filt[,i]==0,0,1),Lineage_Df$ALL$LINEAGE))
  strat_assng <- vector(mode = 'character',length = length(strat_table))
  for(i in 1:length(strat_table)){
    cur_strat <- strat_table[[i]]
    MAC_by_lineage <- apply(cur_strat,2,function(x) min(x,na.rm = T))
    #If a variant is perfectly stratified by lineage and there are no variability within a lineage, exclude from analysis
    if(sum(MAC_by_lineage > MAC_Thresh) == 0){
      strat_assng[i] <- NA
      #If a variant only show variability in a single lineage, assign it to that lineage
    }else if (sum(MAC_by_lineage > MAC_Thresh) == 1){
      strat_assng[i] <- names(MAC_by_lineage)[which(MAC_by_lineage > MAC_Thresh)]
    }else{
      strat_assng[i] <- paste0(sort(names(MAC_by_lineage)[which(MAC_by_lineage > MAC_Thresh)]),collapse = '_')
      if(strat_assng[i] == 'L1_L2_L3_L4'){
        strat_assng[i] <- 'ALL'
      }
    }
  }

  #Construct AA Matrix for each lineage
  uniq_assgn <- sort(unique(strat_assng))
  uniq_assgn <- uniq_assgn[!is.na(uniq_assgn)]

  #Exclude ALL lineage (Will be run seperately anyways)
  #uniq_assgn <- uniq_assgn[uniq_assgn != paste0(sort(unique(Lineage_Df$ALL$LINEAGE)),collapse = '_')] 
  #For each gene, assign to lineage groups
  AA_Matrix_filt_by_lineage <- lapply(uniq_assgn,function(x) AA_Matrix_filt[Lineage_Df[[x]]$G_NUMBER,which(strat_assng == x),drop=FALSE])
  names(AA_Matrix_filt_by_lineage) <- uniq_assgn

  return(AA_Matrix_filt_by_lineage)
}

SetUpG2G <- function(OUT_DIR,Metadata_Path,Host_PC_Path,Var_Tbl_Path,Phylo_Tree_Path,Mtb_Nuc_Out,Host_MAF,Pathogen_MAC_pPCA,Pathogen_MAC_AA_Lineage,Pathogen_MAC_AA,Pathogen_Missing,BFILE_Path,Host_Files,n_cores){

  #Get metadata file
  parsed_mapping <- data.table::fread(Metadata_Path) %>% dplyr::select(PATIENT_ID,G_NUMBER,LINEAGE=Lineage)

  #Get nucleotide variant matrix
  nuc_matrix <- data.table::fread(Mtb_Nuc_Out)
  colnames(nuc_matrix) <- sapply(colnames(nuc_matrix),function(x) strsplit(x=x,split = ']')[[1]][2])
  nuc_matrix_trans <- nuc_matrix[,-c('CHROM','POS','REF','ALT')]
  nuc_matrix_trans <- t(as.matrix(nuc_matrix_trans))
  nuc_matrix_trans[nuc_matrix_trans==2] <- 1
  rownames(nuc_matrix_trans) <- colnames(nuc_matrix[,-c('CHROM','POS','REF','ALT')])
  colnames(nuc_matrix_trans) <- paste0(nuc_matrix$POS,'_',nuc_matrix$REF,'_',nuc_matrix$ALT)

  #Calculate Mtb pPCs
  vir_pPCA <- ConstructpPCA(nuc_matrix_trans,Phylo_Tree_Path,parsed_mapping,Pathogen_MAC_pPCA)
  vir_pPCs <- lapply(vir_pPCA,function(x) cbind(G_NUMBER=rownames(x$li),as.data.frame(x$li)) %>% dplyr::left_join(parsed_mapping))

  #Get consensus samples 
  host_IDs <- data.table::fread(glue::glue("{BFILE_Path}.fam"))$V2
  host_IDs_simple <- sapply(host_IDs,function(x) gsub(x=x,pattern = 'WGS_Fellay.',replacement = ''))
  host_IDs_simple <- sapply(host_IDs_simple,function(x) gsub(x=x,pattern = 'Batch1_',replacement = ''))
  host_IDs_simple <- sapply(host_IDs_simple,function(x) gsub(x=x,pattern = 'Batch2_',replacement = ''))

  #Initialize variables to store
  both_IDs_to_keep <- vector(mode = 'list',length = length(vir_pPCs)); names(both_IDs_to_keep) <- names(vir_pPCs)
  host_PCs <- vector(mode = 'list',length = length(vir_pPCs)); names(host_PCs) <- names(vir_pPCs)
  covars <- vector(mode = 'list',length = length(vir_pPCs)); names(covars) <- names(vir_pPCs)

  #Get Host PCs
  host_PCs_raw <- data.table::fread(Host_PC_Path)

  #Get Covars
  covars_discrete <- data.table::fread(glue::glue("{Host_Files}/covars_discrete"),sep = '\t',na.strings = c("",'unknown')) 
  covars_numeric <- data.table::fread(glue::glue("{Host_Files}/covars_numeric"),sep = '\t',na.strings = c("",'unknown')) 
  covars_raw <- dplyr::inner_join(covars_discrete,covars_numeric)

  #Loop through lineages, store PLINK files, covars, and host PCs
  for(i in 1:length(vir_pPCs)){
    system(glue::glue('mkdir -p {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/'))
    pathogen_IDs_to_keep <- dplyr::filter(parsed_mapping,G_NUMBER %in% vir_pPCs[[i]]$G_NUMBER)
    both_IDs_to_keep[[i]] <- dplyr::filter(pathogen_IDs_to_keep,PATIENT_ID %in% host_IDs_simple)
    both_IDs_to_keep[[i]]$FAM_ID <- host_IDs[match(both_IDs_to_keep[[i]]$PATIENT_ID,host_IDs_simple)]
    data.table::fwrite(data.frame(FID=both_IDs_to_keep[[i]]$FAM_ID,IID=both_IDs_to_keep[[i]]$FAM_ID),
                       file = glue::glue("{OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt"),sep = '\t',quote = F)

    #Write out PLINK files with samples to keep
    system(
      glue::glue(
        "~/Software/plink2 --bfile {BFILE_Path} --keep {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --indiv-sort f {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --make-bed --out {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_G2G_Raw"
      )
    )

    #Apply MAF Filter
    system(
      glue::glue(
        "~/Software/plink2 --bfile {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_G2G_Raw --maf {Host_MAF} --indiv-sort f {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --make-bed --out {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_G2G"
      )
    )

    #Write out HLA PLINK files with samples to keep
    system(
      glue::glue(
        "~/Software/plink2 --bfile {Host_Files}/TB_DAR_HLA_Alleles --keep {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --indiv-sort f {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --make-bed --out {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_HLA_Alleles_G2G"
      )
    )
    system(
      glue::glue(
        "~/Software/plink2 --bfile {Host_Files}/TB_DAR_HLA_AA --keep {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --indiv-sort f {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/G2G_Samples.txt --make-bed --out {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_HLA_AA_G2G"
      )
    )

    system(glue::glue("rm {OUT_DIR}/LINEAGE_{names(vir_pPCs)[i]}/TB_DAR_G2G_Raw*"))
    #Order and filter host PCs
    host_PCs[[i]] <- dplyr::left_join(data.frame('IID' = both_IDs_to_keep[[i]]$FAM_ID),host_PCs_raw[,-1],by=c('IID'='V2'))
    colnames(host_PCs[[i]]) <- c('IID',paste0('PC',seq(1,ncol(host_PCs[[i]])-1,1)))

    #Order and filter pPCs
    vir_pPCs[[i]] <- dplyr::left_join(data.frame(G_NUMBER=both_IDs_to_keep[[i]]$G_NUMBER),vir_pPCs[[i]])

    #Order and filter covars
    covars[[i]] <- dplyr::left_join(data.frame(IID=both_IDs_to_keep[[i]]$FAM_ID),covars_raw) %>% dplyr::select(-contains("PC"))
  }
  #Unstratified AA matrix
  aa_matrix <- readRDS(Var_Tbl_Path)
  if(grepl(pattern = 'Burden_True',x=OUT_DIR)){
    aa_matrix <- aa_matrix$non_syn_burden
  }
  aa_matrix_full <- aa_matrix[both_IDs_to_keep[['ALL']]$G_NUMBER,,drop=FALSE]
  aa_matrix_full<- aa_matrix_full[,GetMAC(aa_matrix_full) > Pathogen_MAC_AA,drop=FALSE]

  #Filter by missingness
  aa_missingness <- apply(aa_matrix_full,2,function(x) sum(is.na(x)) / nrow(aa_matrix_full))
  aa_matrix_full<- aa_matrix_full[,aa_missingness < Pathogen_Missing,drop=FALSE]


  #Filter AA Matrix, decide for each variant whether to do stratified or stratified analysis
  aa_matrix_filt <- FilterAAMatrix(aa_matrix_full,both_IDs_to_keep,MAC_Thresh = Pathogen_MAC_AA_Lineage) 

  #Filter AA Matrix based on MAC per lineage
  # aa_matrix_mac_by_lineage <- sapply(1:ncol(aa_matrix_full),function(x) max(apply(table(aa_matrix_full[,x],both_IDs_to_keep[['ALL']]$LINEAGE),2,min)))
  aa_matrix_mac_by_lineage <- sapply(1:ncol(aa_matrix_full),function(x) max(sapply(unique(both_IDs_to_keep[['ALL']]$LINEAGE),
                                                                                   function(y) GetMAC(aa_matrix_full[both_IDs_to_keep[['ALL']]$LINEAGE == y,x,drop=F]))))
  aa_matrix_full <- aa_matrix_full[,aa_matrix_mac_by_lineage > Pathogen_MAC_AA]

  #Path to VCF file (for each lineage)
  host_path <- glue::glue("{OUT_DIR}/LINEAGE_{c(names(vir_pPCs),'ALL')}/TB_DAR_G2G")
  names(host_path) <- c(names(vir_pPCs),'ALL')

  #Load TB Score
  tb_score_df <- data.table::fread(glue::glue("{Host_Files}/TB_score"))
  tb_score_by_lineage <- lapply(both_IDs_to_keep,function(x) dplyr::left_join(x %>% dplyr::select(IID=FAM_ID),tb_score_df,c('IID'='IID')) %>% dplyr::relocate(FID,IID,TB_score))

  #Load X-Ray Score
  xray_score_df <- data.table::fread(glue::glue("{Host_Files}/Xray_score"))
  xray_score_by_lineage <- lapply(both_IDs_to_keep,function(x) dplyr::left_join(x %>% dplyr::select(IID=FAM_ID),xray_score_df,c('IID'='IID')) %>% dplyr::relocate(FID,IID,Xray_score))

  return(list(host_PCs=host_PCs,
              vir_pPCs=vir_pPCs,
              covars = covars,
              tb_score=tb_score_by_lineage,
              xray_score = xray_score_by_lineage,
              aa_matrix_filt=aa_matrix_filt,
              aa_matrix_full=aa_matrix_full,
              nuc_matrix = nuc_matrix_trans,
              both_IDs_to_keep=both_IDs_to_keep,
              host_path = host_path,
              raw_pPCA = vir_pPCA))
}


args <- commandArgs(trailingOnly = TRUE)
OUT_DIR <- args[[1]]
Metadata_Path <- args[[2]]
Host_PC_Path <- args[[3]]
Var_Tbl_Path <- args[[4]]
Phylo_Tree_Path <- args[[5]]
Mtb_Nuc_Out <- args[[6]]
Host_MAF <- as.numeric(args[[7]])
Pathogen_MAC_pPCA <- as.numeric(args[[8]])
Pathogen_MAC_AA_Lineage <- as.numeric(args[[9]])
Pathogen_MAC_AA <- as.numeric(args[[10]])
Pathogen_Missing <- as.numeric(args[[11]])
BFILE_Path <- gsub(args[[12]],pattern = '.bed',replacement = '')
Host_Files <- args[[13]]
n_cores <- as.numeric(args[[14]])

# OUT_DIR <- '../../scratch/Burden_False_SIFT_False_Del_False_HomoOnly_True_HetThresh_10/'
# Metadata_Path <- '../../data/pheno/metadata_Sinergia_final_dataset_human_bac_genome_available_QCed.txt'
# Host_PC_Path <- '../../scratch/Host/TB_DAR_GWAS_PCA.eigenvec'
# Var_Tbl_Path <- '../../scratch/Burden_False_SIFT_False_Del_False_HomoOnly_True_HetThresh_10/Mtb_Var_Tbl.rds'
# Phylo_Tree_Path <- '../../data/Mtb/RAxML_bestTree.Sinergia_final_dataset_human_bac_genome_available_rerooted.nwk'
# Mtb_Nuc_Out <- '../../data/Mtb/merged/merged.var.homo.SNPs.vcf.dosage'
# Host_MAF <- 0.05
# Pathogen_MAC_pPCA <- 15
# Pathogen_MAC_AA_Lineage <- 15
# Pathogen_MAC_AA <- 15
# Pathogen_Missing <- 0.05
# BFILE_Path <- '../../data/Genotyping_WGS/TBDAR.WGS.Imputed.GWASReady'
# Host_Files <- '../../scratch/Host/'
# n_cores <- 1

G2G_Obj <- SetUpG2G(OUT_DIR=OUT_DIR,Metadata_Path=Metadata_Path,Host_PC_Path=Host_PC_Path,Var_Tbl_Path=Var_Tbl_Path,Phylo_Tree_Path=Phylo_Tree_Path,Mtb_Nuc_Out=Mtb_Nuc_Out,Host_MAF=Host_MAF,Pathogen_MAC_pPCA=Pathogen_MAC_pPCA,Pathogen_MAC_AA_Lineage=Pathogen_MAC_AA_Lineage,Pathogen_MAC_AA=Pathogen_MAC_AA,Pathogen_Missing=Pathogen_Missing,BFILE_Path=BFILE_Path,Host_Files=Host_Files,n_cores=n_cores)
saveRDS(G2G_Obj,glue::glue("{OUT_DIR}/G2G_Obj.rds"))
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
library(dplyr)
library(tidyr)
library(dummies)
library(data.table)

SetUpHost <- function(Metadata_Path,BFILE_Path,Out_Path,excl_regions,covars_discrete_to_incl = c('patient_sex','HIV_status'),covars_numeric_to_incl = c('age','BMI'),pheno_name = c('TB_score','Xray_score'),n_PC = 3,not_chr_6 = T,n_cores){
  system(glue::glue("mkdir -p {Out_Path}"))
  system(glue::glue("mkdir -p {Out_Path}/Host/"))

  #File for PCA (Prune and remove long-range LD)
  system(
    glue::glue(
      "plink2 --bfile {BFILE_Path} --threads {n_cores} --keep-allele-order --new-id-max-allele-len 50 --set-all-var-ids @:#[b37]\\$r,\\$a --rm-dup force-first --make-bed --out {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp"
    )
  )

  system(
    glue::glue(
      "plink2 --bfile {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp --threads {n_cores} --keep-allele-order --indep-pairwise 200 100 0.2 --out {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp"
    )
  )

  if(not_chr_6){
    system(
      glue::glue(
        "plink2 --bfile {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp --threads {n_cores} --exclude range {excl_regions} --not-chr 6 --keep-allele-order --extract {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp.prune.in --make-bed --out {Out_Path}/Host/TB_DAR_GWAS_PCA"
      )
    )

  }else{
    system(
      glue::glue(
        "plink2 --bfile {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp --threads {n_cores} --exclude range {excl_regions} --keep-allele-order --extract {Out_Path}/Host/TB_DAR_GWAS_PCA_tmp.prune.in --make-bed --out {Out_Path}/Host/TB_DAR_GWAS_PCA"
      )
    )

  }

  ### Calculate PCs ###
  system(
    glue::glue(
      "gcta64 --bfile {BFILE_Path} --make-grm --out {Out_Path}/Host/TB_DAR_GWAS --thread-num {n_cores}"
    )
  )

  system(
    glue::glue(
      "gcta64 --bfile {Out_Path}/Host/TB_DAR_GWAS_PCA --make-grm --out {Out_Path}/Host/TB_DAR_GWAS_PCA --thread-num {n_cores}"
    )
  )
  system(
    glue::glue(
      "gcta64 --grm {Out_Path}/Host/TB_DAR_GWAS_PCA --pca 20 --out {Out_Path}/Host/TB_DAR_GWAS_PCA --thread-num {n_cores}"
    )
  )

  ### Set-up Phenotype ###
  #Load metadata file
  metadata <- data.table::fread(Metadata_Path,na.strings = c("",'unknown','NA'),stringsAsFactors = T) %>%
    dplyr::select(any_of(c('PATIENT_ID',covars_discrete_to_incl,covars_numeric_to_incl))) %>% dplyr::mutate(PATIENT_ID = as.character(PATIENT_ID)) 

  #Parse sample ID
  samples_for_GWAS <- data.table::fread(glue::glue("{BFILE_Path}.fam")) %>% dplyr::select(IID=V2)
  samples_for_GWAS_simple <- sapply(samples_for_GWAS$IID,function(x) gsub(x=x,pattern = 'WGS_Fellay.',replacement = ''))
  samples_for_GWAS_simple <- sapply(samples_for_GWAS_simple,function(x) gsub(x=x,pattern = 'Batch1_',replacement = ''))
  samples_for_GWAS_simple <- sapply(samples_for_GWAS_simple,function(x) gsub(x=x,pattern = 'Batch2_',replacement = ''))

  if(is.null(covars_discrete_to_incl)){
    covars_discrete <- NULL
  }else{
    covars_discrete <- dplyr::left_join(data.frame(ID = samples_for_GWAS_simple,stringsAsFactors = F),metadata %>% 
                                          dplyr::select(any_of(c('PATIENT_ID',covars_discrete_to_incl))),by=c('ID'='PATIENT_ID')) %>% dplyr::select(-ID)
  }

  if(is.null(covars_numeric_to_incl)){
    covars_numeric <- NULL
  }else{
    covars_numeric <- dplyr::left_join(data.frame(ID = samples_for_GWAS_simple,stringsAsFactors = F),metadata %>% 
                                         dplyr::select(any_of(c('PATIENT_ID',covars_numeric_to_incl))),by=c('ID'='PATIENT_ID')) %>% dplyr::select(-ID)

  }

  PCs <- data.table::fread(glue::glue("{Out_Path}/Host/TB_DAR_GWAS_PCA.eigenvec"))
  colnames(PCs) <- c('FID','IID',paste0('PC',seq(1,ncol(PCs) - 2)))

  #Top N PCs and numeric covars
  data.table::fwrite(cbind(PCs[,1:(2+n_PC)],covars_numeric),sep = '\t',col.names = T,na = 'NA',file = glue::glue("{Out_Path}/Host/covars_numeric"))

  #discrete covars
  data.table::fwrite(cbind(PCs[,1:2],covars_discrete),sep = '\t',col.names = T,na = 'NA',file = glue::glue("{Out_Path}/Host/covars_discrete"))

  for(i in 1:length(pheno_name)){
    metadata_pheno <- data.table::fread(Metadata_Path,na.strings = c("",'unknown'),stringsAsFactors = T) %>%
      dplyr::select(any_of(c('PATIENT_ID',pheno_name[i]))) %>% dplyr::mutate(PATIENT_ID = as.character(PATIENT_ID)) %>% 
      tidyr::drop_na()

    #Join with pheno table, extract relevant covars
    pheno_vect <- dplyr::left_join(data.frame(ID = samples_for_GWAS_simple,stringsAsFactors = F),metadata_pheno %>% dplyr::select(PATIENT_ID,any_of(pheno_name[i])),by=c('ID'='PATIENT_ID'))[,pheno_name[i],drop=T]

    #pheno 
    if(pheno_name[i] == 'Lineage'){
      pheno_df <- cbind(PCs[,1:2],data.frame(Lineage = pheno_vect)) %>% tidyr::drop_na()
      pheno_df <- cbind(pheno_df[,1:2],dummy.data.frame(pheno_df[,-c(1,2)]))
    }else if(pheno_name[i] == 'Sublineage'){
      pheno_df <- cbind(PCs[,1:2],data.frame(Lineage = pheno_vect)) %>% tidyr::drop_na()
      dummy_data <- dummy.data.frame(pheno_df[,-c(1,2)])
      #Filter Sublineage with less than 30 counts
      cnts <- apply(dummy_data,2,function(x) min(c(sum(x==1),sum(x==0))))
      dummy_data <- dummy_data[,cnts > 30]
      pheno_df <- cbind(pheno_df[,1:2],dummy_data)
    }
    else{
      pheno_df <- cbind(PCs[,1:2],data.frame(pheno=pheno_vect))
      colnames(pheno_df)[3] <- pheno_name[i]
    }
    data.table::fwrite(pheno_df,sep = ' ',col.names = T,na = 'NA',file = glue::glue("{Out_Path}/Host/{pheno_name[i]}"),quote = F)
  }

}


SetUpHLA <- function(VCF_File,Out_Path,MAF_Thresh = 0.01,Info_Thresh = 0.8){
  info_file <- data.table::fread(cmd = glue::glue("zcat {gsub(VCF_File,pattern = '.dose.vcf.gz',replacement = '.info.gz')}"))

  hla_alleles <- dplyr::filter(info_file,Rsq > Info_Thresh & grepl('HLA',SNP)) %>% dplyr::select(SNP)
  hla_AA <- dplyr::filter(info_file,Rsq > Info_Thresh & grepl('AA',SNP)) %>% dplyr::select(SNP)

  write(hla_AA$SNP,file = glue::glue("{Out_Path}Host/AA_to_keep.txt"))
  write(hla_alleles$SNP,file = glue::glue("{Out_Path}Host/Alleles_to_keep.txt"))

  system(glue::glue("plink2 --vcf {VCF_File} --extract {Out_Path}Host/AA_to_keep.txt --maf {MAF_Thresh} --make-bed --double-id --out {Out_Path}Host/TB_DAR_HLA_AA"))
  system(glue::glue("plink2 --vcf {VCF_File} --extract {Out_Path}Host/Alleles_to_keep.txt --maf {MAF_Thresh} --make-bed --double-id --out {Out_Path}Host/TB_DAR_HLA_Alleles"))

}

# Metadata_Path <- '../../data/pheno/metadata_Sinergia_final_dataset_human_bac_genome_available_QCed.txt'
# BFILE_Path <- '../../data/Genotyping_WGS/TBDAR.WGS.Imputed.GWASReady'
# HLA_Path <-'../../data/HLA/FourDigits/chr6.dose.vcf.gz'
# OUT_dir <- '../../scratch/'
# excl_regions <- '../../data/exclusion_regions_hg19.txt'
# n_cores <- 1


args <- commandArgs(trailingOnly = TRUE)
Metadata_Path <- args[[1]]
BFILE_Path <- gsub(args[[2]],pattern = '.bed',replacement = '')
HLA_Path <- args[[3]]
OUT_dir <- args[[4]]
excl_regions <- args[[5]]
n_cores <- as.numeric(args[[6]])

SetUpHost(Metadata_Path,BFILE_Path,OUT_dir,excl_regions=excl_regions,covars_discrete_to_incl = c('patient_sex','HIV_status','TB_RF_smoking'),covars_numeric_to_incl = c('age','BMI'),n_cores=n_cores)
SetUpHLA(VCF_File = HLA_Path,Out_Path = OUT_dir)
32
33
34
35
36
37
38
39
40
41
42
43
44
shell:
  '''
    bcftools filter -i 'GT=="1/1"' -O z -o {output.homo_snps} {input.VCF_file} #Keep both homo calls only.
    bcftools index -t  {output.homo_snps}

    bcftools filter -i 'GT=="1/1" || GT=="0/1" || GT=="1/0"' -O z -o {output.homo_het_snps} {input.VCF_file} #Keep both homo and mixed calls.  
    bcftools index -t  {output.homo_het_snps}

    bcftools filter -i 'GT=="./." || GT=="0/1" || GT=="1/0"' {input.VCF_file} | bcftools query -f '%POS %REF %ALT\n' > {output.missing_snps_homo} 

    bcftools filter -i 'GT=="./."' {input.VCF_file} | bcftools query -f '%POS %REF %ALT\n' > {output.missing_snps_homo_het} 
    bcftools query -f '%POS %REF %ALT [%AD] [%RD] [%SDP] [%DP] [%RBQ] [%ABQ] [%RDF] [%RDR] [%ADF] [%ADR] [%FREQ] [%GT] [%GQ]\n' {input.VCF_file} | pigz --fast > {output.reads}
  '''
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
shell:
  '''
    bcftools merge --threads {threads} -0 -m none {input.homo_snps} -O v -o {output.homo_snps}.tmp
    bedtools subtract -header -A -a {output.homo_snps}.tmp -b {input.locus_to_excl} > {output.homo_snps}
    bgzip -c {output.homo_snps} > {output.homo_snps}.gz
    bcftools index -t {output.homo_snps}.gz
    rm {output.homo_snps}.tmp
    bcftools +dosage {output.homo_snps}.gz > {output.homo_snps}.dosage

    bcftools merge --threads {threads} -0 -m none {input.homo_het_snps} -O v -o {output.homo_het_snps}.tmp
    bedtools subtract -header -A -a {output.homo_het_snps}.tmp -b {input.locus_to_excl} > {output.homo_het_snps}
    bgzip -c {output.homo_het_snps} > {output.homo_het_snps}.gz
    bcftools index -t {output.homo_het_snps}.gz
    rm {output.homo_het_snps}.tmp
    bcftools +dosage {output.homo_het_snps}.gz > {output.homo_het_snps}.dosage
  '''
 99
100
shell:
    "Rscript ./scripts/GetCoverage.R {input.VCF_Files} {input.missing_files} {input.gff} {params.out_dir} {threads}"
122
123
124
125
shell:
  '''
      Rscript ./scripts/MakeAATable.R {input.snps} {input.locus_to_excl} {params.snpEff} {wildcards.id} {output.AA_variant} {output.Syn_variant}
  '''
147
148
shell:
  "Rscript ./scripts/MakeVariantMatrix.R {input.AA_variant} {input.Syn_variant} {input.missing_mat} {input.phylo_snps} {input.del_tbl} {input.sift_path} {params.IS_SIFT} {params.IS_Burden} {params.IS_Deletion} {output.Mtb_Var_Tbl} {params.metadata} {params.Het_Thresh} {threads}"
172
173
174
175
shell:
  '''
    Rscript ./scripts/SetUpHost.R {params.metadata} {input.host_bfile} {input.host_hla_vcf} {params.Out_DIR} {params.excl_region} {threads}
  '''
199
200
shell:
  "Rscript ./scripts/SetUpG2G.R {params.Out_DIR} {params.metadata} {input.host_pc} {input.Mtb_Var_Tbl} {params.Phylo_Tree_Path} {input.merged_dosage} {params.Host_MAF} {params.Pathogen_MAC_pPCA} {params.Pathogen_MAC_AA_Lineage} {params.Pathogen_MAC_AA} {params.Pathogen_Missing} {input.host_bfile} {params.Host_Files} {threads}"
220
221
shell:
  " Rscript ./scripts/RunG2G.R {input.g2g_obj} {params.out_dir} {params.tool} {params.N_PC} {params.N_pPC} {params.covars_to_incl} {params.stratified} {params.homo_only} {threads}"
242
243
shell:
  "Rscript ./scripts/RunInteraction.R {input.g2g_obj} {params.out_dir} {params.tool} {params.N_PC} {params.N_pPC} {params.covars_to_incl} {params.stratified} {params.homo_only} {params.pheno} {threads}"
258
259
shell:
    "Rscript ./scripts/RunPASCAL.R {input.res_obj} {input.bim_1KG} {params.scratch_dir} {params.output}"
ShowHide 12 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/zmx21/G2G-TB
Name: g2g-tb
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Accessed: 1
Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...