A snakemake pipeline to generate the transcriptomic signatures of drug susceptibility used in beyondcell

public public 1yr ago 0 bookmarks

The purpose of this repository and the snakemake pipeline associated with it is to regenerate the whole collection of transcriptomics signatures.

This collection is currently featured at the following BU's tools:

  1. Beyondcell

Code Snippets

13
14
script:
    "../scripts/common_annotate_lines.py"
32
33
script:
    "../scripts/prism_raw_counts_from_expected_counts.R"
22
23
script:
    "../scripts/ctrp_generate_curves.R"
38
39
script:
    "../scripts/ctrp_generate_ebayes_model.R"
57
58
script:
    "../scripts/ctrp_signature_from_ebayes.R"
75
76
script:
    "../scripts/ctrp_signature_from_ebayes.R"
17
18
script:
    "../scripts/ctrp_generate_drug_db.R"
38
39
script:
    "../scripts/gdsc_generate_drug_db.R"
59
60
script:
    "../scripts/gdsc_generate_drug_db.R"
78
79
script:
    "../scripts/prism_generate_drug_db.R"
 99
100
script:
    "../scripts/common_build_drug_db.R"
115
116
script:
    "../scripts/dependencies_enrichment_from_ebayes.R"
131
132
script:
    "../scripts/dependencies_drug_enrichment_from_ebayes.R"
16
17
script:
    "../scripts/dependencies_annotate_dependencies.R"
34
35
script:
    "../scripts/dependencies_generate_ebayes_model.R"
51
52
script:
    "../scripts/dependency_signature_from_ebayes.R"
12
13
script:
    "../scripts/gdsc_download_raw_files.R"
29
30
script:
    "../scripts/gdsc_normalize_arrays.R"
50
51
script:
    "../scripts/gdsc_generate_curves.R"
66
67
script:
    "../scripts/gdsc_generate_ebayes_model.R"
83
84
script:
    "../scripts/probeID_hgnc_table.R"
105
106
script:
    "../scripts/gdsc_signature_from_ebayes.R"
126
127
script:
    "../scripts/gdsc_signature_from_ebayes.R"
20
21
script:
    "../scripts/gdsc_rna_generate_curves.R"
38
39
script:
    "../scripts/gdsc_rna_generate_ebayes_model.R"
59
60
script:
    "../scripts/gdsc_rna_signature_from_ebayes.R"
79
80
script:
    "../scripts/gdsc_rna_signature_from_ebayes.R"
17
18
script:
    "../scripts/metastasis_generate_models.R"
37
38
script:
    "../scripts/metastasis_generate_ebayes_model.R"
57
58
script:
    "../scripts/metastasis_generate_ebayes_model.R"
76
77
script:
    "../scripts/metastasis_geneset_from_ebayes.R"
95
96
script:
    "../scripts/metastasis_geneset_from_ebayes.R"
18
19
script:
    "../scripts/prism_generate_annotation.R"
36
37
script:
    "../scripts/prism_generate_ebayes_model.R"
59
60
script:
    "../scripts/prism_signature_from_ebayes.R"
79
80
script:
    "../scripts/prism_signature_from_ebayes.R"
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd


def main():

    ###  SNAKEMAKE I/O ###
    raw_celligner = snakemake.input["celligner_lines_data"]
    lines_info = snakemake.input["sample_info"]
    where_to_save = snakemake.output["cell_lines_annotation"]

    raw_celligner = pd.read_csv(
        raw_celligner,
        sep=",",
        usecols=["sampleID", "sampleID_CCLE_Name", "undifferentiated_cluster"],
    )

    ## Get rid of non ccle data
    is_line = raw_celligner["sampleID"].str.startswith("ACH-")

    undifferentiated_lines = raw_celligner.loc[
        (is_line & raw_celligner["undifferentiated_cluster"]), "sampleID"
    ]

    del raw_celligner

    ## Annotate lines data with data from the undifferentiated clusters
    lines_info = pd.read_csv(lines_info, sep=",")
    lines_info["is_undifferentiated"] = lines_info["DepMap_ID"].isin(
        undifferentiated_lines
    )

    # Preserve original lineage
    lines_info["original_lineage"] = lines_info["lineage"]

    # rename the lineages from the undif. lines to "undifferentiated"
    lines_info.loc[lines_info["is_undifferentiated"], "lineage"] = "undifferentiated"

    lines_info.to_csv(where_to_save, sep=",", index=False)


if __name__ == "__main__":
    main()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
library("tidyverse")

### SNAKEMAKE I/O ###
ctrp_db        <- snakemake@input[['ctrp_db']]
gdsc_db        <- snakemake@input[['gdsc_db']]
prism_db      <- snakemake@input[['prism_db']]
cell_line_annotation <- snakemake@input[['cell_line_annotation']]

db_save    <- snakemake@output[['full_database']]
db_save_rd <- snakemake@output[['full_database_rd']]

db_summary_save    <- snakemake@output[['drug_summary']]
db_summary_save_rd <- snakemake@output[['drug_summary_rd']]


prism <- read_csv(prism_db) %>%
    select(-cid) %>%
    mutate(project = "PRISM") %>%
    rename(drug_id = broad_id,
           cid = smiles
           )

ctrp  <- read_csv(ctrp_db) %>%
    mutate(project = "CTRP") %>%
    rename(drug_id = broad_id,
           cid = smile
           )

gdsc  <- read_csv(gdsc_db) %>%
    mutate(project = "GDSC") %>%
    rename(cid = pubchem) %>%
    mutate(drug_id = as.character(drug_id))

## Cell line annotation
cell_line_annotation <- read_csv(cell_line_annotation) %>%
    select(DepMap_ID,
           stripped_cell_line_name,
           original_lineage,
           lineage,
           lineage_subtype,
           primary_or_metastasis
           ) %>%
    rename(depmap_id = DepMap_ID,
           cell_line = stripped_cell_line_name,
           origin = original_lineage,
           origin_subtype = lineage_subtype,
           tumor_type = primary_or_metastasis
           )

## attempt to merge the data by outer join and add project 
combined_data <- prism %>%
    bind_rows(ctrp) %>%
    bind_rows(gdsc) %>%
    select(-lineage) %>%
    left_join(cell_line_annotation, "depmap_id") %>%
    rename(celligner_origin = lineage)


## generate summary
drug_summary <- combined_data %>%
    group_by(project, drug_id) %>%
    summarise(name = first(name),
              target = first(target),
              moa = first(moa),
              lines_tested = first(profiled_lines),
              min_auc = min(auc), 
              max_auc = max(auc),
              mean_auc = mean(auc),
              cid = cid
              ) %>%
    distinct()


write_csv(combined_data, file = db_save)
write_csv(drug_summary, file = db_summary_save)

save(combined_data, file = db_save_rd)    
save(drug_summary, file = db_summary_save_rd)    
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
library("tidyverse")

### SNAKEMAKE I/O ###
dose_response_curves           <- snakemake@input[["curves_data"]]
compound_meta                  <- snakemake@input[["compount_meta"]]
experiment_meta                <- snakemake@input[["experiment_meta"]]
cell_meta                      <- snakemake@input[["cell_meta"]]
cell_lines_annotation          <- snakemake@input[["cell_lines_annotation"]]
count_matrix                   <- snakemake@input[["rna_count_matrix"]]

auc_models_candidates          <- snakemake@output[["auc_models_candidates"]]
compounds_lines_profiled       <- snakemake@output[["compounds_lines_profiled"]]

# Setup log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

## Load tables
drug_data  <- data.table::fread(dose_response_curves) %>%
    as.data.frame()

drug_meta <- data.table::fread(compound_meta) %>%
    as.data.frame()

experiment_meta <- data.table::fread(experiment_meta) %>%
    as.data.frame()

# This table and the below one seem redundant, but CTRP data might be outdated.
# We are loading it ONLY to translate experiment_ids to cell line IDs to match
# to ccle_info
cell_meta <- data.table::fread(cell_meta) %>%
    as.data.frame()

ccle_info <- data.table::fread(cell_lines_annotation) %>%
    as.data.frame() ## I don"t like tibbles


## Load count matrix
ccle_counts <- readRDS(count_matrix)

## Add broad_cpd_id to compounds id
drug_data_annotated <- merge(x=drug_data,
                             y=drug_meta[,c("master_cpd_id",  "broad_cpd_id")],
                             by.x="master_cpd_id",
                             by.y = "master_cpd_id",
                             all.x=TRUE,
                             all.y=FALSE)


## create a named vector of master ccle ids
master_ids <- experiment_meta[!duplicated(experiment_meta$experiment_id), c("experiment_id", "master_ccl_id")] 

drug_data_annotated$master_ccl_id <- master_ids[drug_data_annotated$experiment_id, "master_ccl_id"]

## Use master_ccl_id to  annotated drug_data with ACH depmap ids
drug_data_annotated$human_ccl_name <- cell_meta[drug_data_annotated$master_ccl_id, "ccl_name"]

# get rid of experiments without ccl_name. These are collaborators lines and
# thus no RNA-Seq data is available.
drug_data_annotated_lines <- drug_data_annotated[!is.na(drug_data_annotated$human_ccl_name),]

# now get the ACH ids for the remaining lines
annotations_fields_of_interest <- c("DepMap_ID", "stripped_cell_line_name", "lineage", "is_undifferentiated")

drug_data_annotated_lines_depmap <- merge(x=drug_data_annotated_lines,
                                          y=ccle_info[,annotations_fields_of_interest],
                                          by.x="human_ccl_name",
                                          by.y="stripped_cell_line_name",
                                          all.x=TRUE)


## Get rid of lines withou ACH-id If somehow there is any missing left
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[!is.na(drug_data_annotated_lines_depmap$DepMap_ID),]

## Get rid of compounds without area_under_curve
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[!is.na(drug_data_annotated_lines_depmap$area_under_curve),]

## Get rid of cell lines and curves for which no RNASeq profiling was done
available_lines <- intersect(colnames(ccle_counts), drug_data_annotated_lines_depmap$DepMap_ID)
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[drug_data_annotated_lines_depmap$DepMap_ID %in% available_lines,]

# If there are duplicate cell lines by compound, keep the one with the lowest auc
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[order(drug_data_annotated_lines_depmap$area_under_curve),]
drug_line_duplicated             <- duplicated(drug_data_annotated_lines_depmap[,c("broad_cpd_id", "DepMap_ID")])
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[!drug_line_duplicated,]

## Get the nº. lines/compound
lines_and_compounds <- drug_data_annotated_lines_depmap %>%
    group_by(broad_cpd_id) %>%
    mutate(area_under_curve = as.numeric(area_under_curve)) %>%
    summarise(
        profiled_lines = n_distinct(DepMap_ID),
        cv = sd(area_under_curve) / mean(area_under_curve)
        ) %>%
    as.data.frame()

# Save this table. It is definitely useful
write.csv(lines_and_compounds, file = compounds_lines_profiled, row.names=FALSE)

## Keep compounds with at least 10 profiled lines
compounds_to_test <- lines_and_compounds %>%
    filter(profiled_lines >= 10 & cv >= 0.1) %>%
    arrange(desc(cv)) %>% 
    distinct(broad_cpd_id, .keep_all = TRUE) %>%
    pull(broad_cpd_id)

## set lineage if undiff
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap %>% 
    filter(broad_cpd_id %in% compounds_to_test)

drug_data_annotated_lines_depmap[drug_data_annotated_lines_depmap$is_undifferentiated, "lineage"] <- "undifferentiated"

drug_data_annotated_lines_depmap$lineage          <- as.factor(drug_data_annotated_lines_depmap$lineage)
drug_data_annotated_lines_depmap$area_under_curve <- as.numeric(drug_data_annotated_lines_depmap$area_under_curve)

if(!dir.exists(file.path(auc_models_candidates))){
    dir.create(auc_models_candidates)}

by(drug_data_annotated_lines_depmap,
   drug_data_annotated_lines_depmap$broad_cpd_id,
   FUN=function(i) write.csv(i, paste0(auc_models_candidates, "/", i$broad_cpd_id[1], ".csv"), , row.names=FALSE))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
library("tidyverse")


### SNAKEMAKE I/O ###
compound_data        <- snakemake@input[['compound_data']]
compound_meta        <- snakemake@input[['compound_meta']]
lines_compounds      <- snakemake@input[['lines_compounds']]

comma_file <- snakemake@output[['csv_db']]
rdata      <- snakemake@output[['rdata_db']]

full_table <- compound_data %>%
    map(read_csv) %>%
    reduce(bind_rows)

## Annotate how many lines were profiled by comp.
lines_compounds <- read_csv(lines_compounds) 

## Now keep columns of interest
# broad_id, name, auc, ic50, depmap_id, lineage, moa
filtered_data <- full_table %>%
    select(master_cpd_id,
           broad_cpd_id,
           area_under_curve,
           apparent_ec50_umol,
           DepMap_ID,
           lineage
           ) %>%
    left_join(y = lines_compounds, by = "broad_cpd_id") %>%
    rename(broad_id = broad_cpd_id,
           auc = area_under_curve,
           ec50 = apparent_ec50_umol,
           depmap_id = DepMap_ID)


## We need to annotate the moa for CRTP
compound_meta <- read_tsv(compound_meta) %>%
    select(master_cpd_id,
           cpd_name,
           gene_symbol_of_protein_target,
           target_or_activity_of_compound,
           cpd_smiles,
           ) %>%
    rename(
        name = cpd_name,
        target = gene_symbol_of_protein_target,
        moa = target_or_activity_of_compound,
        smile = cpd_smiles
        )

filtered_annotated_data <- filtered_data %>%
    left_join(compound_meta, by = "master_cpd_id") %>%
    select(-master_cpd_id)


write_csv(x = filtered_annotated_data, file = comma_file)
save(filtered_annotated_data, file = rdata)    
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
library('edgeR')
library('limma')

### SNAKEMAKE I/O ###
compound_to_test      <- snakemake@input[['compound_to_test']]
raw_gene_counts       <- snakemake@input[['raw_gene_counts']]

ebayes_model          <- snakemake@output[['ebayes']]

## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
compound_to_test      <- read.csv(compound_to_test)

# This might be the only key difference between this script and Dep/PRISM generation
compound_to_test$area_under_curve            <- as.numeric(compound_to_test$area_under_curve)
compound_to_test$lineage                     <- as.factor(compound_to_test$lineage)

## Subset the counts
lines_to_test <- compound_to_test$DepMap_ID
count_matrix  <- ccle_counts[,lines_to_test]

rownames(compound_to_test) <- compound_to_test$DepMap_ID

## voom model
design <- model.matrix(~lineage + area_under_curve, data=compound_to_test)

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[,rownames(design)]

dge  <- DGEList(counts=count_matrix)
keep <- filterByExpr(dge, design=design)

dge <- dge[keep, keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)

## TODO: save voom model plot
v <- voom(dge, plot=FALSE, normalize.method='quantile')

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
library('GSEABase')
library('limma')
library('dplyr')


## SNAKEMAKE I/O  ##

eBayes_model       <- snakemake@input[['fitted_bayes']]
drug_info          <- snakemake@input[['treatment_info']]

geneset_directory <- snakemake@output[['bidirectional_geneset']]

compound_id       <- snakemake@wildcards[['broad_id']]

## SNAKEMAKE PARAMS ##
signature_type    <- tolower(as.character(snakemake@params[['signature_type']]))

## If signature_type == 'Classic',
## then we'll take the top/bottom 250 genes sorted by T.statistics
## If signature_type == 'fold', we'll perform FDR + log fold selection

generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between S_genes (negative logfold) 
    ## and R genes (positive)
    if(signature_type=='classic'){
        sensitivity_genes <- deg_genes[deg_genes$t <0, 'ID']
        resistance_genes  <- deg_genes[deg_genes$t >0, 'ID']
    }else{
        sensitivity_genes <- deg_genes[deg_genes$logFC < 0, 'ID']
        resistance_genes  <- deg_genes[deg_genes$logFC > 0, 'ID']
    }
    if(length(sensitivity_genes) >= 15){
       candidate_genes <- extract_top_genes(deg_genes, 'sensitivity', signature_type=signature_type)
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, 'UP', sep='_')

       GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, '/', sig_name, '_UP', '.gmt', sep=''))
    }
    if(length(resistance_genes) >= 15){
        candidate_genes <- extract_top_genes(deg_genes, 'resistance', signature_type=signature_type)
        resistance_set <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, 'DOWN', sep='_')

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, '/', sig_name, '_DOWN', '.gmt', sep='')) 
    }
}

#TODO: This function here is extremely ugly. Refactor it
extract_top_genes <- function(deg_genes, mode='sensitivity', signature_type='classic'){

    ## Extract and sort the top hits for the selected direction/mode
    if(mode=='sensitivity'){

        if(signature_type=='classic'){
            deg_genes <- head(deg_genes[order(deg_genes$t), 'ID'], n=250)
        }else{
            deg_genes <- head(deg_genes[order(deg_genes$logFC), 'ID'], n=250)
        }
    }else{
        if(signature_type=='classic'){
            deg_genes <- head(deg_genes[order(-deg_genes$t), 'ID'], n=250)
        }else{
        deg_genes <- head(deg_genes[order(-deg_genes$logFC), 'ID'], n=250)
        }
    }
    return(deg_genes)
}


drug_info <- data.table::fread(drug_info) %>%
    as.data.frame()

drug_info <- drug_info[!duplicated(drug_info$broad_cpd_id),]

bayes_results <- readRDS(eBayes_model)

if(signature_type == 'classic'){
    all_genes <- topTable(bayes_results, coef='area_under_curve', number = Inf, adjust.method='fdr')
}else{ 
    ## Set a threshold of the adjusted p-value if signature_type is not classic
    all_genes <- topTable(bayes_results, coef = 'area_under_curve', 
                     number = Inf, adjust.method = 'fdr',
                     p.value=0.05)
}
# Set the genes as a column
all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}

## Check if we have at least 15 genes left after FDR-cutoff. Does not matter
## If both directions are < 15 individually, we'll account for that later.
## Just make sure we are not evaluating empty results 

if(nrow(all_genes) >= 15){

    common_name <- drug_info[drug_info$broad_cpd_id == compound_id, 'cpd_name']

    if(length(common_name) == 0){
        common_name <- compound_id}

    brd_split   <- strsplit(x=compound_id, split='-', fixed=TRUE)[[1]][2]

    # This code deals with combinations of drugs, to keep names short
    if(grepl(pattern = "mol/mol", x = common_name, fixed = TRUE)){
     common_name <- strsplit(x = common_name, split = " ")[[1]][1]}

    sig_name <- paste(sep='_', common_name,'CTRP', brd_split)

    generate_bidirectional_signature(sig_name, all_genes)

}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
library("tidyverse")

###  SNAKEMAKE I/O ###
raw_crispr_data <- snakemake@input[["crispr_gene_dependency_chronos"]]
lines_info <- snakemake@input[["sample_info"]]
ccle_raw_reads <- snakemake@input[["expression_matrix"]]
where_to_save <- snakemake@output[["model_candidates"]]

ccle_counts <- readRDS(ccle_raw_reads)
crispr <- data.table::fread(raw_crispr_data)
annot <-  data.table::fread(lines_info)

## keep lines whose primary disease is cancer AND with available expr.
annot_cancer <- annot %>%
    filter(
        !primary_disease %in% c("Unknown", "Non-Cancerous") 
    )

## drop lineages with too few CCLs
all_lineages <- table(annot_cancer$lineage)
lineages_to_keep <- names(all_lineages[all_lineages >= 5])

## get the names of the CCLs whose lineages are OK AND with RNA-seq
lines_to_keep <- annot_cancer %>%
    filter(
        lineage %in% lineages_to_keep
    ) %>%
    pull(DepMap_ID)

lines_to_keep <- intersect(lines_to_keep, colnames(ccle_counts))

## Filter crispr data
crispr_filtered <- crispr %>%
    filter(
        ModelID %in% lines_to_keep
    )

crispr_lines <- crispr_filtered$ModelID
crispr_filtered$ModelID <- NULL

# CCLE nomenclature is HUGO (ENTREZ). Keep HUGO ONLY
colnames(crispr_filtered) <- stringr::str_remove_all(string = colnames(crispr_filtered),
                                                     pattern = " \\(.*$"
                                                     )

## Resolve duplicates keeping most variable gene
crispr_filtered <- t(crispr_filtered)
colnames(crispr_filtered) <- crispr_lines

vars <- apply(crispr_filtered, 1, var)
crispr_filtered <- cbind(crispr_filtered, vars)
crispr_filtered <- crispr_filtered[order(crispr_filtered[, "vars"],decreasing=TRUE),]

crispr_filtered_unique <- crispr_filtered[!duplicated(rownames(crispr_filtered)), ]
crispr_filtered_unique <- crispr_filtered_unique[, colnames(crispr_filtered_unique) != "vars"]

## Melt the dataset
crispr_probabilities <- crispr_filtered_unique %>%
    as_tibble(rownames = "Gene") %>%
    pivot_longer(
        cols = starts_with("ACH-"),
        names_to = "cell_line",
        values_to = "probability"
        )


# Filter out genes with less than 5 cell lines dependant on them
crispr_enough_power <- crispr_probabilities %>%
    group_by(Gene) %>%
    mutate(dependent_lines = sum(probability >= 0.5)) %>%
    filter(
        dependent_lines >= 5
    )

# annotate the lineage and stripped name
crispr_enough_power_annotated <- crispr_enough_power %>%
    left_join(
        y = annot_cancer[, c("DepMap_ID",
                             "lineage",
                             "stripped_cell_line_name",
                             "original_lineage"
                             )
                         ],
        by = c("cell_line" = "DepMap_ID")
    )


if(!dir.exists(file.path(where_to_save))){
    dir.create(where_to_save)}

by(crispr_enough_power_annotated,
   crispr_enough_power_annotated$Gene,
   FUN=function(i) write.csv(i, paste0(where_to_save, "/", i$Gene[1], ".csv"), , row.names=FALSE))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
library("limma")
library("fgsea")

### SNAKEMAKE I/O ###
ebayes_model    <- snakemake@input[["ebayes_model"]]
geneset_collection <- snakemake@input[["drugs"]]
enrichment_table  <- snakemake@output[["enrichments"]]

## Load and set data types manually just in case
ebayes_model <- readRDS(ebayes_model)
geneset_collection <- fgsea::gmtPathways(geneset_collection)

## Subset the collection to keep UP drug genesets only
is_up_geneset <- grepl(pattern = "\\.*_UP", x = names(geneset_collection))
geneset_collection <- geneset_collection[is_up_geneset]

## generate a named vector of gene-level stats
## An actual ranking is not needed
t_rank <- topTable(
    fit = ebayes_model,
    coef = "probability",
    number = Inf,
    sort.by = "t"
    )

named_gene_rank <- t_rank$t
names(named_gene_rank) <- rownames(t_rank)

fgsea_res <- fgsea(
    pathways = geneset_collection,
    stats = named_gene_rank,
    minSize = 15,
    maxSize = 500,
    gseaParam = 1,
    scoreType = "pos"
)

fgsea_res$leadingEdge <- sapply(
    fgsea_res$leadingEdge,
    FUN = paste,
    collapse = " "
    )

new_directory <- dirname(enrichment_table)

if (!dir.exists(file.path(new_directory))) {
    dir.create(new_directory)
    }

write.table(
    x = fgsea_res,
    file = enrichment_table,
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
library("limma")
library("fgsea")

### SNAKEMAKE I/O ###
ebayes_model    <- snakemake@input[["ebayes_model"]]
geneset_collection <- snakemake@input[["hallmarks"]]
enrichment_table  <- snakemake@output[["enrichments"]]

## Load and set data types manually just in case
ebayes_model <- readRDS(ebayes_model)
geneset_collection <- fgsea::gmtPathways(geneset_collection)

## generate a named vector of gene-level stats
## An actual ranking is not needed
t_rank <- topTable(
    fit = ebayes_model,
    coef = "probability",
    number = Inf,
    sort.by = "t"
    )

named_gene_rank <- t_rank$t
names(named_gene_rank) <- rownames(t_rank)

fgsea_res <- fgsea(
    pathways = geneset_collection,
    stats = named_gene_rank,
    minSize = 15,
    maxSize = 500,
    gseaParam = 1,
    scoreType = "pos"
)

fgsea_res$leadingEdge <- sapply(
    fgsea_res$leadingEdge,
    FUN = paste,
    collapse = " "
    )

new_directory <- dirname(enrichment_table)

if (!dir.exists(file.path(new_directory))) {
    dir.create(new_directory)
    }

write.table(
    x = fgsea_res,
    file = enrichment_table,
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
library('edgeR')
library('limma')

### SNAKEMAKE I/O ###
dep_to_test           <- snakemake@input[['dependency_to_test']]
raw_gene_counts       <- snakemake@input[['raw_gene_counts']]

ebayes_model          <- snakemake@output[['ebayes']]

## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
dep_to_test           <- read.csv(dep_to_test)

dep_to_test$probability         <- as.numeric(dep_to_test$probability)
dep_to_test$lineage             <- as.factor(dep_to_test$lineage)

## Subset the counts
lines_to_test <- dep_to_test$cell_line
count_matrix  <- ccle_counts[,lines_to_test]

rownames(dep_to_test) <- dep_to_test$cell_line

## voom model
design <- model.matrix(~lineage + probability, data=dep_to_test)

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[,rownames(design)]

dge  <- DGEList(counts=count_matrix)
keep <- filterByExpr(dge, design=design, min.total.count = 100, min.count = 50)

dge <- dge[keep, keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge, method = "TMM")

## TODO: save voom model plot
v <- voom(dge, design = design, normalize.method = "quantile", plot=FALSE)

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
suppressMessages(library('GSEABase'))
suppressMessages(library('limma'))
suppressMessages(library('dplyr'))


## SNAKEMAKE I/O  ##

eBayes_model       <- snakemake@input[['fitted_bayes']]
geneset_directory  <- snakemake@output[['bidirectional_geneset']]

gene_name          <- snakemake@wildcards[['gene']]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")



generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between dep_associated_genes (positive logfold) 
    ## and non.dep associated genes (negative
    dependency_upregulated         <- deg_genes[deg_genes$logFC > 0, 'ID']
    dependency_downregulated       <- deg_genes[deg_genes$logFC < 0, 'ID']

    if(length(dependency_upregulated) >= 5){

       candidate_genes  <- extract_top_genes(deg_genes, 'dependent_up')
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, 'UP', sep='_')

        GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, '/', sig_name, '_dependency_UP', '.gmt', sep=''))
    }
    if(length(dependency_downregulated) >= 5){
        candidate_genes <- extract_top_genes(deg_genes, 'dependent_down')
        resistance_set  <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, 'DOWN', sep='_')

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, '/', sig_name, '_dependency_DOWN', '.gmt', sep='')) 
    }
}

extract_top_genes <- function(deg_genes, mode='dependent_up'){

    if(mode=='dependent_up'){
        deg_genes <- head(deg_genes[order(deg_genes$logFC, decreasing = TRUE), 'ID'], n=250)
    }else{
        deg_genes <- head(deg_genes[order(deg_genes$logFC, decreasing = FALSE), 'ID'], n=250)
    }

    return(deg_genes)
}


bayes_results <- readRDS(eBayes_model)

all_genes <- topTable(bayes_results, coef = 'probability', number = Inf, adjust.method = 'fdr')

all_genes <- all_genes[all_genes$adj.P.Val <= 0.05, ]

all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}


if(nrow(all_genes) >= 5){

    sig_name <- paste(sep='_', gene_name, 'DepMap', '22Q2')
    generate_bidirectional_signature(sig_name, all_genes)
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
library('ArrayExpress')

### SNAKEMAKE I/O ###
raw_cel_path      <- snakemake@output[['raw_cel_files']]

if(!dir.exists(file.path(raw_cel_path))){
    dir.create(raw_cel_path)}

# E-MTAB-3610 is the ArrayExpress ID of the GDSC-1000 profiling array
array_set = ArrayExpress::getAE('E-MTAB-3610', path=raw_cel_path, type='raw')
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
library("tidyverse")

### SNAKEMAKE I/O ###
gdsc_dose_response_curves      <- snakemake@input[["dose_response_curves"]]
cell_lines_annotation          <- snakemake@input[["cell_lines_annotation"]]
array_metadata                 <- snakemake@input[["array_metadata"]]

auc_models_candidates          <- snakemake@output[["auc_models_candidates"]]
compounds_lines_profiled       <- snakemake@output[["compounds_lines_profiled"]]

# Setup log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load tables

candidate_curves <- readxl::read_xlsx(gdsc_dose_response_curves) %>%
                    filter(RMSE <= 0.3) %>%
                    as.data.frame()

cell_lines_annotation <- data.table::fread(cell_lines_annotation) %>%
                        as.data.frame()

array_metadata <- data.table::fread(array_metadata) %>%
                  as.data.frame()

## Filter out drug/lines associations where the line is not profiled
## use the same column as gdsc_normalize_arrays
available_lines    <- array_metadata[, "Characteristics[cell line]"]
rm(array_metadata)

## Match SANGER and CCLE info
## I Manually tested it and only 1 line cannot be matched
sanger_lines <- unique(candidate_curves$SANGER_MODEL_ID)
ccle_lines   <- unique(cell_lines_annotation$Sanger_Model_ID)

common_lines <- intersect(sanger_lines, ccle_lines)

## Keep curves present in both the array AND the CCLE metadata
candidate_curves <- filter(candidate_curves, SANGER_MODEL_ID %in% common_lines, CELL_LINE_NAME %in% available_lines)

## Annotate cell line histology
candidate_curves <- merge(x=candidate_curves,
                         y=cell_lines_annotation[,c("Sanger_Model_ID", "lineage")],
                         by.x="SANGER_MODEL_ID",
                         by.y="Sanger_Model_ID",
                         all.x=TRUE,
                         all.y=FALSE)

## get the number of profiled lines/compound
lines_by_compound <- candidate_curves %>%
                     group_by(DRUG_ID) %>%
                     summarise(
                        profiled_lines = n_distinct(SANGER_MODEL_ID),
                        cv  = sd(AUC) / mean(AUC)
                        ) %>%
                     as.data.frame()

##TODO: SAVE THIS
write.csv(lines_by_compound, file=compounds_lines_profiled, row.names=FALSE)

## Get rid of models with less than 10 profiled lines
compounds_to_test <- lines_by_compound %>%
    filter(profiled_lines >= 10 & cv >= 0.1) %>%
    arrange(desc(cv)) %>% 
    distinct(DRUG_ID, .keep_all = TRUE) %>%
    pull(DRUG_ID)

candidate_curves   <- filter(candidate_curves, DRUG_ID %in% compounds_to_test)

candidate_curves$lineage <- as.factor(candidate_curves$lineage)

if(!dir.exists(file.path(auc_models_candidates))){
    dir.create(auc_models_candidates)}

by(candidate_curves,
   candidate_curves$DRUG_ID,
   FUN=function(i) write.csv(i, paste0(auc_models_candidates, "/", i$DRUG_ID[1], ".csv"), , row.names=FALSE))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
library("tidyverse")


### SNAKEMAKE I/O ###
signatures_data        <- snakemake@input[["signatures_data"]]
cell_line_annotation <- snakemake@input[["cell_line_annotation"]]
lines_compounds      <- snakemake@input[["lines_compounds"]]
compound_meta        <- snakemake@input[["compound_meta"]]

comma_file <- snakemake@output[["csv_db"]]
rdata      <- snakemake@output[["rdata_db"]]

full_table <- signatures_data %>%
    map(read_csv) %>%
    reduce(bind_rows)

## Annotate how many lines were profiled by comp.
lines_compounds <- read_csv(lines_compounds) 

## load compound metadata
compound_meta <- read_csv(compound_meta) %>%
    filter(`Sample Size` == "GDSC2") %>%
    select(drug_id, PubCHEM)

## Now keep columns of interest
# broad_id, name, auc, ic50, depmap_id, lineage, moa
filtered_data <- full_table %>%
    select(DRUG_ID,
           DRUG_NAME,
           AUC,
           LN_IC50,
           SANGER_MODEL_ID,
           lineage,
           PUTATIVE_TARGET,
           PATHWAY_NAME
           ) %>%
    left_join(y = lines_compounds, by = "DRUG_ID") %>%
    rename(drug_id = DRUG_ID,
           name = DRUG_NAME,
           auc = AUC,
           ic50 = LN_IC50,
           cell_id = SANGER_MODEL_ID,
           moa = PATHWAY_NAME,
           target = PUTATIVE_TARGET) %>%
    left_join(y = compound_meta, by = "drug_id") %>%
    rename(
        pubchem = PubCHEM
    )

## Attempt to get DepMap IDs using SANGER passport
cell_line_annotation <- read_csv(cell_line_annotation) %>%
    select(Sanger_Model_ID, DepMap_ID) %>%
    rename(cell_id = Sanger_Model_ID,
           depmap_id = DepMap_ID) %>%
    filter(!is.na(cell_id))

filtered_annotated_data <- filtered_data %>%
    left_join(cell_line_annotation, by = "cell_id") %>%
    select(-cell_id)

write_csv(x = filtered_annotated_data, file = comma_file)
save(filtered_annotated_data, file = rdata)    
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
library('limma')

### SNAKEMAKE I/O ###
compound_to_test        <- snakemake@input[['compound_to_test']]
normalized_arrays       <- snakemake@input[['normalized_arrays']]
ebayes_model            <- snakemake@output[['ebayes']]

## Load and set data types manually just in case
normalized_arrays           <- readRDS(normalized_arrays)
compound_to_test            <- read.csv(compound_to_test)

compound_to_test$AUC            <- as.numeric(compound_to_test$AUC)
compound_to_test$lineage        <- as.factor(compound_to_test$lineage)


## Subset the arrays
lines_to_test      <- compound_to_test$CELL_LINE_NAME
normalized_arrays  <- normalized_arrays[,lines_to_test]

rownames(compound_to_test) <- compound_to_test$CELL_LINE_NAME

design <- model.matrix(~lineage + AUC, data=compound_to_test)

## reorder count_matrix so that cols matches rows from design
normalized_arrays <- normalized_arrays[,rownames(design)]

fit <- lmFit(normalized_arrays, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
library('oligo')
library('oligoClasses')

### SNAKEMAKE I/O ###
raw_cel_path      <- snakemake@input[['cel_files']]
normalized_arrays <- snakemake@output[['normalized_arrays']]

# Setup log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

sdrf_location <- file.path(raw_cel_path, "E-MTAB-3610.sdrf.txt")
SDRF          <- read.delim(sdrf_location)


rownames(SDRF) <- SDRF$Array.Data.File
SDRF           <- AnnotatedDataFrame(SDRF)

raw_exprset <- oligo::read.celfiles(filenames = file.path(raw_cel_path, 
                                    SDRF$Array.Data.File),
                                    verbose = FALSE, phenoData = SDRF)


norm_eset <- oligo::rma(raw_exprset, background = TRUE, normalize = TRUE)

## Save it as a matrix directly, no need to keep the whole object.
## Make sure to change colnames 
pdata <- phenoData(norm_eset) 
pdata <- as.data.frame(pdata@data)

eset            <- exprs(norm_eset)
colnames(eset)  <- pdata[colnames(eset), 'Characteristics.cell.line.']

saveRDS(eset, normalized_arrays)
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
library("tidyverse")

### SNAKEMAKE I/O ###
gdsc_dose_response_curves      <- snakemake@input[["dose_response_curves"]]
cell_lines_annotation          <- snakemake@input[["cell_lines_annotation"]]
count_matrix                   <- snakemake@input[["count_matrix"]]

auc_models_candidates          <- snakemake@output[["auc_models_candidates"]]
compounds_lines_profiled       <- snakemake@output[["compounds_lines_profiled"]]

# Setup log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load tables

candidate_curves <- readxl::read_xlsx(gdsc_dose_response_curves) %>%
                    filter(RMSE <= 0.3) %>%
                    as.data.frame()

cell_lines_annotation <- data.table::fread(cell_lines_annotation) %>%
                        as.data.frame()

ccle_counts <- readRDS(count_matrix)

## Filter out drug/lines associations where the line is not profiled
available_lines    <- colnames(ccle_counts)
rm(ccle_counts)

## common lines check
common_lines <- intersect(
    candidate_curves$SANGER_MODEL_ID, 
    cell_lines_annotation$Sanger_Model_ID
    )

## Filter the curves based on expression availability and annotation
candidate_curves_filtered <- candidate_curves %>%
    filter(
        SANGER_MODEL_ID %in% common_lines
    ) %>%
    left_join(
        y = cell_lines_annotation[, c("Sanger_Model_ID", "DepMap_ID", "lineage")],
        by = c("SANGER_MODEL_ID" = "Sanger_Model_ID")
    ) %>%
    filter(
        DepMap_ID %in% available_lines
    )

## get the number of profiled lines/compound
lines_by_compound <- candidate_curves_filtered %>%
    group_by(DRUG_ID) %>%
    summarise(
        profiled_lines = n_distinct(SANGER_MODEL_ID),
        cv  = sd(AUC) / mean(AUC)
    ) %>%
    as.data.frame()

write.csv(lines_by_compound, file=compounds_lines_profiled, row.names=FALSE)

## Get rid of models with less than 10 profiled lines
compounds_to_test <- lines_by_compound %>%
    filter(profiled_lines >= 10 & cv >= 0.1) %>%
    arrange(desc(cv)) %>% 
    distinct(DRUG_ID, .keep_all = TRUE) %>%
    pull(DRUG_ID)

candidate_curves_filtered <- filter(candidate_curves_filtered,
                                    DRUG_ID %in% compounds_to_test
                                    )

candidate_curves_filtered$lineage <- as.factor(candidate_curves_filtered$lineage)

if(!dir.exists(file.path(auc_models_candidates))){
    dir.create(auc_models_candidates)}

by(candidate_curves_filtered,
   candidate_curves_filtered$DRUG_ID,
   FUN=function(i) write.csv(i, 
                             paste0(auc_models_candidates, 
                                    "/", i$DRUG_ID[1], ".csv"), ,
                             row.names=FALSE)
   )
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
suppressMessages(library("edgeR"))
suppressMessages(library("limma"))

### SNAKEMAKE I/O ###
compound_to_test      <- snakemake@input[["compound_to_test"]]
raw_gene_counts       <- snakemake@input[["raw_gene_counts"]]

ebayes_model          <- snakemake@output[["ebayes"]]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
compound_to_test      <- read.csv(compound_to_test)

compound_to_test$AUC            <- as.numeric(compound_to_test$AUC)
compound_to_test$lineage        <- as.factor(compound_to_test$lineage)

## Subset the counts
lines_to_test <- compound_to_test$DepMap_ID
count_matrix  <- ccle_counts[, lines_to_test]

rownames(compound_to_test) <- compound_to_test$DepMap_ID

## voom model
design <- model.matrix(~lineage + AUC, data = compound_to_test)

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[, rownames(design)]

dge  <- DGEList(counts = count_matrix)
keep <- filterByExpr(dge, design = design)

dge <- dge[keep, keep.lib.sizes = FALSE]
dge <- calcNormFactors(dge)

## TODO: save voom model plot
v <- voom(dge, plot = FALSE, normalize.method = "quantile")

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
suppressMessages(library("openxlsx"))
suppressMessages(library("GSEABase"))
suppressMessages(library("limma"))
suppressMessages(library("tidyverse"))

log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## SNAKEMAKE I/O  ##
eBayes_model       <- snakemake@input[["fitted_bayes"]]
drug_info          <- snakemake@input[["treatment_info"]]
geneset_directory <- snakemake@output[["bidirectional_geneset"]]
compound_id       <- snakemake@wildcards[["drug_id"]]

## SNAKEMAKE PARAMS ##
signature_type    <- tolower(as.character(snakemake@params[["signature_type"]]))

# Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

## If signature_type == "Classic",
## then we"ll take the top/bottom 250 genes sorted by T.statistics
## If signature_type == "fold", we"ll perform FDR + log fold selection

generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between S_genes (negative logfold) 
    ## and R genes (positive)
    if (signature_type == "classic") {
        sensitivity_genes <- deg_genes[deg_genes$t < 0, "ID"]
        resistance_genes  <- deg_genes[deg_genes$t > 0, "ID"]
    }else {
        sensitivity_genes <- deg_genes[deg_genes$logFC < 0, "ID"]
        resistance_genes  <- deg_genes[deg_genes$logFC > 0, "ID"]
    }
    if (length(sensitivity_genes) >= 15) {
       candidate_genes <- extract_top_genes(deg_genes, "sensitivity", signature_type=signature_type)
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, "UP", sep = "_")

       GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, "/", sig_name, "_UP", ".gmt", sep=""))
    }
    if (length(resistance_genes) >= 15){
        candidate_genes <- extract_top_genes(deg_genes, "resistance", signature_type=signature_type)
        resistance_set <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, "DOWN", sep = "_")

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, "/", sig_name, "_DOWN", ".gmt", sep="")) 
    }
}

#TODO: This function here is extremely ugly. Refactor it
extract_top_genes <- function(deg_genes, mode = "sensitivity", signature_type = "classic"){

    ## Extract and sort the top hits for the selected direction/mode
    if (mode == "sensitivity"){

        if(signature_type == "classic"){
            deg_genes <- head(deg_genes[order(deg_genes$t), "ID"], n=250)
        }else{
            deg_genes <- head(deg_genes[order(deg_genes$logFC), "ID"], n=250)
        }
    }else{
        if(signature_type=="classic"){
            deg_genes <- head(deg_genes[order(-deg_genes$t), "ID"], n=250)
        }else{
        deg_genes <- head(deg_genes[order(-deg_genes$logFC), "ID"], n=250)
        }
    }
    return(deg_genes)
}


# Get treatment annotation
drug_info <- read.xlsx(drug_info)
drug_info <- drug_info %>% dplyr::select(DRUG_ID, DRUG_NAME) %>% unique

bayes_results <- readRDS(eBayes_model)

if(signature_type == "classic"){
    all_genes <- topTable(bayes_results, coef="AUC", number = Inf, adjust.method="fdr")
}else{ 
    ## Set a threshold of the adjusted p-value if signature_type is not classic
    all_genes <- topTable(bayes_results, coef = "AUC", 
                     number = Inf, adjust.method = "fdr",
                     p.value=0.05)
}
# Set the genes as a column
all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}

## Check if we have at least 15 genes left after FDR-cutoff. Does not matter
## If both directions are < 15 individually, we"ll account for that later.
## Just make sure we are not evaluating empty results 

if(nrow(all_genes) >= 15){

    common_name <- drug_info[drug_info$DRUG_ID == compound_id, "DRUG_NAME"]

    if (length(common_name) == 0){
        common_name <- compound_id
    }

    sig_name <- paste(sep="_", common_name, "GDSC", compound_id)

    generate_bidirectional_signature(sig_name, all_genes)

}
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressMessages(library("limma"))
suppressMessages(library("openxlsx"))
suppressMessages(library("tidyverse"))
suppressMessages(library("GSEABase"))

## SNAKEMAKE I/O ##
eBayes_model <- snakemake@input[["fitted_bayes"]]
drug_info <- snakemake@input[["treatment_info"]]
hgnc <- snakemake@input[["hgnc"]]

geneset_directory <- snakemake@output[["bidirectional_geneset"]]

## SNAKEMAKE PARAMS ##
signature_type <- tolower(as.character(snakemake@params[['signature_type']]))

## FUNCTION ##
create.gmt <- function(x, mode) {
  gset <- GeneSet(x, geneIdType = SymbolIdentifier())
  setName(gset) <- paste(sig_name, mode, sep = "_")
  collapsed_name <- str_replace_all(sig_name, pattern = " ", repl = "")
  toGmt(gset, con = paste0(geneset_directory, '/', collapsed_name, "_", mode, 
                           ".gmt"))
}

## CODE ##
# Create output directory
dir.create(geneset_directory, showWarnings = FALSE)

# Get Bayes result
bayes_results <- readRDS(eBayes_model)

# Get treatment annotation
drug_info <- read.xlsx(drug_info)
drug_info <- drug_info %>% dplyr::select(DRUG_ID, DRUG_NAME) %>% unique

# Get signature name
sigID <- gsub("_eBayes.rds", "", basename(eBayes_model))
common_name <- drug_info$DRUG_NAME[match(sigID, drug_info$DRUG_ID, 
                                         nomatch = sigID)]
sig_name <- paste(common_name, "GDSC", sigID, sep = "_")

# Get probe ID and HUGO symbol correspondence
hgnc <- read.table(hgnc, sep = "\t", header = TRUE)

# Variables depending on signature_type
if (signature_type == "classic") {
  p.val = 1
  magnitude <- "t"
} else if (signature_type == "fold") {
  p.val = 0.05
  magnitude <- "logFC"
}

# Get top genes table
top_genes <- topTable(bayes_results, coef = "AUC", number = Inf, 
                      adjust.method = "fdr", p.value = p.val)

# Continue if the table is not empty
if (nrow(top_genes) != 0) {
  # Make the rownames a new column
  top_genes$probe <- rownames(top_genes)

  # Match with HUGO symbols
  top_genes <- na.omit(merge(top_genes, hgnc[, c("probe", "hgnc")], all.x = TRUE))

  # If there are several values for the same gene, keep the one with the highest 
  # absolute value of "magnitude"
  top_genes <- top_genes %>% group_by(hgnc) %>% 
    dplyr::slice(which.max(get(magnitude)))

  # Create bidirectional signature
  if (signature_type == "classic") {
    sensitivity <- top_genes %>% filter(t < 0)
    resistance <- top_genes %>% filter(t > 0)
  } else if (signature_type == "fold") {
    sensitivity <- top_genes %>% filter(logFC < 0)
    resistance <- top_genes %>% filter(logFC > 0)
  }

  # Keep the 250 top/bottom genes
  sensitivity <- head(sensitivity, n = 250) %>% arrange(desc(get(magnitude))) %>% 
    dplyr::select(hgnc) %>% pull
  resistance <- tail(resistance, n = 250) %>% arrange(get(magnitude)) %>%
    dplyr::select(hgnc) %>% pull

  # Ugly fix to keep fold signatures at 250. We really should refactor these
  if (signature_type == "fold"){
    sensitivity <- head(sensitivity, n = 250)
    resistance <- head(resistance, n = 250)
  }

  # Create a gmt if the number of genes > 15
  if (length(sensitivity) >= 15) {
    create.gmt(sensitivity, "UP")
  }
  if (length(resistance) >= 15) {
    create.gmt(resistance, "DOWN")
  }
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
suppressMessages(library("edgeR"))
suppressMessages(library("limma"))

### SNAKEMAKE I/O ###
met_type_to_test      <- snakemake@input[["mets_to_test"]]
raw_gene_counts       <- snakemake@input[["raw_gene_counts"]]

ebayes_model          <- snakemake@output[["ebayes"]]

### SNAKEMAKE PARAMS ###
model_type <- snakemake@params[["model_type"]]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
met_type_to_test      <- read.csv(met_type_to_test)

## Sanity check
met_type_to_test$mean            <- as.numeric(met_type_to_test$mean)
met_type_to_test$penetrance      <- as.numeric(met_type_to_test$penetrance)
met_type_to_test$lineage        <- as.factor(met_type_to_test$lineage)

## Subset the counts
lines_to_test <- met_type_to_test$DepMap_ID
count_matrix  <- ccle_counts[, lines_to_test]

rownames(met_type_to_test) <- met_type_to_test$DepMap_ID

## voom model
if(model_type == "mean"){
    design <- model.matrix(~lineage + mean, data=met_type_to_test)
}else{
    design <- model.matrix(~lineage + penetrance, data=met_type_to_test)
}

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[ ,rownames(design)]

dge  <- DGEList(counts=count_matrix)
keep <- filterByExpr(dge, design=design)

dge <- dge[keep, keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)

## TODO: save voom model plot
v <- voom(dge, plot=FALSE, normalize.method="quantile")

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
library("tidyverse")


### SNAKEMAKE I/O ###
metastasis_penetrance        <- snakemake@input[["metastasis_data"]]
cell_lines_annotation  <- snakemake@input[["cell_lines_annotation"]]
count_matrix           <- snakemake@input[["count_matrix"]]

met_model_candidates   <- snakemake@output[["met_model_candidates"]]
met_lines_profiled      <- snakemake@output[["met_lines_profiled"]]

## load tables
metastasis_penetrance <- readr:::read_csv(metastasis_penetrance) %>%
                  as.data.frame()

cell_lines_annotation <- data.table::fread(cell_lines_annotation) %>%
                         as.data.frame()

ccle_counts <- readRDS(count_matrix)

metastasis_penetrance <- left_join(
    x = metastasis_penetrance,
    y = cell_lines_annotation,
    by = c("cell_line" = "CCLE_Name")
)  

## Get rid of cell lines and met data for which no RNASeq profiling was done
available_lines <- intersect(colnames(ccle_counts), metastasis_penetrance$DepMap_ID)

selected_models <- metastasis_penetrance %>%
    filter(DepMap_ID %in% available_lines) %>%
    select(
        met_model,
        DepMap_ID,
        mean, 
        penetrance,
        cell_line,
        lineage,
        original_lineage,
        primary_or_metastasis
        ) %>%
    mutate(
        lineage = as_factor(lineage),
        met_model = as_factor(met_model)
    )

## Get the nº. lines/compound
lines_and_mets <- selected_models %>%
    group_by(met_model) %>%
    summarise(profiled_lines = n_distinct(DepMap_ID)) %>%
    as.data.frame()

write.csv(lines_and_mets, file = met_lines_profiled, row.names=FALSE)


if(!dir.exists(file.path(met_model_candidates))){
    dir.create(met_model_candidates)}

by(selected_models,
   selected_models$met_model,
   FUN=function(i) write.csv(i, paste0(met_model_candidates, '/', i$met_model[1], ".csv"), , row.names=FALSE))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
suppressMessages(library("GSEABase"))
suppressMessages(library("limma"))
suppressMessages(library("dplyr"))


## SNAKEMAKE I/O  ##

eBayes_model       <- snakemake@input[["fitted_bayes"]]
geneset_directory  <- snakemake@output[["bidirectional_geneset"]]

met_name          <- snakemake@wildcards[["met_type"]]
model_type        <- snakemake@params[["model_type"]]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")



generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between dep_associated_genes (positive logfold) 
    ## and non.dep associated genes (negative
    dependency_upregulated         <- deg_genes[deg_genes$logFC > 0, "ID"]
    dependency_downregulated       <- deg_genes[deg_genes$logFC < 0, "ID"]

    if(length(dependency_upregulated) >= 5){

       candidate_genes  <- extract_top_genes(deg_genes, "met_UP")
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, "UP", sep="_")

        GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, "/", sig_name, "_", model_type, "_met_UP", ".gmt", sep=""))
    }
    if(length(dependency_downregulated) >= 5){
        candidate_genes <- extract_top_genes(deg_genes, "met_DOWN")
        resistance_set  <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, "DOWN", sep="_")

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, "/", sig_name, "_", model_type, "_met_DOWN", ".gmt", sep="")) 
    }
}

extract_top_genes <- function(deg_genes, mode="met_UP"){

    if(mode=="met_UP"){
        deg_genes <- head(deg_genes[order(deg_genes$logFC, decreasing = TRUE), "ID"], n=250)
    }else{
        deg_genes <- head(deg_genes[order(deg_genes$logFC, decreasing = FALSE), "ID"], n=250)
    }

    return(deg_genes)
}


bayes_results <- readRDS(eBayes_model)

all_genes <- topTreat(bayes_results, coef = model_type, 
                     number = Inf, adjust.method = "fdr")

all_genes <- all_genes[all_genes$adj.P.Val <= 0.05, ]

all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}


if(nrow(all_genes) >= 5){

    sig_name <- paste(sep="_", met_name, model_type, "MetMap", "2022")
    generate_bidirectional_signature(sig_name, all_genes)
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
library("tidyverse")


### SNAKEMAKE I/O ###
response_curves        <- snakemake@input[["response_curves"]]
cell_lines_annotation  <- snakemake@input[["cell_lines_annotation"]]
count_matrix           <- snakemake@input[["count_matrix"]]

compounds_lines_profiled   <- snakemake@output[["compounds_lines_profiled"]]
auc_models_candidates      <- snakemake@output[["auc_models_candidates"]]

## load tables
response_curves <- data.table::fread(response_curves) %>%
                  as.data.frame()

cell_lines_annotation <- data.table::fread(cell_lines_annotation) %>%
                         as.data.frame()

ccle_counts <- readRDS(count_matrix)

## Filter out LOW QC response curves
## MTS are technical redos of some assays
## Keep HTS for now, in the future maybe mix, as it seems MTS-10 is better
response_curves <- response_curves[!is.na(response_curves$auc),]

response_curves <- response_curves %>%
    filter(screen_id == "HTS002" & !is.na(auc) & r2 >= 0.7 & lower_limit <= 1)

## Get rid of cell lines and curves for which no RNASeq profiling was done
available_lines <- intersect(colnames(ccle_counts), response_curves$depmap_id)
response_curves <- response_curves[response_curves$depmap_id %in% available_lines, ]

## Get the nº. lines/compound and the CV
lines_and_compounds <- response_curves %>%
    group_by(broad_id) %>%
    summarise(
        profiled_lines = n_distinct(depmap_id),
         cv = sd(auc) / mean(auc),
    ) %>%
    as.data.frame()

write.csv(lines_and_compounds, file = compounds_lines_profiled, row.names=FALSE)

## Keep compounds with at least 10 profiled lines and CV >= 0.1
## Also, get rid of duplicated compund assays. Keep the one with the highest CV
compounds_to_test <- lines_and_compounds %>%
    filter(profiled_lines >= 10 & cv >= 0.1) %>%
    arrange(desc(cv)) %>%
    distinct(broad_id, .keep_all = TRUE) %>%
    pull(broad_id)  

response_curves   <- filter(response_curves, broad_id %in% compounds_to_test)

## Annotate cell line histology
response_curves <- merge(x=response_curves,
                         y=cell_lines_annotation[,c("DepMap_ID", "lineage")],
                         by.x="depmap_id",
                         by.y="DepMap_ID",
                         all.x=TRUE,
                         all.y=FALSE)

response_curves$lineage <- as.factor(response_curves$lineage)

if(!dir.exists(file.path(auc_models_candidates))){
    dir.create(auc_models_candidates)}

by(response_curves,
   response_curves$broad_id,
   FUN=function(i) write.csv(i, paste0(auc_models_candidates, "/", i$broad_id[1], ".csv"), , row.names=FALSE))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
library("tidyverse")

### SNAKEMAKE I/O ###
compound_data        <- snakemake@input[['compound_data']]
lines_compounds      <- snakemake@input[['lines_compounds']]

comma_file <- snakemake@output[['csv_db']]
rdata      <- snakemake@output[['rdata_db']]

full_table <- compound_data %>%
    map(read_csv) %>%
    reduce(bind_rows)

## Annotate how many lines were profiled by comp.
lines_compounds <- read_csv(lines_compounds) 

## Now keep columns of interest
filtered_data <- full_table %>%
    select(
        broad_id,
        name,
        auc,
        ic50,
        depmap_id,
        lineage,
        moa,
        target,
        smiles
        ) %>%
    mutate(
         cid = str_remove_all(
            string = smiles,
            pattern = ",.*"
         )
    ) %>%
    left_join(y = lines_compounds, by = "broad_id")



write_csv(x = filtered_data, file = comma_file)
save(filtered_data, file = rdata)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
suppressMessages(library('edgeR'))
suppressMessages(library('limma'))

### SNAKEMAKE I/O ###
compound_to_test      <- snakemake@input[['compound_to_test']]
raw_gene_counts       <- snakemake@input[['raw_gene_counts']]

ebayes_model          <- snakemake@output[['ebayes']]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
compound_to_test      <- read.csv(compound_to_test)

compound_to_test$auc            <- as.numeric(compound_to_test$auc)
compound_to_test$lineage        <- as.factor(compound_to_test$lineage)

## Subset the counts
lines_to_test <- compound_to_test$depmap_id
count_matrix  <- ccle_counts[,lines_to_test]

rownames(compound_to_test) <- compound_to_test$depmap_id

## voom model
design <- model.matrix(~lineage + auc, data=compound_to_test)

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[,rownames(design)]

dge  <- DGEList(counts=count_matrix)
keep <- filterByExpr(dge, design=design)

dge <- dge[keep, keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)

## TODO: save voom model plot
v <- voom(dge, plot=FALSE, normalize.method='quantile')

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
suppressMessages(library('tidyverse'))

### SNAKEMAKE I/O ###
raw_expected_counts <- snakemake@input[["raw_expected_counts"]]
ccle_default_line <- snakemake@input[["ccle_default_line"]]
protein_coding_genes <- snakemake@input[["protein_coding_genes"]]
raw_gene_counts <- snakemake@output[["raw_gene_counts"]]

### SNAKEMAKE PARAMS ###
coding_genes <- as.logical(snakemake@params["coding_genes_only"])

## Load CCLE raw counts
ccle_counts <- data.table::fread(raw_expected_counts)

## Load cell line equivalences
cell_lines <- data.table::fread(ccle_default_line)

## Keep RNA cell line equivalences
cell_lines <- cell_lines %>%
  filter(ProfileType == "rna") %>%
  rename(V1 = ProfileID) %>%
  select(V1, ModelID) %>%
  unique()

## Convert CCLE raw counts to a matrix where each gene is a row and each sample, 
## a column.
## Input is RSEM expected counts (thus I have to round up to units)
genes <- colnames(ccle_counts)[-1]
ccle_counts <- ccle_counts %>%
  merge(cell_lines, by = "V1") %>%
  column_to_rownames("ModelID") %>%
  select(-V1) %>%
  t() %>%
  as.data.frame()
rownames(ccle_counts) <- genes

## Most of the genes with NA values are ERCC-0000X spike-ins
ccle_counts <- na.omit(ccle_counts)

## BROAD format for gene names is HUGO (ENSEMBL)
ccle_counts <- ccle_counts %>%
  mutate(gene_variance = unname(apply(ccle_counts, MARGIN = 1, FUN = var)),
         Gene = str_remove(rownames(ccle_counts), pattern = " \\(.*"),
         Ensembl = str_extract(rownames(ccle_counts), 
                               pattern = "ENSG[0-9]+")) %>%
  ## Get rid of remaining spikes
  filter(!grepl(Gene, pattern = "ERCC-", fixed = TRUE))

## If coding_genes = TRUE, just keep protein coding genes
if (coding_genes) {
  hgnc_coding_genes <- data.table::fread(protein_coding_genes) %>%
    rename(Gene = symbol, Ensembl = ensembl_gene_id) %>%
    select(Gene, Ensembl) %>%
    unique()
  ccle_counts <- ccle_counts %>%
    select(-Gene) %>%
    merge(hgnc_coding_genes)
}

## Keep the most variable gene when two ENSEMBL ids point to the same HGNC
ccle_counts <- ccle_counts %>%
  group_by(Gene) %>%
  filter(gene_variance == max(gene_variance)) %>%
  column_to_rownames("Gene") %>%
  select(-gene_variance, -Ensembl)

## Round counts
ccle_counts <- round(as.matrix(ccle_counts), digits = 0)

## Save object
saveRDS(ccle_counts, file = raw_gene_counts)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
suppressMessages(library('GSEABase'))
suppressMessages(library('limma'))
suppressMessages(library('dplyr'))


## SNAKEMAKE I/O  ##
eBayes_model       <- snakemake@input[['fitted_bayes']]
drug_info          <- snakemake@input[['treatment_info']]

geneset_directory <- snakemake@output[['bidirectional_geneset']]

compound_id       <- snakemake@wildcards[['broad_id']]

## SNAKEMAKE PARAMS ##
signature_type    <- tolower(as.character(snakemake@params[['signature_type']]))

# Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

## If signature_type == 'Classic',
## then we'll take the top/bottom 250 genes sorted by T.statistics
## If signature_type == 'fold', we'll perform FDR + log fold selection

generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between S_genes (negative logfold) 
    ## and R genes (positive)
    if(signature_type=='classic'){
        sensitivity_genes <- deg_genes[deg_genes$t <0, 'ID']
        resistance_genes  <- deg_genes[deg_genes$t >0, 'ID']
    }else{
        sensitivity_genes <- deg_genes[deg_genes$logFC < 0, 'ID']
        resistance_genes  <- deg_genes[deg_genes$logFC > 0, 'ID']
    }
    if(length(sensitivity_genes) >= 15){
       candidate_genes <- extract_top_genes(deg_genes, 'sensitivity', signature_type=signature_type)
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, 'UP', sep='_')

       GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, '/', sig_name, '_UP', '.gmt', sep=''))
    }
    if(length(resistance_genes) >= 15){
        candidate_genes <- extract_top_genes(deg_genes, 'resistance', signature_type=signature_type)
        resistance_set <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, 'DOWN', sep='_')

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, '/', sig_name, '_DOWN', '.gmt', sep='')) 
    }
}

#TODO: This function here is extremely ugly. Refactor it
extract_top_genes <- function(deg_genes, mode='sensitivity', signature_type='classic'){

    ## Extract and sort the top hits for the selected direction/mode
    if(mode=='sensitivity'){

        if(signature_type=='classic'){
            deg_genes <- head(deg_genes[order(deg_genes$t), 'ID'], n=250)
        }else{
            deg_genes <- head(deg_genes[order(deg_genes$logFC), 'ID'], n=250)
        }
    }else{
        if(signature_type=='classic'){
            deg_genes <- head(deg_genes[order(-deg_genes$t), 'ID'], n=250)
        }else{
        deg_genes <- head(deg_genes[order(-deg_genes$logFC), 'ID'], n=250)
        }
    }
    return(deg_genes)
}


drug_info <- data.table::fread(drug_info) %>%
    as.data.frame()

drug_info <- drug_info[!duplicated(drug_info$broad_id),]
drug_info <- drug_info[drug_info$screen_id == 'HTS002',]

bayes_results <- readRDS(eBayes_model)

if(signature_type == 'classic'){
    all_genes <- topTable(bayes_results, coef='auc', number = Inf, adjust.method='fdr')
}else{ 
    ## Set a threshold of the adjusted p-value if signature_type is not classic
    all_genes <- topTable(bayes_results, coef = 'auc', 
                     number = Inf, adjust.method = 'fdr',
                     p.value=0.05)
}
# Set the genes as a column
all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}

## Check if we have at least 15 genes left after FDR-cutoff. Does not matter
## If both directions are < 15 individually, we'll account for that later.
## Just make sure we are not evaluating empty results 

if(nrow(all_genes) >= 15){

    common_name <- drug_info[drug_info$broad_id == compound_id, 'name']

    if(length(common_name) == 0){
        common_name <- compound_id}

    brd_split   <- strsplit(x=compound_id, split='-', fixed=TRUE)[[1]][2]

    sig_name <- paste(sep='_', common_name,'PRISM',brd_split)

    generate_bidirectional_signature(sig_name, all_genes)

}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressMessages(library("hgu219.db"))
suppressMessages(library("biomaRt"))
suppressMessages(library("dplyr"))

## CODE ##
httr::set_config(httr::config(ssl_verifypeer = FALSE))

# Get ENSEMBL IDs for each probe
probeIDs <- unlist(as.list(hgu219ENSEMBL))
probeIDs <- na.omit(data.frame(probe = names(probeIDs), ensembl = probeIDs, 
                               row.names = NULL))

# BiomaRt
mart <- useMart("ENSEMBL_MART_ENSEMBL")
human <- useDataset("hsapiens_gene_ensembl", mart)

# Get HUGO genes
hugoIDs <- getBM(c("hgnc_symbol", "ensembl_gene_id"), mart = human,
                 filters = "ensembl_gene_id", values = unique(probeIDs$ensembl))
colnames(hugoIDs) <- c("hgnc", "ensembl")
hugoIDs <- hugoIDs %>% mutate(hgnc = na_if(hgnc, "")) %>% na.omit

# Merge tables
merged <- merge(probeIDs, hugoIDs, by = "ensembl")
merged <- unique(merged[c("probe", "hgnc", "ensembl")])

# Save
write.table(merged, file = snakemake@output[["tsv"]], sep = "\t", 
            row.names = FALSE, quote = FALSE)
177
178
shell:
    "cat {results}/ctrp/genesets/classic/*/*.gmt {results}/gdsc_rna/genesets/classic/*/*.gmt {results}/prism/genesets/classic/*/*.gmt > {output}"
ShowHide 56 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/cnio-bu/drug_susceptibility_collection
Name: drug_susceptibility_collection
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...