Supplementary material for the OLOGRAM-MODL paper


This is the code accompanying the paper describing OLOGRAM-MODL. It was used to create supplementary files and the paper's figures.

To run it, first create a conda environment containing pygtftk with conda env create -f ologram_modl_env.yaml. This will download the latest pygtftk release from Bioconda and install Snakemake, as well as all required dependencies.
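For instance, the full setup could look as follows; the environment name below is an assumption, check the name actually defined in ologram_modl_env.yaml:

# Create the environment from the provided YAML file
conda env create -f ologram_modl_env.yaml
# Activate it before running the workflow (environment name assumed)
conda activate ologram_modl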

Then, to run the workflow itself, use these commands:

# Full run, where -j is the number of jobs in parallel
snakemake -j4
# Dry run (nothing is done)
snakemake -n -p --reason
# Example: full run on a cluster, with qsub, on the "tagc" queue
snakemake -c 'qsub -q tagc -V -l nodes=1:ppn={threads}' -j32

To clean the working directory:

rm -rf output
rm -rf input
git reset --hard

This code is available under the GNU General Public License v3.

Code Snippets

from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText

from scipy.stats import beta
from pygtftk.stats.beta import fit_beta

# Graph paths
res_file_a = snakemake.output["a"]   
res_file_b = snakemake.output["b"] 
res_file_min = snakemake.output["mini"] 
res_file_max = snakemake.output["maxi"] 
res_file_v = snakemake.output["v"] 

# Create directories if needed
for filepath in [res_file_a, res_file_b, res_file_min, res_file_max, res_file_v]:
    path = Path(filepath).parent # Get parent directory
    Path(path).mkdir(parents=True, exist_ok=True)


# Fix the parameters to be fitted
a, b = 1., 2.

def beta_var(a,b) : return (a*b)/((a+b)**2 * (a+b+1)) 
true_variance = beta_var(a,b)
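# Worked example (added for clarity): with a = 1 and b = 2, the true variance is
# (1*2) / ((1+2)**2 * (1+2+1)) = 2/36 ≈ 0.0556; the fitted and empirical variances
# plotted below should approach this value as the sample size grows.
assert abs(true_variance - 2/36) < 1e-12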

np.random.seed(seed=42)

# ---------------------------- Experiment ----------------------------- #
# Make many different fittings with many different samples sizes
K = 200
SAMPLE_SIZES = [100]*K + [200]*K + [1000]*K + [10000]*K + [20000]*K

result = list()

for size in SAMPLE_SIZES:
    obs = beta.rvs(a, b, size=size)  # Generate data
    ahat, bhat, mhat, chat = fit_beta(obs) # Fit

    # Record result
    result += [{
        'samples':size,
        'ahat':ahat,
        'bhat':bhat,
        'mhat':mhat,
        'chat':chat,
        'var':np.var(obs, ddof=1),
        'rebuilt_var':beta_var(ahat, bhat)
    }]


# ------------------------------- Plot figures ------------------------------- #
result_df = pd.DataFrame(result)
result_df['Number of samples'] = result_df['samples'] # Renaming


### For alpha

meanplot = result_df.boxplot(column=['ahat'], by='Number of samples')

# Add a line for the true parameter
plt.axhline(y=a, color='g', linestyle='-')

at = AnchoredText("True α = "+str(a),
                  prop=dict(size=8), frameon=True,
                  loc='upper right',
                  )
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)

plt.savefig(res_file_a)
plt.close()

### Same for beta

meanplot = result_df.boxplot(column=['bhat'], by='Number of samples')

plt.axhline(y=b, color='b', linestyle='-')
at = AnchoredText("True β = "+str(b), prop=dict(size=8), frameon=True, loc='upper right')
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)

plt.savefig(res_file_b)
plt.close()


### And min and max

meanplot = result_df.boxplot(column=['mhat'], by='Number of samples')
plt.axhline(y=0, color='r', linestyle='-')
at = AnchoredText("True minimum = "+str(0), prop=dict(size=8), frameon=True, loc='upper right')
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)
plt.savefig(res_file_min)
plt.close()

meanplot = result_df.boxplot(column=['chat'], by='Number of samples')
plt.axhline(y=1, color='r', linestyle='-')
at = AnchoredText("True maximum = "+str(1), prop=dict(size=8), frameon=True, loc='upper right')
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)
plt.savefig(res_file_max)
plt.close()


### And for the variance, to show that a Negative Binomial fit would thus be better

# Renaming
result_df['Empirical'] = result_df['var']
result_df['Fitted'] = result_df['rebuilt_var']

sns.set_theme(style="whitegrid")
dd = pd.melt(result_df,id_vars=['Number of samples'],value_vars=['Empirical','Fitted'],var_name='Variance')
vplot = sns.boxplot(x='Number of samples', y='value', data=dd, hue='Variance',
    linewidth=1)

vplot.get_figure().savefig(res_file_v)
import numpy as np
import pandas as pd
import pandas.api.types as pdtypes
from pygtftk.stats.intersect.modl import tree
from functools import partial
import math
from collections import Counter

from plotnine import ggplot, aes, labs, scale_color_gradient, geom_point, geom_abline, scale_x_log10, scale_y_log10, geom_violin, geom_boxplot, position_dodge, xlab, ylab, ggtitle

ROOT_PATH = "./output/ologram_result_scatacseq_pbmc/"
DATA_ROOT_PATH = "./output/sc_atac_seq_pbmc_data/"

# For manual execution
# ROOT_PATH = "../output/ologram_result_scatacseq_pbmc/"
# DATA_ROOT_PATH = "../output/sc_atac_seq_pbmc_data/"


## Superclusters: how to regroup the cell types
# For now, this is hardcoded
SUPERCLUSTERS = {
    "CD14+ Monocytes":  "cd14",
    "CD4 Naive":        "cd48",
    "CD8 Naive":        "cd48",
    "Quer":             "NA",
    "...":              "NA",
    "NA":               "NA",
    "pre-B cell":       "preB"
}


# ---------------------------------------------------------------------------- #

## Read the OLOGRAM-MODL result file
# Read dataframe to create the found_combis dictionary
df_res = pd.read_csv(ROOT_PATH+"merged_batches_result.tsv", sep='\t', header=0, index_col=None)
# Pval set to 0 or -1 are changed to 1e-320 and NaN respectively
df_res.loc[df_res['summed_bp_overlaps_pvalue'] == 0, 'summed_bp_overlaps_pvalue'] = 1e-320
df_res.loc[df_res['summed_bp_overlaps_pvalue'] == -1, 'summed_bp_overlaps_pvalue'] = np.nan

# Create a Library tree with the combinations, as in the MODL algorithm itself
print("Creating a Library for these words. This can take a very long time for longer words.")
print("In this particular Snakemake workflow, it should take about 12 minutes.")

L = tree.Library()
L.build_nodes_for_words_from_ologram_result_df(df_res)
L.assign_nodes()


# ---------------------------------------------------------------------------- #


# Remember the feature names are in L.features_names
# and the interesting attributes of each node are:
#   node.word, node.children, node.s, node.pval and node.fc
def word_to_combi(word): return [L.features_names[i] for i in range(len(word)) if word[i]]

# Retrieves all combis and enrichment
def work_on_this_node(node, graph):
    return word_to_combi(node.word), node.fc, node.pval, node.s

# Iterate over all nodes
mygraphfunc = partial(work_on_this_node, graph=L)
global_results = {}
tree.apply_recursively_to_all_nodes(L.root_node, mygraphfunc, global_results)

df = pd.DataFrame(global_results.values(), columns = ["combination","fc","pval","s"])

df['combi_length'] = (df['combination'].apply(len)) - 2
# Subtract 2 from the length to discard the "Query" and "..." terms of the combinations

# Now, get the enrichments of all 2-wise combis, all 3-wise, etc.
def entropy(data):
    """
    Compute Shannon entropy.
    """
    if len(data) <= 1: return 0

    # Data counts
    counts = Counter()
    for d in data: counts[d] += 1

    ent = 0
    probs = [float(c) / len(data) for c in counts.values()]
    for p in probs:
        if p > 0.: ent -= p * math.log(p, 2)

    return ent
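
# --- Illustration (added as an example, not part of the original analysis) ---
# Two equally frequent classes give an entropy of 1 bit; a single class gives 0.
assert entropy(["cd14", "cd14", "cd48", "cd48"]) == 1.0
assert entropy(["cd14", "cd14", "cd14"]) == 0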


# ---------------------------------------------------------------------------- #


# Read the translation table (a dataframe mapping each cell 'id' to its 'class')
# and convert it to a dictionary

df_map = pd.read_csv(DATA_ROOT_PATH + "cell_to_class.tsv", sep='\t', header=0, index_col=None)
df_map.set_index('id', inplace=True)
df_map.transpose()
dict_translate = df_map['class'].to_dict()

dict_translate["Query"] = "NA" # Add the 'Query' class
dict_translate["..."] = "NA"


def combination_to_combi_of_classes(combination):
    new_combination = []
    for element in combination:
        current_key = dict_translate[element]
        element_to_add = SUPERCLUSTERS[current_key]
        if element_to_add != "NA": 
            new_combination += [element_to_add]
    return new_combination

df['combination_classes'] = df['combination'].apply(combination_to_combi_of_classes)


## Compute entropy
df['entropy'] = df['combination_classes'].apply(entropy)
# Round to 2 decimals and make a category
df['entropy'] = df['entropy'].apply(lambda x: round(x,2)).astype(pdtypes.CategoricalDtype())


# Using only combis of each individual length

all_lengths = sorted(set(df['combi_length']))

# No point in going beyond, roughly, 12
all_lengths = [l for l in all_lengths if l <= 12]

for length in all_lengths:

    df_filtered = df.loc[df['combi_length'] == length,:]

    try:
        p = (ggplot(data=df_filtered, mapping = aes(x='entropy', y='fc'))
                + geom_violin(position = position_dodge(1), width = 1)
                + geom_boxplot(position = position_dodge(1), width = 0.25)
                + xlab("Entropy") + ylab("Fold change (log2)")
                + ggtitle("Order "+str(length)))

        p.save(filename = ROOT_PATH + "entropy_graph/entropy_length_" + str(length) + "_fc.png")



        p = (ggplot(data=df_filtered, mapping = aes(x='entropy', y='s'))
                + geom_violin(position = position_dodge(1), width = 1)
                + geom_boxplot(position = position_dodge(1), width = 0.25)
                + xlab("Entropy") + ylab("True total overlapping bp.")
                + ggtitle("Order "+str(length)))

        p.save(filename = ROOT_PATH + "entropy_graph/entropy_length_" + str(length) + "_s.png")


    except Exception:
        print("Skipping length", length, "which caused an error.")
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.offsetbox import AnchoredText

# root_dir = './output/supp_fig4/'
# concat_file = root_dir + 'merged_ologram_test.tsv'
# res_file_1 = root_dir + 'S_mean_boxplot.pdf'
# res_file_2 = root_dir + 'S_var_boxplot.pdf'

concat_file = snakemake.input["merged"]         # Concatenated shuffles
truth_file = snakemake.input["truth"]           # Deep sampled empirical p-value
res_file = snakemake.output[0]                  # Result file


# ---------------------------- Read result files ----------------------------- #
def result_ologram_to_list_of_lists(file, is_truth = False):

    # Skip even-numbered rows except row 0: they are repeated headers from the other concatenated files (non-truth data only)
    def logic(index):
        return (index % 2 == 0) and (index != 0)

    if not is_truth: df = pd.read_csv(file, sep='\t', header = 0, index_col = None, skiprows = lambda x: logic(x))
    else: df = pd.read_csv(file, sep='\t', header = 0, index_col = None)

    # Iterate over each row and build a result object
    result = list()
    for i, row in df.iterrows():
        header = row['feature_type'].split('_')

        if not is_truth:
            r = {
                'nb_minibatches': int(header[2]),
                'try_id': int(header[5]),
                'p_val': row['summed_bp_overlaps_pvalue'],
                'log_10(p_val)': np.log10(row['summed_bp_overlaps_pvalue'] + 1E-320),
                'log_10(empirical_pval)': np.log10(row["summed_bp_overlaps_empirical_pvalue"] + 1E-320)
            }
        else:
            r = {
                'p_val': row['summed_bp_overlaps_pvalue'],
                'log_10(p_val)': np.log10(row['summed_bp_overlaps_pvalue'] + 1E-320),
                'log_10(empirical_pval)': np.log10(row["summed_bp_overlaps_empirical_pvalue"] + 1E-320)
                #'log_10(beta_pval)': np.log10(row["ad_hoc_beta_summed_bp_overlaps_pvalue"] + 1E-320)
            }

        result += [r]

    return result


result = result_ologram_to_list_of_lists(concat_file)
result_truth = result_ologram_to_list_of_lists(truth_file, is_truth = True)

# ------------------------------- Plot figure -------------------------------- #
result_df = pd.DataFrame(result)

MINIBATCH_SIZE = 10 # Hardcoded for now, keep it the same in the Snakefile
result_df['Number of shuffles'] = MINIBATCH_SIZE * result_df['nb_minibatches']

meanplot = result_df.boxplot(column=['log_10(p_val)'], by='Number of shuffles')
result_truth_df = pd.DataFrame(result_truth)

# Add a line for the true p-val (as estimated by very deep sampling)
true_pval = list(result_truth_df['log_10(empirical_pval)'])[0] # Hardcoded to read the first line, since there should be only one
plt.axhline(y=true_pval, color='g', linestyle='-')

#beta_pval = list(result_truth_df['log_10(beta_pval)'])[0] # Hardcoded to read the first line, since there should be only one
#plt.axhline(y=beta_pval, color='r', linestyle='-')

at = AnchoredText("True p-val = "+'{0:.4g}'.format(10**true_pval),
                  prop=dict(size=8), frameon=True,
                  loc='upper right',
                  )
at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
meanplot.add_artist(at)

plt.savefig(res_file)
import numpy as np
import pandas as pd

import time
import subprocess
import warnings
import random

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils

## ---------------------------- Parameters ---------------------------------- ##

np.random.seed(42)
random.seed(42)

utils.VERBOSITY = 3 # Force debug messages to appear
N_THREADS = 4

## Paths
# Hardcoded for now. It was necessary to launch this script in 
# shell and not via snakemake.script to allow proper redirection of the log
OUTPUT_ROOT = "output/benchmark/comparison/" # Output path
EXT_PATH = "ext/"   # Path to external tools

# # For manual launch
# OUTPUT_ROOT = "../output/benchmark/comparison/" # Output path
# EXT_PATH = "../ext/"   # Path to external tools

# Debug : plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")


# Data parameters
NB_SETS = 6
NFLAGS = 10000
NOISE = 0.12
NOISE_HIGH = 0.2
CORR_GROUPS = [(0,1),(0,1,2,3),(4,5)]

# MODL parameters
N_QUERIED_ATOMS = 3

# Other algorithms' parameters
MIN_SUPPORT = 1E-10 # This value is an epsilon for 0. Hence, return all itemsets ordered by support



## --------------------------- Found combinations --------------------------- ##

# Generate data with the AB, ABCD, EF combinations, adding uniform noise
names = [str(i) for i in range(NB_SETS)]
x = test_data_for_modl(nflags = NFLAGS, number_of_sets = NB_SETS,
    noise = NOISE, cor_groups = CORR_GROUPS)

# Convert to list of transactions
transactions = matrix_to_list_of_transactions(x, names) 




## Run MODL

combi_miner = Modl(x, 
    multiple_overlap_target_combi_size = -1,                          # Optional: limit the size of the combinations
    multiple_overlap_max_number_of_combinations = N_QUERIED_ATOMS,    # How many words to find?
    nb_threads = N_THREADS,                                           # Full multithreading
    smother = True,                                                   # Reduce each row's abundance to its square root. Helps find rarer combinations but magnifies the noise.
    step_1_factor_allowance = 2,                                      # Optional: how many words to ask for in each step 1 rebuilding
    normalize_words = True,                                           # Normalize word sum of square in step 2
    step_2_alpha = None)                                              # Optional: override the sparsity control in step 2
modl_interesting_combis = combi_miner.find_interesting_combinations()


# Run MODL without smothering
x = test_data_for_modl(noise = NOISE_HIGH)
combi_miner = Modl(x, 
    multiple_overlap_max_number_of_combinations=N_QUERIED_ATOMS, 
    smother = False)
modl_interesting_combis_no_smother = combi_miner.find_interesting_combinations()


# Run MODL without word normalization
x = test_data_for_modl(noise = NOISE)
combi_miner = Modl(x, 
    multiple_overlap_max_number_of_combinations=N_QUERIED_ATOMS, 
    normalize_words = False)
modl_interesting_combis_not_normalized = combi_miner.find_interesting_combinations()


print("-- MODL INTERESTING COMBINATIONS --")
print("Standard =", modl_interesting_combis)
print("No smother, more noise =", modl_interesting_combis_no_smother)
print("No word normalization =", modl_interesting_combis_not_normalized)
print("-------------------------------")


## Run Apriori and FP-growth

# Apriori
myminer = Apriori(min_support = MIN_SUPPORT)
myminer.run_apriori(transactions)
results = myminer.produce_results()
apriori_results_df = apriori_results_to_matrix(results, names)

print("-- APRIORI ASSOCIATION RULES --")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(apriori_results_df)
print("-------------------------------")



# FP-Growth
from mlxtend.frequent_patterns import fpgrowth
x_as_dataframe = pd.DataFrame(x)
fpres = fpgrowth(x_as_dataframe, min_support=MIN_SUPPORT)



pd.set_option('display.max_rows', 1000)
print(fpres)


# Here, output the matrices to BED files
# from matrix_to_bed import intersection_matrix_to_bedfiles
# bedpaths = intersection_matrix_to_bedfiles(x, output_root)

# # Also print it to a tsv, and to a list of transactions
# np.savetxt(OUTPUT_ROOT+'artifmat.tsv', delimiter='\t')

# def format_list_brackets(alist): return '{'+','.join(alist)+'}'
# with open(OUTPUT_ROOT+'artiftransact_brackets.tsv', 'w+') as f:
#     for item in transactions:
#         f.write(format_list_brackets(item)+'\n')


# And in the specific SPMF format
with open(OUTPUT_ROOT+'artiftransact.txt', 'w+') as f:
    for item in transactions: f.write(' '.join(item)+'\n')



## ---------- Manual launch of other itemset miner(s)

## Run LCM
command_line = ["java", "-jar", EXT_PATH + "spmf.jar", # Java SPMF toolset
    "run", "LCM",                        # Run this algorithm
    OUTPUT_ROOT + "artiftransact.txt",   # Query file
    OUTPUT_ROOT + "output_lcm.txt",      # Output
    str(round(MIN_SUPPORT*100))+"%"      # Min support (parameter)
    ] 


# NOTE Remember that in subprocess.run, the command line must be a list of 
# arguments and not a string of the command
stdout_captured = subprocess.run(command_line, 
    stdout=subprocess.PIPE, universal_newlines=True).stdout





## Run CL-Max (approximate itemset miner)

warnings.filterwarnings('ignore') # Silence NumPy warnings for display

# Load it by running the code (kinda hacky but works for now)
exec(open(EXT_PATH + "cl-max.py").read())



# Run with a variety of parameters,
# especially the rounding threshold. The random seed also seemed to change the results a lot.
RANDOM_SEED = 1234
min_support = 0.1
for num_cluster in [3,6]:
    for rounding_threshold in [0.5,0.9]:

        print("-----------------")
        print("num_cluster =",num_cluster)
        print("rounding_threshold =",rounding_threshold)

        np.random.seed(RANDOM_SEED)
        random.seed(RANDOM_SEED)

        cl_max = CL_MAX(min_support, num_cluster, rounding_threshold)

        # Load dataset
        cl_max.dataset = pd.DataFrame(x)
        cl_max.sorted_items = names
        cl_max.transactions = [[int(i) for i in t] for t in transactions if t] # Remove empties
        cl_max.maximum = NB_SETS
        cl_max.remove_non_frequent_single_items()

        # Run the algorithm
        MFIs = cl_max._CL_MAX()
QUERY_PATH  = "input/as_ginom/query_som_trans_lung.bed"        # The query set
INCL_PATH   = "input/as_ginom/mappability_human.bed"           # Inclusion: the analysis will be limited to the sub-genome designated in this file. Regions outside of it will be discarded.
GENOME_PATH = "input/hg19.genome"                              # Chromosome sizes

# Reference sets
MORE_PATHS  = [
    "input/as_ginom/reference_1_hg19_Refseq_promoters_2000bp.bed",
    "input/as_ginom/reference_2_hg19_Refseq_gene_bodies.bed",
    "input/as_ginom/reference_3_Broad_A549_H3k04me3_Dex.bed",
    "input/as_ginom/reference_4_Broad_A549_H3k36me3_Dex.bed",
    "input/as_ginom/reference_5_Broad_A549_H3k79me2_Dex.bed"
]
# The meaning of the combinations returned will always be: the query, followed by the reference sets in the order they are given.
# For example, '[101010]' means "Query + Reference 2 (gene bodies) + Reference 4 (H3K36me3)", as illustrated below.
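# --- Illustration (added as an example, not used below): decoding such a flag string ---
# The labels are shorthand placeholders for the query and the five reference sets above.
_example_labels = ["Query", "Ref1", "Ref2", "Ref3", "Ref4", "Ref5"]
_example_decoded = [lab for flag, lab in zip("101010", _example_labels) if flag == "1"]
assert _example_decoded == ["Query", "Ref2", "Ref4"]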


## Parameters

# Subsampling parameters
KEEP_N_TIMES_QUERY = 100          # For each '1' line with the query, how many '0' lines without it do we keep?
QUERY_WEIGHT = 100                # In the Naive Bayes classifier, how much more do we weigh the presence of the query?

# MODL parameters
DESIRED_ITEMSETS = 7              # How many interesting combinations should MODL return?

RANDOM_SEED = 42

# ---------------------------------------------------------------------------- #

def presence_vector(row, combis):
    """
    Given a list of combinations, returns a vector saying whether they are
    present or absent in this row. The order in the vector is the same as given
    in the combis list.

    This works exactly like an inexact (transitive) count, which is OLOGRAM-MODL's default operating mode.

    Example:

    >>> row = [1,0,1,1,0,1]
    >>> combis = [1,0,1,0,0,0],[0,1,1,0,0,0]
    >>> res = presence_vector(row, combis)
    >>> assert list(res) == [1,0]

    """

    result = []
    for combi in combis:
        mask = row - combi

        # If any element from the combi is missing, it's absent, so
        # add a 0. Otherwise add a 1.
        if min(mask) < 0 : result +=[0]
        else: result += [1]

    return result


import pandas as pd
import numpy as np
import sklearn
import pybedtools
from sklearn.naive_bayes import GaussianNB

from pygtftk.stats.intersect.overlap_stats_compute import compute_true_intersection
from pygtftk.stats.intersect.overlap.overlap_regions import does_combi_match_query
from pygtftk.stats.intersect.modl.dict_learning import Modl
from pygtftk import utils
from pygtftk.stats.intersect.read_bed import read_bed_as_list as read_bed
from pygtftk.utils import chrom_info_as_dict
from pygtftk import arg_formatter

np.random.seed(RANDOM_SEED)  # Random seed for reproducibility



## Prepare files
bedA = pybedtools.BedTool(QUERY_PATH).sort().merge()
bedsB = [pybedtools.BedTool(bedfilepath).sort().merge() for bedfilepath in MORE_PATHS]


# Do the exclusion manually: generate a fake bed for the entire genome, using the chromosome sizes
bed_incl = pybedtools.BedTool(INCL_PATH)
chrom_len = chrom_info_as_dict(open(GENOME_PATH, 'r'))

full_genome_bed = [str(chrom) + '\t' + '0' + '\t' + str(chrom_len[chrom]) + '\n' for chrom in chrom_len if chrom != 'all_chrom']
full_genome_bed = pybedtools.BedTool(full_genome_bed)
bed_excl = full_genome_bed.subtract(bed_incl)

bedA = read_bed.exclude_concatenate(bedA, bed_excl)
bedsB = [read_bed.exclude_concatenate(bedB, bed_excl) for bedB in bedsB]
full_genome_bed_after_excl = read_bed.exclude_concatenate(full_genome_bed, bed_excl)


# Note that, by definition, this intersection matrix only contains regions where at
# least two sets are open. For example, {4} alone would not be found.
# To fix this, use a fake full-genome bed as the query, so there is always one file
# open, then truncate it to get the flags_matrix.
#true_intersection = compute_true_intersection(bedA, bedsB)
true_intersection = compute_true_intersection(full_genome_bed_after_excl, [bedA] + bedsB)
flags_matrix = np.array([i[3] for i in true_intersection])

"""
# NOTE : The length of each intersection is also accessible, in case you want
# the final matrix to have one row per X base pairs instead of 1 row per intersection/event.
# For reference, the true_intersection object looks like this: [('chr1', 150, 180, np.array([1,1,0])), ('chr1', 180, 200, np.array([1,1,1])), ('chr1', 350, 380, np.array([1,1,0]))]
for i in true_intersection: # For each observed intersection...
    inter_chr, inter_start, inter_stop = i[0], i[1], i[2]   # Get the chromosome, start and end of the observed intersection
    intersection_length = inter_stop - inter_start          # Deduce the length of the intersection
    intersection_flags = i[3]                               # Which combination was observed here?
    # Now you can process it.
"""

flags_matrix = flags_matrix[:,1:] # Remove first column that represents the fake, full genome BED.


# Print unique rows with transitive (inexact) counting
uniqueValues, occurCount = np.unique(flags_matrix, return_counts=True, axis=0)
for uniqueRow in uniqueValues:
    count = 0

    for row in flags_matrix:
        if does_combi_match_query(
            tuple(row), 
            tuple(uniqueRow), 
            exact = False):
            count += 1
    print(uniqueRow, ' - freq = ', count)
print("------------------")


# Perform subsampling of the majority class ('0', meaning query is absent)
flags_matrix_with_query = flags_matrix[flags_matrix[:, 0] != 0]
draw_this_many = int(np.around(KEEP_N_TIMES_QUERY * len(flags_matrix_with_query)))
random_rows = flags_matrix[np.random.randint(flags_matrix.shape[0], size=draw_this_many),:]
flags_matrix = np.concatenate((flags_matrix_with_query,random_rows))



# This is the custom error function that we will use
def custom_error_function_bnb(X_true, X_rebuilt, encoded, dictionary):
    """
    This is a non-pure function that only cares about the dictionary and discards X_true in 
    favor of the previous flags_matrix.
    """

    # Always assume that the first column is the query, so we split it.
    # Truncate the first column, with the query, since that is what we want to predict
    Y = flags_matrix[:,0]
    X = flags_matrix[:,1:]

    new_dictionary = []
    for atom in dictionary:
        new_dictionary += [atom[1:]]
    dictionary = np.array(new_dictionary)


    # Convert to a binary matrix giving, for each atom in the dictionary, 
    # whether it is present at this row
    X_bydict_bin = [presence_vector(row, dictionary) for row in X]
    X_bydict_bin = np.array(X_bydict_bin)

    # Fit a Gaussian Naive Bayes classifier on the data (its F1-score is computed below)
    bnb = GaussianNB()

    # Potential sample weights
    sample_weights = []
    for y in Y:
        if y == 1 : sample_weights += [QUERY_WEIGHT]
        else: sample_weights += [1]


    bnb.fit(X_bydict_bin, Y, sample_weight= sample_weights)
    Y_pred = bnb.predict(X_bydict_bin)

    # Compute F1-score
    score = sklearn.metrics.f1_score(Y, Y_pred)

    # print(pd.crosstab(Y,Y_pred))

    return -score # We want an error, higher means worse



## Run MODL with custom error function based on error of BNB
utils.VERBOSITY = 3 # Ensure DEBUG messages are shown


# Keep only combis WITH query (first element is not 0)
flags_matrix_with_query = flags_matrix[flags_matrix[:, 0] != 0]

combi_miner = Modl(flags_matrix_with_query,
    multiple_overlap_max_number_of_combinations = DESIRED_ITEMSETS,        # How many words to find?
    error_function = custom_error_function_bnb)                            # Custom error function in step 2
interesting_combis = combi_miner.find_interesting_combinations()
import numpy as np
np.random.seed(42)

import pandas as pd
from plotnine import ggplot, aes, geom_point, geom_line, geom_smooth, scale_x_continuous, scale_y_log10, xlab, ylab

import time

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils


# Debug : plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")

OUTPUT_ROOT = "output/benchmark/scaling/" # Hardcoded for now. It was necessary to launch this script in shell and not via snakemake.script to allow proper redirection of the log


## ---------------------------- Parameters ---------------------------------- ##
utils.VERBOSITY = 0 # We don't want to record DEBUG messages for these tests
REPEATS = range(5) # Repeat all operations N times to get the average

N_THREADS = 4

## MODL
SIZES = [6,9,12,18,24,30,40,50]  # Numbers of sets (columns)
STEPS = [3,5,8,10,12,15,18,20,22,25,30] # Numbers of queried atoms

# When varying only one parameter, which is the default for the others ?
DEFAULT_QUERIED_ATOMS = 8
DEFAULT_N_SETS = 8

# Data parameters
NFLAGS = 10000
NOISE = 0.1     # Previously 0.5


## ---------------------------- Time benchmarks ----------------------------- ##

## Number of sets
df_bench = pd.DataFrame(columns = ['nb_sets','time'])   # Prepare dataframe

for _ in REPEATS:

    for size in SIZES:
        X = test_data_for_modl(nflags = NFLAGS, number_of_sets = size, noise = NOISE)

        start_time = time.time()
        combi_miner = Modl(X, multiple_overlap_max_number_of_combinations = DEFAULT_QUERIED_ATOMS, nb_threads = N_THREADS)
        modl_interesting_combis = combi_miner.find_interesting_combinations()
        stop_time = time.time()

        df_bench = df_bench.append({'nb_sets':size, 'time': stop_time-start_time}, ignore_index = True)

df_bench['nb_sets'] = df_bench['nb_sets'].astype(int)
p = (ggplot(df_bench) + aes('nb_sets', 'time')
 + geom_point() + geom_smooth(span=.3) + scale_x_continuous()
 + xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig1")


## Number of queried words
df_bench = pd.DataFrame(columns = ['step','time'])

X = test_data_for_modl(nflags = NFLAGS, number_of_sets = DEFAULT_N_SETS, noise = NOISE)

for _ in REPEATS:
    for step in STEPS:

        start_time = time.time()
        combi_miner = Modl(X, multiple_overlap_max_number_of_combinations = step, nb_threads = N_THREADS)
        modl_interesting_combis = combi_miner.find_interesting_combinations()
        stop_time = time.time()

        df_bench = df_bench.append({'step':step, 'time': stop_time-start_time}, ignore_index = True)

df_bench['step'] = df_bench['step'].astype(int)
p = (ggplot(df_bench) + aes('step', 'time')
 + geom_point() + geom_smooth(span=.3) + scale_x_continuous()
 + xlab("Queried nb. of atoms") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig2")
import numpy as np
np.random.seed(42)

import pandas as pd
from plotnine import ggplot, aes, geom_point, geom_line, geom_smooth, scale_x_continuous, scale_y_log10, xlab, ylab

import time

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils

from mlxtend.frequent_patterns import fpgrowth, apriori

# Debug : plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")

OUTPUT_ROOT = "output/benchmark/scaling/" # Hardcoded for now. It was necessary to launch this script in shell and not via snakemake.script to allow proper redirection of the log


## ---------------------------- Parameters ---------------------------------- ##
utils.VERBOSITY = 0 # We don't want to record debug messages for these tests
REPEATS = range(5) # Repeat all operations N times to get the average

## Elementary operation (DL) vs other itemset miners

# Number of words, and 1/min_support for the comparisons
SCALING_FACTORS =  [1,2,5,10,25,30,40,50,75,100,150,200,250,300,350,400,500]    

# Data generation parameters
NOISE = 0.5
NLINES = 5000
NSETS = 13

# DL parameters
ALPHA = 0.5

## -------------- Elementary operation benchmark : apriori vs fpgrowth vs dict learning

## Support and number of queried words
df_bench = pd.DataFrame(columns = ['scaling_factor','algo','time'])   # Prepare df

for _ in REPEATS:

    for k in SCALING_FACTORS:

        # NOTE Scaling factor of k means:
        # - Apriori and FP-growth will have min support of 1/k
        # - Dictionary learning will learn k atoms
        current_min_support = 1/k

        X = test_data_for_modl(nflags = NLINES, number_of_sets = NSETS, noise = NOISE)
        names = [str(i) for i in range(X.shape[1])]
        transactions = matrix_to_list_of_transactions(X, names)
        X_as_dataframe = pd.DataFrame(X)

        # Apriori - my pure Python implementation
        start_time = time.time()
        myminer = Apriori(min_support = current_min_support)
        myminer.run_apriori(transactions)
        results = myminer.produce_results()
        stop_time = time.time()

        df_bench = df_bench.append({'scaling_factor':k, 'algo':'apriori_pure_python', 'time': stop_time-start_time}, ignore_index = True)  


        # Apriori 
        # It seems this particular apriori implementation reserves a very large amount of memory when support is very low, so cap it.
        if current_min_support >= 0.02: 
            start_time = time.time()
            apriori(X_as_dataframe, min_support = current_min_support)
            stop_time = time.time()

            df_bench = df_bench.append({'scaling_factor':k, 'algo':'apriori', 'time': stop_time-start_time}, ignore_index = True)  

        # FP-Growth
        start_time = time.time()
        result = fpgrowth(X_as_dataframe, min_support=current_min_support)
        stop_time = time.time()

        df_bench = df_bench.append({'scaling_factor':k, 'algo':'fpgrowth', 'time': stop_time-start_time}, ignore_index = True)  

        # Dict learning
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X, n_atoms = k, alpha = ALPHA, n_jobs = 1)
        stop_time = time.time()

        df_bench = df_bench.append({'scaling_factor':k, 'algo':'DL', 'time': stop_time - start_time}, ignore_index = True)   


df_bench['scaling_factor'] = df_bench['scaling_factor'].astype(int)
p = (ggplot(df_bench) + aes('scaling_factor', 'time', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
 + xlab("Scaling factor (k)") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig3")

p = (ggplot(df_bench) + aes('scaling_factor', 'time', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
 + xlab("Scaling factor (k)") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig3_log10")


# Normalized time to scaling factor of 1
minimum_time = df_bench[df_bench['scaling_factor']==1][['algo','time']]
df_bench['time_relative'] = df_bench['time'] # Placeholder
for index, row in df_bench.iterrows():
    my_algo = row['algo']
    my_minimum_time = pd.to_numeric(minimum_time.loc[minimum_time['algo'] == my_algo]['time']).min()
    df_bench.at[index,'time_relative'] = row['time']/my_minimum_time

p = (ggplot(df_bench) + aes('scaling_factor', 'time_relative', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
 + xlab("Scaling factor (k)") + ylab("Time (relative)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig3_relative")


p = (ggplot(df_bench) + aes('scaling_factor', 'time_relative', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
 + xlab("Scaling factor (k)") + ylab("Time (relative)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig3_relative_log10")
import numpy as np
np.random.seed(42)

import pandas as pd
from plotnine import ggplot, aes, geom_point, geom_line, geom_smooth, scale_x_continuous, scale_y_log10, xlab, ylab

import time

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils


from mlxtend.frequent_patterns import fpgrowth, apriori

# Debug : plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")

OUTPUT_ROOT = "output/benchmark/scaling/" # Hardcoded for now. It was necessary to launch this script in shell and not via snakemake.script to allow proper redirection of the log



## ---------------------------- Parameters ---------------------------------- ##
utils.VERBOSITY = 0 # We don't want to record debug messages for these tests
REPEATS = range(5) # Repeat all operations N times to get the average

## Elementary operation (DL) vs other itemset miners

# Number of sets (columns), minimum of 6
SETS_NB = [6,7,8,9,10,11,12,13,14,16,18,20,22,24,26]    

# Data generation parameters
NOISE = 0.5
LOW_NOISE = 0.01
N_FLAGS = 5000

# DL parameters
ALPHA = 0
N_ATOMS = 120

# Other algorithms' parameters
MIN_SUPPORT = 1/100 

## -------------- Elementary operation benchmark : apriori vs fpgrowth vs dict learning



## Number of sets
df_bench = pd.DataFrame(columns = ['set_nb','algo','time'])   # Prepare df

for _ in REPEATS:

    for size in SETS_NB:

        # Generate data
        X = test_data_for_modl(nflags = N_FLAGS, number_of_sets = size, noise = NOISE)
        names = [str(i) for i in range(X.shape[1])]
        transactions = matrix_to_list_of_transactions(X, names)
        X_as_dataframe = pd.DataFrame(X)

        X_low_noise = test_data_for_modl(nflags = N_FLAGS, number_of_sets = size, noise = LOW_NOISE)
        X_noiseless = test_data_for_modl(nflags = N_FLAGS, number_of_sets = size, noise = 0)

        # Apriori
        # Cap it at the largest number of sets that is still reasonable to run
        if size <= 15:
            start_time = time.time()
            myminer = Apriori(min_support = MIN_SUPPORT)
            myminer.run_apriori(transactions)
            results = myminer.produce_results()
            stop_time = time.time()

            df_bench = df_bench.append({'set_nb':size, 'algo':'apriori_pure_python', 'time': stop_time-start_time}, ignore_index = True)   


        # Apriori
        # NOTE: this implementation has high RAM cost for large data or low min_support, be careful
        if size <= 15:
            start_time = time.time()
            result = apriori(X_as_dataframe, min_support = MIN_SUPPORT)
            stop_time = time.time()
            df_bench = df_bench.append({'set_nb':size, 'algo':'apriori', 'time': stop_time-start_time}, ignore_index = True)  


        # FP-Growth
        start_time = time.time()
        result = fpgrowth(X_as_dataframe, min_support = MIN_SUPPORT)
        stop_time = time.time()

        df_bench = df_bench.append({'set_nb':size, 'algo':'fpgrowth', 'time': stop_time-start_time}, ignore_index = True)  


        # Dict learning
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X, n_atoms = N_ATOMS, alpha = ALPHA, n_jobs = 1)
        stop_time = time.time()

        df_bench = df_bench.append({'set_nb':size, 'algo':'DL', 'time': stop_time - start_time}, ignore_index = True)   


        # Dict learning - low noise data
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X_low_noise, n_atoms = N_ATOMS, alpha = ALPHA, n_jobs = 1)
        stop_time = time.time()

        df_bench = df_bench.append({'set_nb':size, 'algo':'DL_low_noise_data', 'time': stop_time - start_time}, ignore_index = True)  


        # Dict learning - noiseless data
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X_noiseless, n_atoms = N_ATOMS, alpha = ALPHA, n_jobs = 1)
        stop_time = time.time()

        df_bench = df_bench.append({'set_nb':size, 'algo':'DL_noiseless_data', 'time': stop_time - start_time}, ignore_index = True)  



df_bench['set_nb'] = df_bench['set_nb'].astype(int)
p = (ggplot(df_bench) + aes('set_nb', 'time', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
 + xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig4")

p = (ggplot(df_bench) + aes('set_nb', 'time', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
 + xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig4_log10")


# Normalized time to minimum number of sets
min_nb_sets = min(SETS_NB)
minimum_time = df_bench[df_bench['set_nb'] == min_nb_sets][['algo','time']]
df_bench['time_relative'] = df_bench['time'] # Placeholder
for index, row in df_bench.iterrows():
    my_algo = row['algo']
    my_minimum_time = pd.to_numeric(minimum_time.loc[minimum_time['algo'] == my_algo]['time']).min()
    df_bench.at[index,'time_relative'] = row['time']/my_minimum_time

p = (ggplot(df_bench) + aes('set_nb', 'time_relative', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
 + xlab("Number of sets") + ylab("Time (relative)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig4_relative")


p = (ggplot(df_bench) + aes('set_nb', 'time_relative', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
 + xlab("Number of sets") + ylab("Time (relative)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig4_relative_log10")
import numpy as np
np.random.seed(42)

import pandas as pd
from plotnine import ggplot, aes, geom_point, geom_line, geom_smooth, scale_x_continuous, scale_y_log10, xlab, ylab

import time

from pygtftk.stats.intersect.modl.dict_learning import Modl, test_data_for_modl
from pygtftk.stats.intersect.modl.apriori import Apriori, matrix_to_list_of_transactions, apriori_results_to_matrix
from pygtftk.stats.intersect.modl.subroutines import learn_dictionary_and_encode
from pygtftk import utils

from mlxtend.frequent_patterns import fpgrowth, apriori

# Debug : plots are not shown, only printed, so use Agg backend for matplotlib
import matplotlib
matplotlib.use("Agg")

OUTPUT_ROOT = "output/benchmark/scaling/" # Hardcoded for now. It was necessary to launch this script in shell and not via snakemake.script to allow proper redirection of the log

## ---------------------------- Parameters ---------------------------------- ##
utils.VERBOSITY = 0 # We don't want to record debug messages for these tests
REPEATS = range(5) # Repeat all operations N times to get the average


## Elementary operation (DL) vs other itemset miners

# Number of lines (in thousands)
LINES_NB = [1,2,5,8,10,15,20,25,40,50]
NB_SETS = 15  

# Data generation parameters
NOISE = 0.5

# DL parameters
ALPHA = 0
N_ATOMS = 120

# Other algo parameters
MIN_SUPPORT = 1/100


## -------------- Elementary operation benchmark : apriori vs fpgrowth vs dict learning

## Number of lines
df_bench = pd.DataFrame(columns = ['lines','algo','time'])   # Prepare df

for _ in REPEATS:

    for lines in LINES_NB:

        true_line_nb = lines * 1000

        # Generate data
        # The number of lines is given in thousands
        X = test_data_for_modl(nflags = true_line_nb, number_of_sets = NB_SETS, noise = NOISE)
        names = [str(i) for i in range(X.shape[1])]
        transactions = matrix_to_list_of_transactions(X, names)
        X_as_dataframe = pd.DataFrame(X)

        # Apriori
        start_time = time.time()
        myminer = Apriori(min_support = MIN_SUPPORT)
        myminer.run_apriori(transactions)
        results = myminer.produce_results()
        stop_time = time.time()

        df_bench = df_bench.append({'lines':lines, 'algo':'apriori_pure_python', 'time': stop_time-start_time}, ignore_index = True)   


        # Apriori
        # NOTE: this implementation has high RAM cost for large data or low min_support, be careful with scaling
        if true_line_nb <= 20000:
            start_time = time.time()
            result = apriori(X_as_dataframe, min_support = MIN_SUPPORT)
            stop_time = time.time()
            df_bench = df_bench.append({'lines':lines, 'algo':'apriori', 'time': stop_time-start_time}, ignore_index = True)  


        # FP-Growth
        start_time = time.time()
        result = fpgrowth(X_as_dataframe, min_support = MIN_SUPPORT)
        stop_time = time.time()

        df_bench = df_bench.append({'lines':lines, 'algo':'fpgrowth', 'time': stop_time-start_time}, ignore_index = True)  


        # Dict learning
        start_time = time.time()
        U_df, V_df, error = learn_dictionary_and_encode(X, n_atoms = N_ATOMS, alpha = ALPHA, n_jobs = 1)
        stop_time = time.time()

        df_bench = df_bench.append({'lines':lines, 'algo':'DL', 'time': stop_time - start_time}, ignore_index = True)   


df_bench['lines'] = df_bench['lines'].astype(int)
p = (ggplot(df_bench) + aes('lines', 'time', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
 + xlab("Number of lines") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig5")

p = (ggplot(df_bench) + aes('lines', 'time', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous() + scale_y_log10()
 + xlab("Number of lines") + ylab("Time (seconds)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig5_log10")


# Normalized time to minimum line number
min_nb_lines = min(LINES_NB)
minimum_time = df_bench[df_bench['lines'] == min_nb_lines][['algo','time']]
df_bench['time_relative'] = df_bench['time'] # Placeholder
for index, row in df_bench.iterrows():
    my_algo = row['algo']
    my_minimum_time = pd.to_numeric(minimum_time.loc[minimum_time['algo'] == my_algo]['time']).min()
    df_bench.at[index,'time_relative'] = row['time']/my_minimum_time

p = (ggplot(df_bench) + aes('lines', 'time_relative', color='algo', group='algo')
 + geom_point() + geom_smooth(method='gpr', span=.3) + scale_x_continuous()
 + xlab("Number of lines") + ylab("Time (relative)"))
p.save(filename = OUTPUT_ROOT + "scaling_fig5_relative")
args <- commandArgs(trailingOnly = TRUE)
this_dir <- toString(args[1])

setwd(this_dir)
print(getwd())


# ---------------------------------------------------------------------------- #

# Install packages
allpkgs <- c("UpSetR", "dplyr", "reshape2", "patchwork", "latex2exp")
for (p in allpkgs){
  if (!requireNamespace(p)){ install.packages(c(p), repos="http://cran.us.r-project.org", dependencies = TRUE) }
}

library('UpSetR')
require(ggplot2)
require(plyr)
require(dplyr)
require(gridExtra)
require(grid)
library(stringr)
library(RColorBrewer)
library(reshape2)
library(patchwork)
library(latex2exp)

d <- read.table("all_ologram_results.txt", head=T, sep="\t")

# Compute -log10(pval)
d$summed_bp_overlaps_pvalue[d$summed_bp_overlaps_pvalue == 0] <- 1e-320
d$log10_pval <- -log10(d$summed_bp_overlaps_pvalue)


## Extract ("regexp with capture '()'") the query name
d$query <- str_match(d$query, "-([0-9A-Za-z]+)/00_ologram.*")[,2]

## Extract the combination
tmp <- lapply(strsplit(as.character(d$feature_type), "\\+"), str_match, "_([^_]+)_")

# Quick hack
tmp <- lapply(tmp, "[", i=,j=2)
tmp <- lapply(lapply(lapply(tmp, "[", -1), rev), "[", -1)

all_features <- unique(unlist(tmp))
feature_mat <- matrix(0, nrow=nrow(d), ncol=length(all_features))
colnames(feature_mat) <- all_features
for(i in 1:length(tmp)){
  for(j in tmp[[i]]){
    feature_mat[i, j] <- 1
  }
}

## Prepare a binary matrix with TF as column and combi as row 
feature_mat <- as.data.frame(feature_mat)
feature_mat$degree <- rowSums(feature_mat) + 1
d <- cbind(d, as.matrix(feature_mat))

## Let's plot
d$color <-as.character(factor(d$query, levels=brewer.pal(3,"Paired")))

d$combination <- 1:nrow(d)

dm <- melt(data = d, id.vars = c("degree",
                                 "summed_bp_overlaps_true",
                                 "summed_bp_overlaps_pvalue",
                                 "query",
                                 "combination",
                                 "log10_pval",
                                 "summed_bp_overlaps_log2_fold_change"),
     measure.vars=all_features)
dm$value[dm$value == 0] <- NA
dm$Factors <- dm$variable



## Order combination levels by *summed basepairs in the true overlaps*
dm$combination <- factor(dm$combination, ordered = T, levels = unique(as.character(dm$combination[order(dm$summed_bp_overlaps_true, decreasing = T)])))

dm$Factors <- factor(dm$Factors, levels=c("Ctcf", "Rad21", "Smc1a", "Smc3",
                                            "Irf1", "Irf9", "Stat1", "Stat6",
                                            "Nanog","Pou5f1","Klf4", "Sox2"), order=T)


### Point colors
dm$Dataset <- as.character(dm$variable)
dm$Dataset[dm$Dataset %in% c("Ctcf", "Rad21", "Smc1a", "Smc3")] <- "Insulators"
dm$Dataset[dm$Dataset %in% c("Irf1", "Irf9", "Stat1", "Stat6")] <- "Interferon\nresponse"
dm$Dataset[dm$Dataset %in% c("Nanog","Pou5f1","Klf4", "Sox2")] <- "Stem cells"


### Take the n best combi for each query
n_best <- 20

dm %>% 
  arrange(desc(query)) %>% 
  arrange(desc(summed_bp_overlaps_true)) %>%
  group_by(query) %>% 
  slice(1:(n_best*length(all_features)-1)) %>% 
  ungroup() -> dm_sub

### Order queries
dm_sub$query <- factor(dm_sub$query, levels = c("Nanog", "Ctcf", "Irf1"),
                   ordered=TRUE)

### Plot
p1 <- ggplot(dm_sub, aes(y = Factors, x=combination,  fill=Dataset)) +
    geom_point(aes(size=value), guide='none', shape=22, color="white") +
    theme_bw() +
    scale_size(guide="none") +
    theme(axis.text.x = element_text(angle=0, size=7),
          axis.text.y = element_text(size=7),
          legend.key.height = unit(1,"line"),
          axis.title.y = element_text(size=8)) +
    scale_color_manual(values=c("#D0AB58", "#3F4EF3", "#A63965")) +
    scale_fill_manual(values=c("#D0AB58", "#3F4EF3", "#A63965")) +
    facet_wrap(~query, scale="free_x") +
    scale_x_discrete(name ="Combinations",
                     labels=c(1:n_best)) 

p2 <- ggplot(dm_sub,aes(x=combination, y = log10_pval, fill= factor(degree) )) +
  geom_bar(stat="identity", position="dodge") +
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.text.y = element_text(size=8),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        strip.text = element_blank(),
        strip.background = element_blank(),
        axis.title.y = element_text(size=8)) +
  scale_fill_manual(name="Combi. order", values=c("#FFC30F", "#1D9A6C", "#104765", "#08062C")) +
  facet_wrap(~query, scale="free_x", strip.position = "bottom")  

p3 <- ggplot(dm_sub,aes(x=combination, y = summed_bp_overlaps_log2_fold_change, fill = factor(degree) )) +
  geom_bar(stat="identity", position="dodge") +
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        strip.background = element_blank(),
        strip.text = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none",
        axis.title.y = element_text(size=8)) +
  scale_fill_manual(values=c("#FFC30F", "#1D9A6C", "#104765", "#08062C")) +
  facet_wrap(~query, scale="free_x", strip.position = "bottom") +
  ylab(TeX('$log2(\\sum bp_{obs}/ \\sum bp_{sim})$'))  


# Save the combined plot
combined_plot <- p3 + p2 + p1 + plot_layout(ncol = 1, heights = c(1,1,2))
ggsave("murine_fig.png", plot = combined_plot, width = 12, height = 8, dpi = 240)
this.dir = "./output/sc_atac_seq_pbmc_data/"
setwd(this.dir)
print(getwd())


## Library
if (!requireNamespace("BiocManager", quietly = TRUE)){
  install.packages("BiocManager", repos="http://cran.us.r-project.org", dependencies = TRUE)
}
BiocManager::install()
setRepositories(ind=1:5) # To automatically install Bioconductor dependencies

# Install Signac
if (!requireNamespace("Signac")){
  BiocManager::install(c("GenomeInfoDbData", 'BSgenome.Hsapiens.UCSC.hg19', 'EnsDb.Hsapiens.v75','BSgenome.Hsapiens.UCSC.hg38', 'EnsDb.Hsapiens.v86'))

  BiocManager::install(c("ggbio","chromVAR","TFBSTools","motifmatchr"))

  install.packages("Signac", repos="http://cran.us.r-project.org", dependencies = TRUE)
}


library(Signac)
library(Seurat)
library(GenomeInfoDb)
library(EnsDb.Hsapiens.v75)
library(ggplot2)
library(patchwork)

library(parallel)

set.seed(1234)



## Download and read the data from Signac PBMC vignette

# Download all
options(timeout = 3600)


# Counts
destfile = "atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5"
if (!file.exists(destfile)) { download.file("https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5", destfile = destfile) }
counts <- Read10X_h5(filename = "atac_v1_pbmc_10k_filtered_peak_bc_matrix.h5")
# From the vignette: each row of the matrix represents a region of the genome (a peak) predicted to be a region of open chromatin.
# Each value in the matrix is the number of Tn5 integration sites for each single barcode (i.e. a cell) that map within each peak.
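# Quick sanity check (illustrative; assumes `counts` as read above):
# rows are peaks, columns are cell barcodes
print(dim(counts))
print(counts[1:3, 1:2])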

# Metadata
destfile = "atac_v1_pbmc_10k_singlecell.csv"
if (!file.exists(destfile)) { download.file("https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_singlecell.csv", destfile = destfile) }
metadata <- read.csv(file = "atac_v1_pbmc_10k_singlecell.csv",
  header = TRUE, row.names = 1)

# Fragment file and index
destfile = "atac_v1_pbmc_10k_fragments.tsv.gz"
if (!file.exists(destfile)) { download.file("https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz", destfile = destfile) }
destfile = "atac_v1_pbmc_10k_fragments.tsv.gz.tbi"
if (!file.exists(destfile)) { download.file("https://cf.10xgenomics.com/samples/cell-atac/1.0.1/atac_v1_pbmc_10k/atac_v1_pbmc_10k_fragments.tsv.gz.tbi", destfile = destfile) }
# From the vignette: this is the full list of all unique fragments across all single cells (not just those associated with peaks)


destfile = "pbmc_10k_v3.rds"
if (!file.exists(destfile)) { download.file("https://www.dropbox.com/s/zn6khirjafoyyxl/pbmc_10k_v3.rds?dl=1", destfile = destfile) }


## Create Signac objects
chrom_assay <- CreateChromatinAssay(
  counts = counts, sep = c(":", "-"),  genome = 'hg19',
  fragments = "atac_v1_pbmc_10k_fragments.tsv.gz",
  min.cells = 10, min.features = 200
)

pbmc <- CreateSeuratObject(
  counts = chrom_assay,
  assay = "peaks",
  meta.data = metadata
)

# Add gene annotations
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)
seqlevelsStyle(annotations) <- 'UCSC' # change to UCSC style since the data was mapped to hg19
genome(annotations) <- "hg19"
Annotation(pbmc) <- annotations





# NOTE: to get the scATAC-seq peak data, use `pbmc[['peaks']]`
# To get the associated genomic ranges, use `granges(pbmc)`
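# Illustration of these accessors (a minimal sketch, not used downstream):
peaks_assay  <- pbmc[['peaks']]   # ChromatinAssay holding the peak x cell counts
peaks_ranges <- granges(pbmc)     # GRanges giving the coordinates of each peak
print(head(peaks_ranges))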

## Quality control

pbmc <- NucleosomeSignal(object = pbmc) # compute nucleosome signal score per cell
pbmc <- TSSEnrichment(object = pbmc, fast = FALSE) # compute TSS enrichment score per cell
# add blacklist ratio and fraction of reads in peaks
pbmc$pct_reads_in_peaks <- pbmc$peak_region_fragments / pbmc$passed_filters * 100
pbmc$blacklist_ratio <- pbmc$blacklist_region_fragments / pbmc$peak_region_fragments
pbmc$high.tss <- ifelse(pbmc$TSS.enrichment > 2, 'High', 'Low')
pbmc$nucleosome_group <- ifelse(pbmc$nucleosome_signal > 4, 'NS > 4', 'NS < 4')
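
# Optional QC visualization (a sketch following the standard Signac vignette;
# the resulting figure is not used by the workflow):
qc_violins <- VlnPlot(
  object = pbmc,
  features = c('pct_reads_in_peaks', 'peak_region_fragments',
               'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'),
  pt.size = 0.1, ncol = 5
)
# print(qc_violins)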

# Remove outliers
pbmc <- subset(x = pbmc,
  subset = peak_region_fragments > 3000 &
    peak_region_fragments < 20000 &
    pct_reads_in_peaks > 15 &
    blacklist_ratio < 0.05 &
    nucleosome_signal < 4 &
    TSS.enrichment > 2
)

## Perform normalization+feature selection, and then UMAP cluster computation
pbmc <- RunTFIDF(pbmc)
pbmc <- FindTopFeatures(pbmc, min.cutoff = 'q0')
pbmc <- RunSVD(pbmc)

pbmc <- RunUMAP(object = pbmc, reduction = 'lsi', dims = 2:30)
pbmc <- FindNeighbors(object = pbmc, reduction = 'lsi', dims = 2:30)
pbmc <- FindClusters(object = pbmc, verbose = FALSE, algorithm = 3)

#DimPlot(object = pbmc, label = TRUE) + NoLegend()


# Gene activity matrix, using chromatin accessibility as proxy for activity
gene.activities <- GeneActivity(pbmc)
pbmc[['RNA']] <- CreateAssayObject(counts = gene.activities)
pbmc <- NormalizeData(object = pbmc,
  assay = 'RNA',  normalization.method = 'LogNormalize',
  scale.factor = median(pbmc$nCount_RNA)
)
DefaultAssay(pbmc) <- 'RNA'
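
# Optional look at the gene activity scores (illustrative sketch; marker genes
# taken from the Signac PBMC vignette, not prescribed by this workflow):
activity_plot <- FeaturePlot(
  object = pbmc,
  features = c('MS4A1', 'CD3D', 'LEF1', 'NKG7', 'TREM1', 'LYZ'),
  pt.size = 0.1, max.cutoff = 'q95', ncol = 3
)
# print(activity_plot)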


## Load names from transversal scRNA-Seq study

pbmc_rna <- readRDS("pbmc_10k_v3.rds") # Load the pre-processed scRNA-seq data for PBMCs

transfer.anchors <- FindTransferAnchors(
  reference = pbmc_rna, query = pbmc,
  reduction = 'cca'
)
predicted.labels <- TransferData(
  anchorset = transfer.anchors, refdata = pbmc_rna$celltype,
  weight.reduction = pbmc[['lsi']], dims = 2:30
)

pbmc <- AddMetaData(object = pbmc, metadata = predicted.labels)

#plot1 <- DimPlot( object = pbmc_rna, group.by = 'celltype',label = TRUE,repel = TRUE) + NoLegend() + ggtitle('scRNA-seq')
#plot2 <- DimPlot( object = pbmc, group.by = 'predicted.labels',label = TRUE, repel = TRUE) + NoLegend() + ggtitle('scATAC-seq')
#plot1 + plot2


# Change names
pbmc <- subset(pbmc, idents = 14, invert = TRUE)
pbmc <- RenameIdents(
  object = pbmc,
  '0' = 'CD14 Mono',
  '1' = 'CD4 Memory',
  '2' = 'CD8 Effector',
  '3' = 'CD4 Naive',
  '4' = 'CD14 Mono',
  '5' = 'DN T',
  '6' = 'CD8 Naive',
  '7' = 'NK CD56Dim',
  '8' = 'pre-B',
  '9' = 'CD16 Mono',
  '10' = 'pro-B',
  '11' = 'DC',
  '12' = 'NK CD56bright',
  '13' = 'pDC'
)


DefaultAssay(pbmc) <- 'peaks' # Important, change back to working with peaks instead of gene activities


## Find differentially accessible peaks between clusters
## May be useful to create a restriction for shuffling

# da_peaks <- FindMarkers(
#   object = pbmc,
#   ident.1 = "CD4 Naive",
#   ident.2 = "CD14 Mono",
#   min.pct = 0.2,
#   test.use = 'LR',
#   latent.vars = 'peak_region_fragments'
# )


# Translation between the peak matrix's column names (cell barcodes) and cell id + predicted type
translation = data.frame(class = pbmc$predicted.id, id = pbmc$cell_id)
write.table(translation, file="cell_to_class.tsv", sep = '\t')

## Convert to BED

ifelse(!dir.exists("bed"), dir.create("bed"), FALSE) # Create directory




# For each cell (i.e. tag)...
n_tags = ncol(pbmc[['peaks']])
#for (j in 1:n_tags) {}

# Helper to print progress from parallel workers: output from mclapply() forks is
# not always displayed by the R front-end, so echo through the shell instead
message_parallel <- function(...){
  system(sprintf('echo "\n%s\n"', paste0(..., collapse="")))
}

signal_to_bed = function(j){

  # Get the vector of signal for this cell, giving the signal or absence thereof
  # at each candidate region
  signal_this_tag = data.frame(
    pbmc[['peaks']][1:nrow(pbmc[['peaks']]), j], check.names = FALSE
    )

  # Get the tag's id and predicted class
  tag = colnames(signal_this_tag)
  tag_id = translation[tag,"id"]
  tag_class = translation[tag,"class"]
  tag_class = chartr(" ", "_", tag_class)

  # Open a BED file
  bed_dir = paste("bed/",tag_class,"/", sep = '')
  bedpath = paste(bed_dir,tag_id,".bed", sep = '')

  ifelse(!dir.exists(bed_dir), dir.create(bed_dir), FALSE)

  # For each region (row), write it to the BED file if detected in this cell
  for(i in 1:nrow(signal_this_tag)) {
    signal = signal_this_tag[i,1]

    # Since we work on the peak matrix and not on the fragments themselves,
    # I binarize the count values (a vectorized sketch of this conversion is
    # given just after this function)
    if(signal > 0){
      region = rownames(signal_this_tag)[i]
      region_list = strsplit(region, split='-', fixed=TRUE)[[1]]
      line = paste(region_list[1],'\t',region_list[2],'\t',region_list[3], sep = '')
      write(line, file = bedpath, append = TRUE)
    }
  }

  message_parallel("Cell tag ", j, "/", n_tags, " complete.")

  return(TRUE)
}
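
# Vectorized sketch of the same conversion (an illustrative alternative, not used
# below; it assumes the same `pbmc` and `translation` objects as signal_to_bed):
signal_to_bed_vectorized = function(j){

  signal_this_tag = data.frame(
    pbmc[['peaks']][1:nrow(pbmc[['peaks']]), j], check.names = FALSE
    )

  tag = colnames(signal_this_tag)
  tag_class = chartr(" ", "_", translation[tag, "class"])
  bed_dir = paste("bed/", tag_class, "/", sep = '')
  ifelse(!dir.exists(bed_dir), dir.create(bed_dir), FALSE)

  # Keep only the regions detected in this cell and split "chr-start-end" once
  detected = rownames(signal_this_tag)[signal_this_tag[, 1] > 0]
  if (length(detected) > 0) {
    coords = do.call(rbind, strsplit(detected, split = '-', fixed = TRUE))
    write.table(coords, file = paste(bed_dir, translation[tag, "id"], ".bed", sep = ''),
                sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
  }

  return(TRUE)
}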

res = mclapply(1:n_tags, signal_to_bed, mc.cores = 4)




# One BED file with all regions
bedpath_all = "bed/allmerged.bed" 
all_regions = rownames(pbmc[['peaks']])
for (region in all_regions) {
  region_list = strsplit(region, split='-', fixed=TRUE)[[1]]
  line = paste(region_list[1],'\t',region_list[2],'\t',region_list[3], sep = '')
  write(line, file = bedpath_all, append = TRUE)
}






## ------ Final step: cell selection
# Select 30 cells from CD14+ monocytes, 15 from CD4 naive, 15 from CD8 naive
# and 15 from pre-B cells (see the calls below); set randomize = TRUE in
# draw_for_this_type() to use simple random sampling instead of listing order

ifelse(!dir.exists("bed_selected"), dir.create("bed_selected"), FALSE)

draw_for_this_type = function(cell_type, size, randomize = FALSE){

  # Get file list with full path
  dirpath = paste("bed/", cell_type, sep = "")
  files <- list.files(dirpath, full.names = TRUE)

  # Select the desired number of files by simple random sampling,
  # or just take the first N (in listing order) if `randomize` is FALSE
  if (randomize) {
    files2analyse <- files[sample(seq_along(files))]
  } else {
    files2analyse <- files
  }

  files2analyse <- files2analyse[1:size]

  # Copy the selected files
  for(i in seq_along(files2analyse)){
    file.copy(from = files2analyse[i], to = "bed_selected/")
  }
}

draw_for_this_type("CD14+_Monocytes", 30, FALSE)
draw_for_this_type("CD4_Naive", 15, FALSE)
draw_for_this_type("CD8_Naive", 15, FALSE)
draw_for_this_type("pre-B_cell", 15, FALSE)
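
# Quick check of the selection (illustrative; counts follow from the calls above):
length(list.files("bed_selected"))   # expected: 30 + 15 + 15 + 15 = 75 files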
shell: """
    # Create incl file by merging all BEDs
    cat input/mcf7/*.bed | bedtools sort | bedtools merge > {output.incl}

    # Build an extended incl file that also includes the additional MCF7 BEDs
    cat input/mcf7_additional/*.bed > input/extended_all_mcf7.bed
    # Drop the trailing three columns; awk re-joins fields with spaces, converted back to tabs below
    awk -F"\t" 'NF{{NF-=3}};3' < input/extended_all_mcf7.bed > input/extended_all_mcf7_3col.bed
    cat {output.incl} input/extended_all_mcf7_3col.bed | sed -e 's/ /\t/g' | bedtools sort | bedtools merge > {output.incl_extended}

    # Once done, move future query
    mv input/mcf7/foxa1.bed {output.query}
"""
shell: """
    gunzip input/*.gz
    gunzip input/*/*.gz    
"""
SnakeMake From line 110 of master/Snakefile
shell: """
mkdir -p output/artificial_data

# Generate query
bedtools random -n $((3*{params.size})) -l $(({params.length})) -seed 123456 -g input/hg38.genome > {output.query}

# Generate A and B
head -n $(({params.size})) {output.query} > {output.a}
bedtools shuffle -i {output.a} -incl {output.query} -excl {output.a} -g input/hg38.genome -noOverlapping -seed 654321 > {output.b}
# Do not put all regions of query into A and B

# Ensure the query, A and B share at least one identical line so they are all displayed
echo "chr1	1	2	0	1	-" >> {output.a}
echo "chr1	1	2	0	1	-" >> {output.b}
echo "chr1	1	2	0	1	-" >> {output.query}

cp {output.a} {output.a2}

# Generate a third, independent random set
bedtools random -n $((2*{params.size})) -l $(({params.length})) -seed 654321 -g input/hg38.genome > {output.c}
"""
shell: """

mkdir -p output/artificial_data_calibrate/data

# Prepare query
bedtools random -n $((100*{params.size})) -l $(({params.length})) -seed 123456 -g input/artificial_simple.genome > {output.query}

# And filler regions that do NOT overlap with query
# Use a different set each time
bedtools shuffle -i {output.query} -excl {output.query} -g input/artificial_simple.genome -noOverlapping -seed 654321 > {output.filler1}
bedtools shuffle -i {output.query} -excl {output.query} -g input/artificial_simple.genome -noOverlapping -seed 789789 > {output.filler2}


# Calibration: take K lines of the query file (the first 21% of regions for A,
# the last 26% for B) and fill the rest with random regions that do NOT
# overlap with the query

head -n $((21*{params.size})) {output.query} > {output.a}
head -n $((79*{params.size})) {output.filler1} >> {output.a}

tail -n $((26*{params.size})) {output.query} > {output.b}
tail -n $((74*{params.size})) {output.filler2} >> {output.b}
"""
shell: '''
# Prepare input and output dir
mkdir -p output/artificial_data_finesse
mkdir -p output/ologram_result_artificial_finesse

# Prepare genome
printf "chr1\t100000000\nchr2\t100000000\nchr3\t100000000\nchr4\t100000000\nchr5\t100000000\n" > {output.genome}

# Draw regions
bedtools random -g {output.genome} -n 10000 -l 1000 -seed 42021958 | bedtools sort > {output.a}

# Get some regions from A, and randomize the rest
head -n 51 {output.a} > {output.b}.temp
bedtools random -g {output.genome} -n 10000 -l 1000 -seed 98131240 >> {output.b}.temp
bedtools sort -i {output.b}.temp > {output.b}
'''
shell:"""
mkdir -p output/sc_atac_seq_pbmc_data/
Rscript scripts/pbmc_download_data.R

# Preparing the incl files
cat output/sc_atac_seq_pbmc_data/bed/*.bed | bedtools sort | bedtools merge > {output.allreg}
bedtools slop -i {output.allreg} -g input/hg19.genome -b 250 > {output.allreg_slopped}
"""
shell:"""
mkdir -p output/murine_data/selected_tf/

# Download the selected mm10 TF ChIP-seq peak files, naming each one
# <experiment>-<TF>-<cell type> from the fields of the source list
for i in `cat {input.sources} | grep mm10 | awk '{{print $0,"\t","bla"}}' | sed 's/\"//g' | cut -f1,6,7 | perl -npe 's/\t/-/g'| perl -npe 's/ +/_/g'`
do
    n=`echo $i | sed 's/\-.*//'`
    wget http://dbarchive.biosciencedbc.jp/kyushu-u/mm10/eachData/bed10/$n.10.bed -O output/murine_data/selected_tf/$i.bed --cipher 'DEFAULT:!DH'
done


# Prepare incl
wget reftss.clst.riken.jp/datafiles/current/mouse/refTSS_v3.1_mouse_coordinate.mm10.bed.gz -O output/murine_data/refTSS_v3.1_mouse_coordinate.mm10.bed.gz --cipher 'DEFAULT:!DH'
gunzip output/murine_data/refTSS_v3.1_mouse_coordinate.mm10.bed.gz

cat output/murine_data/refTSS_v3.1_mouse_coordinate.mm10.bed | bedtools slop -l {params.incl_prom_left_slop} -r 1 -s -g input/mm10.genome | bedtools sort | bedtools merge > {output.incl}
"""
shell: """
gtftk ologram -z -c hg38 -p {input.query} --more-bed {params.trs} \
    -o output/ologram_result_mcf7 --force-chrom-peak --force-chrom-more-bed  \
    -V 3 -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} \
    --more-bed-multiple-overlap --bed-incl {input.incl} --no-date
"""
SnakeMake From line 341 of master/Snakefile
shell: """
gtftk ologram -z -c hg38 -p {input.query} --more-bed {params.trs}\
    -o output/ologram_result_mcf7_filtered --force-chrom-peak --force-chrom-more-bed --no-date \
    -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} -V 3 \
    --more-bed-multiple-overlap --bed-incl {input.incl} \
    --multiple-overlap-max-number-of-combinations {params.max_combis}               
"""
SnakeMake From line 369 of master/Snakefile
shell: """
head -n 1 {input.res} > {output}
grep -w -f {input.filtered} {input.res} >> {output}
"""
SnakeMake From line 388 of master/Snakefile
shell: """
gtftk ologram -z -c hg38 -p {input.query} --more-bed {params.trs} \
    -o output/ologram_result_mcf7_full_dhs --force-chrom-peak --force-chrom-more-bed  \
    -V 3 -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} \
    --more-bed-multiple-overlap --bed-incl {input.incl} --no-date
"""
SnakeMake From line 412 of master/Snakefile
shell:"""
mkdir -p output/benchmark
python scripts/modl_comparison.py 2> {log.err} 1> {log.out}

# Signal we are done
touch {output}
"""
SnakeMake From line 436 of master/Snakefile
    shell:"""
    mkdir -p output/benchmark/perspective
    python scripts/modl_perspective.py 2> {log.err} 1> {log.out}

    # Signal we are done
    touch {output}
    """


"""
Rules to evaluate MODL time scaling, and scaling of the elementary operations, in parallel
"""
SnakeMake From line 456 of master/Snakefile
shell:"""  
mkdir -p output/benchmark/scaling

# Print information about the CPU
lscpu > output/cpu_info.txt

python scripts/modl_scaling_1_2.py 2> {log.err} 1> {log.out}
"""
SnakeMake From line 477 of master/Snakefile
shell:"""
mkdir -p output/benchmark/scaling
python scripts/modl_scaling_3.py 2> {log.err} 1> {log.out}
"""
SnakeMake From line 493 of master/Snakefile
shell:"""
mkdir -p output/benchmark/scaling
python scripts/modl_scaling_4.py 2> {log.err} 1> {log.out}
"""
SnakeMake From line 505 of master/Snakefile
shell:"""
mkdir -p output/benchmark/scaling
python scripts/modl_scaling_5.py 2> {log.err} 1> {log.out}
"""
SnakeMake From line 517 of master/Snakefile
shell: """
mkdir -p output/ologram_result_ginom
mkdir -p output/ologram_result_ginom_filtered

# With MODL
gtftk ologram -z -c hg19 -p {input.query} --more-bed {params.trs} \
    -o output/ologram_result_ginom_filtered --force-chrom-peak --force-chrom-more-bed --no-date \
    -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} -V 3 \
    --more-bed-multiple-overlap --bed-incl {input.incl} \
    --multiple-overlap-max-number-of-combinations {params.max_combis}   

# Without MODL
gtftk ologram -z -c hg19 -p {input.query} --more-bed {params.trs}\
    -o output/ologram_result_ginom --force-chrom-peak --force-chrom-more-bed --no-date \
    -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} -V 3 \
    --more-bed-multiple-overlap --bed-incl {input.incl}      
"""
SnakeMake From line 561 of master/Snakefile
shell:"""
mkdir -p output/murine_result
mkdir -p output/murine_result_restricted

# Without restricting to estimated promoters only

gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/SRX1815531-Ctcf-Blood.bed \
    --more-bed `ls output/murine_data/selected_tf/* | grep -vi ctcf` --more-bed-multiple-overlap --no-date \
    -k {threads} -V 3 -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result/SRX1815531-Ctcf \
    -mn {params.minibatch_number} -ms {params.minibatch_size}

gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/SRX1583885-Irf1-Blood.bed \
    --more-bed `ls output/murine_data/selected_tf/* | grep -vi Irf1` --more-bed-multiple-overlap --no-date \
    -k {threads} -V 3  -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result/SRX1583885-Irf1 \
    -mn {params.minibatch_number} -ms {params.minibatch_size}

gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/ERX1633247-Nanog-Pluripotent_stem_cell.bed \
    --more-bed `ls output/murine_data/selected_tf/* | grep -vi nanog` --more-bed-multiple-overlap --no-date \
    -k {threads} -V 3  -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result/ERX1633247-Nanog \
    -mn {params.minibatch_number} -ms {params.minibatch_size}



# With the restriction

gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/SRX1815531-Ctcf-Blood.bed \
    --more-bed `ls output/murine_data/selected_tf/* | grep -vi ctcf` --more-bed-multiple-overlap --no-date \
    -k {threads} -V 3 -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result_restricted/SRX1815531-Ctcf \
    -mn {params.minibatch_number} -ms {params.minibatch_size} --bed-incl {input.incl}

gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/SRX1583885-Irf1-Blood.bed \
    --more-bed `ls output/murine_data/selected_tf/* | grep -vi Irf1` --more-bed-multiple-overlap --no-date \
    -k {threads} -V 3  -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result_restricted/SRX1583885-Irf1 \
    -mn {params.minibatch_number} -ms {params.minibatch_size} --bed-incl {input.incl}

gtftk ologram -z -f -w -q -c mm10 -p output/murine_data/selected_tf/ERX1633247-Nanog-Pluripotent_stem_cell.bed \
    --more-bed `ls output/murine_data/selected_tf/* | grep -vi nanog` --more-bed-multiple-overlap --no-date \
    -k {threads} -V 3  -j summed_bp_overlaps_pvalue -a summed_bp_overlaps_pvalue -g 0.05 -o output/murine_result_restricted/ERX1633247-Nanog \
    -mn {params.minibatch_number} -ms {params.minibatch_size} --bed-incl {input.incl}
"""
SnakeMake From line 591 of master/Snakefile
shell:"""
mkdir -p output/ologram_result
mkdir -p output/ologram_result_artificial
mkdir -p output/ologram_result_artificial_calibrate

# Run on artificial data
gtftk ologram -z -c hg38 -p {input.query} --more-bed {params.peaks} \
    -o output/ologram_result_artificial --force-chrom-peak --force-chrom-more-bed \
    -V 3 -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} \
    --more-bed-multiple-overlap --no-date -K output/TMP_ologram_result_artificial

# Run on calibrated artificial data
gtftk ologram -z -c input/artificial_simple.genome -p {input.query_calibrate} --more-bed {params.peaks_calibrate} \
    -o output/ologram_result_artificial_calibrate --force-chrom-peak --force-chrom-more-bed \
    -V 3 -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} \
    --more-bed-multiple-overlap --no-date -K output/TMP_ologram_result_artificial_calibrate
"""
SnakeMake From line 659 of master/Snakefile
shell: """
mkdir -p output/multovl_result_artificial_calibrate

# On full artificial data
start=`date +%s.%N`
{params.multovl_exec} {input.query} {params.peaks} --nointrack \
    --reshufflings {params.shuffle_numbers} --free input/hg38.genome.bed \
    > {output.artificial}
end=`date +%s.%N`

runtime=$( echo "$end - $start" | bc -l )
echo "Duration of MULTOVL for full artificial data was (in seconds) : " $runtime > output/multovl_result_artificial_calibrate/timer.txt


# On artificial data, negative control only
{params.multovl_exec} {input.query} output/artificial_data/data/neg_control.bed --nointrack \
    --reshufflings {params.shuffle_numbers} --free input/hg38.genome.bed \
    > {output.artificial_neg_control}


# On simpler, calibrated artificial data
{params.multovl_exec} {input.query_calibrate} {params.peaks_calibrate} --nointrack \
    --reshufflings {params.shuffle_numbers} --free input/artificial_simple.genome.bed \
    > output/multovl_result_artificial_calibrate/partial1.txt

{params.multovl_exec} {input.query_calibrate} {params.peaks_calibrate_as_list[0]} --nointrack \
    --reshufflings {params.shuffle_numbers} --free input/artificial_simple.genome.bed \
    > output/multovl_result_artificial_calibrate/partial2.txt

{params.multovl_exec} {input.query_calibrate} {params.peaks_calibrate_as_list[1]} --nointrack \
    --reshufflings {params.shuffle_numbers} --free input/artificial_simple.genome.bed \
    > output/multovl_result_artificial_calibrate/partial3.txt


# Concatenate results for calibrated, with new line between each
awk 'FNR==1{{print ""}}1' output/multovl_result_artificial_calibrate/partial* > {output.calibrated}

rm output/multovl_result_artificial_calibrate/partial* # Cleanup
"""
SnakeMake From line 703 of master/Snakefile
shell:'''
cat output/ologram_result_artificial_finesse/ologram_output/*.tsv > {output.r}
'''
SnakeMake From line 757 of master/Snakefile
shell:'''
MINIBATCH_SIZE=10

for MINIBATCH_NB in 1 2 5 10 15 20 35
do
  for TRYNUMBER in $(seq 1 40)
  do
    gtftk ologram -ms $MINIBATCH_SIZE -mn $MINIBATCH_NB -p {input.a} \
        --more-bed {input.b} -z -c {input.genome} -V 1 -s ${{TRYNUMBER}} -k {threads}\
        --more-bed-labels artificial_minibatches_${{MINIBATCH_NB}}_try_num_${{TRYNUMBER}} \
        --force-chrom-peak   --force-chrom-more-bed\
        -o output/ologram_result_artificial_finesse/ologram_output
  done
done
touch {output}
'''
SnakeMake From line 770 of master/Snakefile
shell:'''
MINIBATCH_SIZE=10
for MINIBATCH_NB in 50 100 250
do
  for TRYNUMBER in $(seq 1 8)
  do
    gtftk ologram -ms $MINIBATCH_SIZE -mn $MINIBATCH_NB -p {input.a} \
        --more-bed {input.b} -z -c {input.genome} -V 1 -k {threads} -s ${{TRYNUMBER}}  \
        --more-bed-labels artificial_minibatches_${{MINIBATCH_NB}}_try_num_${{TRYNUMBER}} \
        --force-chrom-peak   --force-chrom-more-bed\
        -o output/ologram_result_artificial_finesse/ologram_output
  done
done
touch {output}
'''
SnakeMake From line 794 of master/Snakefile
shell:'''
MINIBATCH_SIZE=10
MINIBATCH_NB=500
for TRYNUMBER in $(seq 1 3)
do
    gtftk ologram -ms $MINIBATCH_SIZE -mn $MINIBATCH_NB -p {input.a} \
        --more-bed {input.b} -z -c {input.genome} -V 1 -k {threads} -s ${{TRYNUMBER}} \
        --more-bed-labels artificial_minibatches_${{MINIBATCH_NB}}_try_num_${{TRYNUMBER}} \
        --force-chrom-peak  --force-chrom-more-bed\
        -o output/ologram_result_artificial_finesse/ologram_output
done
touch {output}
'''
SnakeMake From line 817 of master/Snakefile
shell:'''
gtftk ologram -p {input.a} --more-bed {input.b} \
    -mn {params.minibatch_number} -ms {params.minibatch_size} \
    -z -c {input.genome} -V 1 --no-date -k {threads} -s {wildcards.i} \
    --force-chrom-peak --force-chrom-more-bed \
    -o output/ologram_result_artificial_finesse/truth/run_{wildcards.i}
'''
SnakeMake From line 851 of master/Snakefile
shell:'''
gtftk ologram -p {input.a} --more-bed {input.b} \
    -mn {params.minibatch_number} -ms {params.minibatch_size} \
    -z -c {input.genome} -V 1 --no-date -k {threads} \
    --force-chrom-peak --force-chrom-more-bed \
    --display-fit-quality -K output/ologram_result_artificial_finesse/truth_fit \
    -o output/ologram_result_artificial_finesse/truth_histogram
'''
SnakeMake From line 877 of master/Snakefile
shell:"""
mkdir -p output/ologram_bad_fit

printf "chr1\t100000000\nchr2\t100000000\nchr3\t100000000\nchr4\t100000000\nchr5\t100000000\n" > output/ologram_bad_fit/my.genome
bedtools random -g output/ologram_bad_fit/my.genome -n 100 -l 100000 -seed 42021958 | bedtools sort > output/ologram_bad_fit/A.bed
head -n 10 output/ologram_bad_fit/A.bed > output/ologram_bad_fit/B_tmp.bed
bedtools random -g output/ologram_bad_fit/my.genome -n 90  -l 100000 -seed 98131240 >> output/ologram_bad_fit/B_tmp.bed
bedtools sort -i output/ologram_bad_fit/B_tmp.bed > output/ologram_bad_fit/B.bed

gtftk ologram -ms 250 -mn 100 -p output/ologram_bad_fit/A.bed --more-bed output/ologram_bad_fit/B.bed -z -c output/ologram_bad_fit/my.genome -V 1 --no-date -o output/ologram_bad_fit/ -s 42 -k 8 --display-fit-quality -K output/ologram_bad_fit/tmp_files

touch output/ologram_bad_fit/bad_fit_done
"""
shell: """
gtftk ologram_merge_runs --inputfiles `ls output/ologram_result_artificial_finesse/truth/*/*.tsv` -o {output} -V 3 --ori-shuffles 2000
"""
SnakeMake From line 918 of master/Snakefile
script:
    "scripts/depth_boxplot.py"
SnakeMake From line 928 of master/Snakefile
script:
    "scripts/beta_precision.py"
SnakeMake From line 942 of master/Snakefile
shell:"""
mkdir -p output/ologram_result_scatacseq_pbmc/run_{wildcards.i}

gtftk ologram -z -c hg19 -p {input} --more-bed {params.peaks} --no-date \
    -o output/ologram_result_scatacseq_pbmc/run_{wildcards.i} \
    --force-chrom-peak --force-chrom-more-bed \
    -k {threads} -mn {params.minibatch_number} -ms {params.minibatch_size} -V 3 \
    --more-bed-multiple-overlap --bed-incl {input}
"""
SnakeMake From line 960 of master/Snakefile
shell: """
gtftk ologram_merge_runs --inputfiles `ls output/ologram_result_scatacseq_pbmc/*/*.tsv` -o {output} -V 3 --ori-shuffles 4
"""
SnakeMake From line 978 of master/Snakefile
shell: """
mkdir -p output/ologram_result_scatacseq_pbmc/entropy_graph/

# Run the Python script to produce the figures
python scripts/combi_entropy_analysis.py

# Signal we are done
touch {output}
"""
SnakeMake From line 989 of master/Snakefile
shell: """

## Merging OLOGRAM results
# Print the header line and append a supplementary column ("query") for the source file name
cat output/murine_{wildcards.kind}/*X*/*tsv | head -n 1 | awk 'BEGIN{{FS=OFS="\t"}}{{print $0,"query"}}'> output/murine_{wildcards.kind}/all_ologram_results.txt
# Print data without header
for i in `ls output/murine_{wildcards.kind}/*X*/*tsv`; do  awk -v f=$i 'BEGIN{{FS=OFS="\t"}}{{print $0,f}}' $i ; done | grep -v nb_intersections_expectation_shuffled >> output/murine_{wildcards.kind}/all_ologram_results.txt

## Now call the R script, with the point of execution in argument
Rscript scripts/murine_analysis.R "./output/murine_{wildcards.kind}/"
"""
SnakeMake From line 1007 of master/Snakefile
shell: """
gtftk ologram_modl_treeify -i {input} -o {output} -l {params.queryname}
"""
SnakeMake From line 1045 of master/Snakefile

URL: https://github.com/qferre/ologram-modl_supp_mat
Name: ologram-modl_supp_mat