Genetic pipeline to detect variants from a reference genome

Genetic pipeline inspired by Google's DeepVariant, designed to detect cancer-causing alleles.

Snakemake workflow, CNN built with Keras, and containerization with Docker and Singularity.

Example of a read: [screenshot]

Our pipeline: [screenshot]

See the documents below:

manuel_d_installation_1.3.docx (installation manual)

manuel_d_utilisation_1.3.docx (user manual)

Code Snippets

import sys
import os
from transcription import *
from util_CNN import *

# TensorFlow warnings are not printed
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

root_dir = os.getcwd()
res_dir = os.path.join(root_dir,"result")
test_path = sys.argv[1] if len(sys.argv) > 1 else ""        # folder of test images
BATCH_SIZE = int(sys.argv[2]) if len(sys.argv) > 2 else 32  # number of images given to the CNN at once
vcf_dir = sys.argv[3]                                       # VCF directory passed on to result_tab

# create the folder where the network's performance data will be stored
if not os.path.exists(res_dir):
    os.makedirs(res_dir)

trLabel,predLabel,dico = testModel(test_path,BATCH_SIZE)
evaluateVariant(trLabel,predLabel)
for k in dico.keys():
    print(k,dico[k])
result_tab(dico,vcf_dir)
Python From line 1 of CNN/testCNN.py
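testModel, evaluateVariant and result_tab are imported from transcription and util_CNN, which this listing does not include. As a rough illustration of the kind of check this step performs, the hypothetical sketch below compares the true and predicted labels returned by testModel using scikit-learn; the function name and the scikit-learn dependency are assumptions, not code from the repository.

# Hypothetical sketch of an evaluateVariant-style comparison of true vs predicted labels
# (illustration only; the repository's actual implementation is not shown in this listing)
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_variant_sketch(true_labels, pred_labels):
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)
    # overall accuracy
    print("accuracy:", (true_labels == pred_labels).mean())
    # per-class precision, recall and F1 score
    print(classification_report(true_labels, pred_labels))
    # rows = true classes, columns = predicted classes
    print(confusion_matrix(true_labels, pred_labels))
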
import sys
import os
from util_CNN import *
sys.path.append("/Neat/anaconda/envs/py/lib/python3.6/site-packages")
from rename import renameFiles
from splitData import splitFolder

# TensorFlow warnings are not printed
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

input_folder = sys.argv[1]
if input_folder.endswith("/images"):
    input_folder = input_folder[:-len("/images")]
output_folder = input_folder + "Divided"
root_dir = os.getcwd()
train_dir = os.path.join(root_dir, output_folder + "/train/images")
valid_dir = os.path.join(root_dir, output_folder + "/val/images")
test_dir = os.path.join(root_dir, output_folder + "/test/images")

print(train_dir)


# remove "_paternal" and "_maternal" from the image file names
renameFiles(input_folder+"/images")

# split the data into 3 folders: train (80%), val (10%) and test (10%)
# the input folder must contain a directory holding all the images
splitFolder(input_folder,output_folder)

#CNN
num_samplesTrain = len(getSamples(train_dir))
num_samplesValid = len(getSamples(valid_dir))

print("nb images d'entrainement : ",num_samplesTrain)
print("nb images de validation : ",num_samplesValid)

# creating the TFRecords output folder (tfrecords_dir is expected to come from the util_CNN wildcard import)
if not os.path.exists(tfrecords_dir):
    os.makedirs(tfrecords_dir)

convertDataToTfrecord(train_dir,"train")
convertDataToTfrecord(valid_dir,"valid")

# returns the list of files that match the given pattern
train_filenames = tf.io.gfile.glob(f"{tfrecords_dir}/train.tfrec")
valid_filenames = tf.io.gfile.glob(f"{tfrecords_dir}/valid.tfrec")

#gradient descent is an iterative learning algorithm that uses a training dataset to update the weights of a model. It minimizes the loss (the difference between the predicted label and the true label)
#The batch size is a hyperparameter of gradient descent that controls the number of training samples to work through before the model's internal parameters are updated.
#The number of epochs is a hyperparameter of gradient descent that controls the number of complete passes through the training dataset.
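#For example (illustrative numbers, not taken from this pipeline): with 8000 training images
#and a batch size of 32, one epoch performs 8000 / 32 = 250 weight updates, and training for
#10 epochs repeats that complete pass over the data 10 times.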

# if the arguments were not provided, the default value in the else branch is used
BATCH_SIZE = int(sys.argv[2]) if len(sys.argv) > 2 else 32 # number of images given to the CNN at once
EPOCHS = int(sys.argv[3]) if len(sys.argv) > 3 else 10 # number of cycles during which the CNN is trained on the full dataset
print("batch_size : ",BATCH_SIZE)
print("epochs : ",EPOCHS)
print()

#get your datatensors for training
imageTrain, labelTrain = create_dataset(train_filenames,BATCH_SIZE)
imageTrain = tf.image.resize(imageTrain, size=(299, 299))
imageTrain = tf.image.convert_image_dtype(imageTrain, tf.float32)
imageTrain = tf.keras.applications.inception_v3.preprocess_input(imageTrain)

#get your datatensors for validation
imageValid, labelValid = create_dataset(valid_filenames,BATCH_SIZE)
imageValid = tf.image.resize(imageValid, size=(299, 299))
imageValid = tf.image.convert_image_dtype(imageValid, tf.float32)
imageValid = tf.keras.applications.inception_v3.preprocess_input(imageValid)

#get model
model = getModel(imageTrain,labelTrain)

# training stops early if the loss reaches a plateau and no longer decreases
# helps avoid overfitting
early = tf.keras.callbacks.EarlyStopping( patience=10,
                                          min_delta=0.001,
                                          restore_best_weights=True)

#train model
history = model.fit(
    x=imageTrain,
    y=labelTrain,
    epochs=EPOCHS,
    verbose=1,
    callbacks=[early],
    validation_data=(imageValid,labelValid),
    shuffle=False,
)

# plot the network's performance
plotAccuracy(history)
plotLoss(history)

# evaluate the model
score = model.evaluate(imageValid, labelValid, verbose=0)
print('\nValidation loss:', score[0])
print('Validation accuracy:', score[1])

# save the weights of the trained model to a folder
model.save("my_model")
Python From line 1 of CNN/trainCNN.py
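getModel, create_dataset and convertDataToTfrecord are defined in util_CNN, which is not included in these snippets. Given the 299x299 resize and the inception_v3 preprocessing above, getModel plausibly builds a transfer-learning classifier on top of InceptionV3; the sketch below shows one way such a builder could look. The number of output classes, the frozen ImageNet base and the loss choice are assumptions, not the repository's actual implementation.

# Hypothetical sketch of a getModel()-style builder (assumed: 3 output classes,
# frozen ImageNet-pretrained InceptionV3 base); not the repository's code
import tensorflow as tf

def get_model_sketch(num_classes=3, input_shape=(299, 299, 3)):
    base = tf.keras.applications.InceptionV3(include_top=False,
                                             weights="imagenet",
                                             input_shape=input_shape)
    base.trainable = False  # keep the pretrained convolutional base frozen

    inputs = tf.keras.Input(shape=input_shape)
    x = base(inputs, training=False)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    outputs = tf.keras.layers.Dense(num_classes, activation="softmax")(x)

    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model

With a model compiled like this, the list returned by model.evaluate above holds the loss in score[0] and the accuracy in score[1], which matches how the script prints them.
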
import sys
sys.path.append("/Neat/anaconda/envs/py/lib/python3.6/site-packages")
from img_util import *


###### variables/constants ######
example_path = ""
LABELS_CONST = {"RB":"(read_base)","BQ":"(base_quality)","MQ":"(mapping_quality)","S":"(strand)","RSV":"(read_support variant)","BDFR":"(base_differs_from ref)"}
liste_channels = list(LABELS_CONST.values())

if len(sys.argv) != 3:
    # exit if the expected number of arguments was not provided
    exit()

TF_path = str(sys.argv[1])
RES_path = remove_slash(str(sys.argv[2])) + "/dv_images/images"

print(TF_path," ",RES_path)
img_extract(TF_path,RES_path,False)
Python From line 2 of img_extract.py
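img_util, imported by img_extract.py, is not shown in these snippets. DeepVariant's make_examples step stores each pileup image in a tf.Example whose channels correspond to the properties listed in LABELS_CONST above. The hypothetical sketch below shows how those channels could be pulled out of such a TFRecord and written as separate grayscale PNGs; the image/encoded and image/shape feature names, the raw uint8 encoding and the GZIP compression are assumptions about the DeepVariant example format, not code from the repository.

# Hypothetical sketch (not the repository's img_util): split DeepVariant pileup
# examples into one grayscale PNG per channel
import os
import numpy as np
import tensorflow as tf
from PIL import Image

CHANNEL_NAMES = ["read_base", "base_quality", "mapping_quality",
                 "strand", "read_supports_variant", "base_differs_from_ref"]

def extract_channels_sketch(tfrecord_path, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    # assumes a gzip-compressed TFRecord (*.tfrecord.gz), as written by make_examples
    dataset = tf.data.TFRecordDataset(tfrecord_path, compression_type="GZIP")
    for i, raw in enumerate(dataset):
        example = tf.train.Example.FromString(raw.numpy())
        feats = example.features.feature
        shape = list(feats["image/shape"].int64_list.value)   # (height, width, channels)
        pixels = np.frombuffer(feats["image/encoded"].bytes_list.value[0],
                               dtype=np.uint8).reshape(shape)
        # write one grayscale image per channel, named after the channel
        for c, name in enumerate(CHANNEL_NAMES[:pixels.shape[-1]]):
            Image.fromarray(pixels[:, :, c]).save(
                os.path.join(out_dir, f"example{i}_{name}.png"))
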
shell:
    "/Neat/varsim/varsim.py --id simu "
    "--reference {input} "   
    "--simulator_executable /Neat/art_src_MountRainier_Linux/art_illumina " 
    "--total_coverage {config[cover]} "
    "--vc_in_vcf {config[vcf]} "
    "--sv_insert_seq work/refs/insert_seq.txt "
    "--out_dir {config[out]} --log_dir {config[log]} "
    "--work_dir {config[work]} --nlanes {config[lanes]} "
    "--read_length 200 "
    "--disable_rand_dgv "
    "--vc_num_ins {config[ins]} "
    "--vc_num_snp {config[snp]} "
    "--vc_num_del {config[del]} "
    "--art_options '\-sam' {config[add]}"
SnakeMake From line 51 of main/Snakefile
shell:
    "samtools view -S -b {input} > {output}"
SnakeMake From line 73 of main/Snakefile
shell:
    "samtools sort {input} -o {output}"
SnakeMake From line 81 of main/Snakefile
shell:
    "samtools index {input}"
SnakeMake From line 91 of main/Snakefile
shell:
    "samtools faidx {input}"
SnakeMake From line 99 of main/Snakefile
shell:        
    "singularity run -B /usr/lib/locale/:/usr/lib/locale/ docker://google/deepvariant:\"1.3.0\" /opt/deepvariant/bin/run_deepvariant   "
    "--model_type=WGS   "
    "--ref={input.b} "  
    "--reads={input.a}  "
    "--regions  \"{config[regions]}\"   "
    "--output_vcf={config[out]}/dv_sur_simu/output.vcf.gz   "
    "--output_gvcf={config[out]}/dv_sur_simu/output.g.vcf.gz "
    "--intermediate_results_dir {config[out]}/dv_sur_simu/intermediate_results_dir   "
    "--num_shards={config[coeurs]} "
SnakeMake From line 115 of main/Snakefile
shell:
    "sudo gunzip -k {input}"
SnakeMake From line 129 of main/Snakefile
shell:
    "python img_extract.py {input} {config[out]}"
SnakeMake From line 137 of main/Snakefile
shell:
    "python CNN/trainCNN.py {input.a} {config[batch]} {config[epoch]}"
SnakeMake From line 148 of main/Snakefile
shell:
    "python CNN/testCNN.py {input} {config[batch]} {config[rvcf]}"
SnakeMake From line 154 of main/Snakefile
URL: https://github.com/denliA/BioinformaticPipelineCNN
Name: bioinformaticpipelinecnn
Version: 1
Copyright: Public Domain
License: None