Snakemake workflow to run Cell Ranger on a given bucket using Google Kubernetes Engine (GKE).

public 1yr ago 0 bookmarks

Help improve this workflow!

This workflow has been published but could be further improved with some additional metadata:

Keyword(s) in categories input, output, operation, topic

You can help improve this workflow by suggesting the addition or removal of keywords, suggesting changes, reporting issues, or requesting to become a maintainer of the workflow.

Snakemake workflow: `Cellranger Snakemake GKE`

A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine.

Usage

The usage of this workflow is described in the

Code Snippets

shell:
	"bwa mem -M -t {threads} {config[genome]} {input} | gzip -c > {output}"

BWA From line 9 of modules/0_map_bwa_paired

shell:
	"bwa mem -M -t {threads} {config[genome]} {input} | gzip -c > {output}"

BWA From line 11 of modules/0_map_bwa_single

	# Lists the names of reads whose CIGAR length disagrees with their sequence length.
	# For every non-header SAM line with a CIGAR other than "*", the awk program:
	#   1. removes deletion operations (<n>D) from the CIGAR,
	#   2. sums the numeric lengths of the remaining operations,
	#   3. prints field 1 (read name) followed by a tab when that sum differs from
	#      the length of field 10 (the read sequence).
	# The names are de-duplicated with sort | uniq and written to the output file.
	# NOTE(review): the trailing tab on each name looks intended to make the list safe
	# for a later fixed-string `grep -F -f` match against whole name fields; confirm.
	shell:
		"""
		zcat {input} | \
                awk 'BEGIN{{FS="\t"; OFS="\t"}}
                !/^@/ && $6!="*" {{
                cigar=$6; gsub("[0-9]+D","",cigar);
                n=split(cigar, vals, "[A-Z]");
                s=0; for(i=1;i<=n;i++) s=s+vals[i];
		seqlen=length($10);
		if(s!=seqlen) print $1"\t";}}' | sort | uniq > {output}
		"""

From line 7 of modules/1_filter_paired

run:
	with open("mapped_reads/" + wildcards.sample + ".badcigar") as fp:
		peek = [x for i,x in enumerate(fp) if i<10 and x.rstrip()]
	cigar_len=len(peek)
	shell("""if [ {cigar_len} -gt 0 ]
	then
		zcat {input.sam} | grep -vF -f {input.cigar} | \
		samtools view -@ {threads} -Su - | \
		samtools sort -@ {threads} -T {params.prefix} -o {output} -

SAMtools From line 27 of modules/1_filter_paired

	shell:
		"""
			samtools view -@ {threads} -F1804 -f2 -q 20 -h \
                        -u {input} | \
                        samtools sort -@ {threads} -m {params.mem} -T {params.prefix} -n -o {output.tmp} -

			samtools fixmate -r {output.tmp} {output.clean}

			samtools view -@ {threads} -F1804 -f2 -u {output.clean} |\
                        samtools sort -@ {threads} -m {params.mem} -T {params.prefix}_2 -o {output.flt} -
		"""

SAMtools From line 53 of modules/1_filter_paired

shell:
	"""
	java -Xmx4G -jar {config[path]}/picard-tools-1.141/picard.jar MarkDuplicates\
	INPUT={input} OUTPUT={output.temp} \
	METRICS_FILE={output.qc} ASSUME_SORTED=true \
	VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false

	"""

Picard From line 74 of modules/1_filter_paired

		shell:
			"""
			samtools view -@ {threads} -h {input} |\
                        grep -vF -f {config[sponges]} - |\
                        samtools view -@ {threads} -b -o {output.flt} -

			samtools sort -@ {threads} -m {params.mem} -T {params.prefix} -n -o {output.sponge_temp} {output.flt}
			samtools fixmate -r {output.sponge_temp} {output.flt}

			"""

SAMtools From line 93 of modules/1_filter_paired

		shell:
                        """
			samtools view -@ {threads} -F1804 -f2 -b -u {input} |\
                        samtools sort -@ {threads} -m {params.mem} -T {params.prefix}_2 -o {output.final_bam} -

                        #samtools view -@ {threads} -F1804 -f2 -b -o {output.final_bam} {input}
                        samtools index {output.final_bam} {output.final_bai}

			samtools sort -@ {threads} -m {params.mem} -T {params.prefix} -o {output.tmp_sort} -n {input}
                        """	

SAMtools From line 116 of modules/1_filter_paired

shell:
	"""
	java -Xmx4G -jar {config[path]}/picard-tools-1.141/picard.jar MarkDuplicates\
	INPUT={input} OUTPUT={output.flt} \
	METRICS_FILE={output.qc} ASSUME_SORTED=true \
	VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false

	"""

Picard From line 136 of modules/1_filter_paired

		# Writes the final BAM (flags in 1804 excluded via -F, proper pairs required via -f2)
		# and then indexes it.
		# `-T` is intentionally NOT passed here: for `samtools view`, -T names a reference
		# FASTA file, not a temporary-file prefix as it does for `samtools sort`. Passing
		# {params.prefix} made samtools try to open the prefix as a reference.
		shell:
                        """
                        samtools view -@ {threads} -F1804 -f2 -b -o {output.final_bam} {input}
                        samtools index {output.final_bam} {output.final_bai}

                        """

SAMtools From line 155 of modules/1_filter_paired

	shell:
		"""
		samtools sort -@ {threads} -T {params.prefix} -o {output.tmp_sort} -n {input.final_bam}
		bedtools bamtobed -bedpe -mate1 -i {output.tmp_sort} |\
                gzip -c > {output.bedpe} 
		"""

SAMtools BEDTools From line 173 of modules/1_filter_paired

	# Collapses each line of the gzipped, tab-separated input into one interval.
	# Per line: chrom = column 1, start = the smaller of columns 2 and 5,
	# end = the larger of columns 3 and 6. The three-column result is sorted by
	# chromosome, then numerically by start (2G sort buffer, {threads} sort threads),
	# and gzipped into {output.bed}.
	# NOTE(review): presumably the input is the BEDPE produced by `bedtools bamtobed -bedpe`;
	# confirm against the rule's input declaration.
	shell:
		"""zcat {input} | \
                awk 'BEGIN{{OFS="\t"; FS="\t"}}
                {{ chrom=$1; beg=$2; end=$6;
                if($2>$5){{beg=$5}} if($3>$6){{end=$3}}
                print chrom,beg,end
                }}' - | {config[sort]} --parallel={threads} -S 2G -k1,1 -k2,2n | \
                gzip -c > {output.bed}"""

From line 187 of modules/1_filter_paired

	shell:
		"""
            	samtools view -@ {threads} -Su {input.sam} |\
                samtools sort -@ {threads} -T {params.prefix} -o {output} -
		"""

SAMtools From line 8 of modules/1_filter_single

	shell:
		"""
			samtools view -@ {threads} -F 1804 -q 20 \
                        -u {input} -o {output.flt}
		"""

SAMtools From line 21 of modules/1_filter_single

shell:
	"""
	java -Xmx4G -jar {config[path]}/picard-tools-1.141/picard.jar MarkDuplicates\
	INPUT={input} OUTPUT={output.temp} \
	METRICS_FILE={output.qc} ASSUME_SORTED=true \
	VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false

	"""

Picard From line 36 of modules/1_filter_single

		shell:
			"""
			samtools view -@ {threads} -h {input} |\
                        grep -vF -f {config[sponges]} - |\
                        samtools view -@ {threads} -b -o {output.flt} -

			"""

SAMtools From line 51 of modules/1_filter_single

		shell:
                        """
                        samtools view -@ {threads} -F1804 -b -o {output.final_bam} {input}
                        samtools index {output.final_bam} {output.final_bai}

			samtools sort -@ {threads} -n -T {params.prefix} -o {output.tmp_sort} {input}
                        """	

SAMtools From line 70 of modules/1_filter_single

shell:
	"""
	java -Xmx4G -jar {config[path]}/picard-tools-1.141/picard.jar MarkDuplicates\
	INPUT={input} OUTPUT={output.flt} \
	METRICS_FILE={output.qc} ASSUME_SORTED=true \
	VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false

	"""

Picard From line 87 of modules/1_filter_single

		shell:
                        """
                        samtools view -@ {threads} -F1804 -b -o {output.final_bam} {input}
                        samtools index {output.final_bam} {output.final_bai}

			samtools sort -@ {threads} -n -T {params.prefix} -o {output.tmp_sort} {input}
                        """	

SAMtools From line 107 of modules/1_filter_single

	shell:
		"""
		bedtools bamtobed -i {input} | cut -f1-3 | \
		{config[sort]} --parallel={threads} -k1,1 -k2,2n | \
                gzip -c > {output.bed} 
		"""

BEDTools From line 122 of modules/1_filter_single

shell:"samtools flagstat {input.final} > {output.final_mapstats}"

SAMtools From line 9 of modules/2_qc_paired

shell:"samtools flagstat {input.raw} > {output.raw_mapstats}"

SAMtools From line 18 of modules/2_qc_paired

	# Library-complexity metrics for paired-end data.
	#   1. Name-sort the filtered BAM so mates are adjacent for `bamtobed -bedpe`.
	#   2. Write a header line, then count how many read pairs share each
	#      (chrom1, start1, chrom2, end2, strand1, strand2) key, ignoring chrM.
	#   3. Report: mt = total pairs, m0 = distinct keys, m1 = keys seen once,
	#      m2 = keys seen twice, NRF = m0/mt, PBC1 = m1/m0, PBC2 = m1/m2.
	# PBC2 is printed as "-INF" when m2 == 0 instead of letting awk abort on a
	# division by zero; this mirrors the single-end version of this rule.
	shell:
		"""
		samtools sort -n -@ {threads} -T {params.prefix} -o {output.tmp_sort_bam} {input.flt}
		echo 1 | \
                     awk '{{print "#Total\tDistinct\tOne\tTwo\tNRF\tPBC1\tPBC2"}}' \
                     > {output.lib_complexity}

		bedtools bamtobed -bedpe -i {output.tmp_sort_bam} | \
                     awk 'BEGIN{{OFS="\t"}}{{print $1,$2,$4,$6,$9,$10}}' | \
                     grep -v 'chrM' | sort | uniq -c | \
                     awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0; OFS="\t"}}
                               ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}}
                          END{{if(m2>0) {{print mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}else{{print mt,m0,m1,m2,m0/mt,m1/m0,"-INF"}}}}'\
                     >> {output.lib_complexity}
		"""

SAMtools BEDTools From line 32 of modules/2_qc_paired

shell: "samtools flagstat {input.final} > {output.final_mapstats}"

SAMtools From line 9 of modules/2_qc_single

shell: "samtools flagstat {input.raw} > {output.raw_mapstats}"

SAMtools From line 18 of modules/2_qc_single

	shell:
		"""
		samtools sort -n -@ {threads} -T {params.prefix} -o {output.tmp_sort_bam} {input.flt}
		echo 1 | \
                     awk '{{print "#Total\tDistinct\tOne\tTwo\tNRF\tPBC1\tPBC2"}}' \
                     > {output.lib_complexity}

		bedtools bamtobed -i {output.tmp_sort_bam} | \
                     awk 'BEGIN{{OFS="\t"}}{{print $1,$2,$3,$6}}' | \
                     grep -v 'chrM' | sort | uniq -c | \
                     awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0; OFS="\t"}}
                               ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}}
                          END{{if(m2>0) {{print mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}else{{print mt,m0,m1,m2,m0/mt,m1/m0,"-INF"}}}}'\
                     >> {output.lib_complexity}
		"""

SAMtools BEDTools From line 32 of modules/2_qc_single

shell:
	"""
	bamToBed -i {input} | awk 'BEGIN{{OFS="\\t"}}{{$4="N";$5="1000";print $0}}' |\
	tee {output.temp_tagAlign} |\
	gzip -c > {output.tagAlign}
	"""

From line 11 of modules/3_xcor_paired

	shell:
		"""
		grep -v 'chrM' {input.temp_tagAlign} |\
		shuf -n 15000000  |awk 'BEGIN{{OFS="\\t"}}{{$4="N";$5="1000";print $0}}'  > {output.subsample} 

		Rscript {config[path]}/run_spp_nodups.R -c={output.subsample} -p={threads} -filtchr=chrM -savp={output.cc_plot} -out={output.cc_score} > /dev/null 2>&1
        	sed -r  's/,[^\\t]+//g' {output.cc_score} > {output.temp_cc_score} 
		cp {output.temp_cc_score} {output.cc_score}		
		"""

From line 27 of modules/3_xcor_paired

shell:
	"""
	bamToBed -i {input} | awk 'BEGIN{{OFS="\\t"}}{{$4="N";$5="1000";print $0}}' |\
	tee {output.temp_tagAlign} |\
	gzip -c > {output.tagAlign}
	"""

From line 11 of modules/3_xcor_single

	shell:
		"""
		grep -v 'chrM' {input.temp_tagAlign} |\
		shuf -n 15000000  > {output.subsample} 

		Rscript {config[path]}/run_spp_nodups.R -c={output.subsample} -p={threads} -filtchr=chrM -savp={output.cc_plot} -out={output.cc_score} > /dev/null 2>&1
        	sed -r  's/,[^\\t]+//g' {output.cc_score} > {output.temp_cc_score} 
		cp {output.temp_cc_score} {output.cc_score}		
		"""

From line 27 of modules/3_xcor_single

	# Builds two pseudoreplicates from one gzipped input.
	#   1. Shuffle all lines, using the first 40,000,000 decompressed lines of the same
	#      input as the random-byte source (see the inline comment for why it is capped).
	#   2. Split the shuffled file into two line-based halves with numeric suffixes and a
	#      ".bed" extension, under the prefix bed_files/psr_<sample>.
	#   3. Coordinate-sort each half (column 1, then numeric column 2) and gzip it.
	# NOTE(review): {output.temp_psr1}/{output.temp_psr2} are presumably the two files
	# written by the split command; confirm against the rule's output declaration.
	# The process substitution <( ... ) requires bash.
	shell:
		"""
		#If the file is too big '--random-source' option will fail. That's why the '40000000'
		zcat {input} | shuf --random-source=<(zcat {input} | head -40000000) > {output.shuf}
		{config[split]} -d -nl/2 --additional-suffix=\".bed\" {output.shuf} bed_files/psr_{wildcards.sample}.

		{config[sort]} --parallel={threads} -S 2G \
                     -k1,1 -k2,2n {output.temp_psr1} | gzip -c > {output.psr1} 
		{config[sort]} --parallel={threads} -S 2G \
                     -k1,1 -k2,2n {output.temp_psr2} | gzip -c > {output.psr2} 
		"""

From line 13 of modules/4_pseudoreps

shell:
	"""
	zcat {input} | {config[sort]} --parallel={threads} -k1,1 -k2,2n | gzip -c > {output}
	"""

From line 32 of modules/4_pseudoreps

shell:
	"""
	zcat {input.input1} | {config[sort]} --parallel={threads} -k1,1 -k2,2n | gzip -c > {output.rep1}
	zcat {input.input2} | {config[sort]} --parallel={threads} -k1,1 -k2,2n | gzip -c > {output.rep2}
	"""

From line 47 of modules/4_pseudoreps

shell:
	"""
	zcat {input} | {config[sort]} --parallel={threads} -k1,1 -k2,2n | gzip -c > {output}
	"""

From line 59 of modules/4_pseudoreps

run:
	import os, sys, time,subprocess
	#Calling narrowPeaks	
	shell("""
	fraglen=`cat {input.cc_scores}| cut -f3`

	{config[macs2]} callpeak \

macs2 From line 32 of modules/5_peak_calling_macs_paired

run:
	import os, sys, time,subprocess
	sum_len = 0
	n_files = 0
	for files in {input.cc_scores}:
		for file in files:
			n_files+=1
			with open(file,'r') as f:
				sum_len+=int(f.readlines()[0].split("\t")[2])
	fraglen=int(sum_len/n_files)

	#Calling narrowPeaks	
	shell("""
	{config[macs2]} callpeak \

macs2 From line 409 of modules/5_peak_calling_macs_paired

run:
	import os, sys, time,subprocess
	sum_len = 0
	n_files = 0
	for files in {input.cc_scores}:
		for file in files:
			n_files+=1
			with open(file,'r') as f:
				sum_len+=int(f.readlines()[0].split("\t")[2])
	fraglen=int(sum_len/n_files)

	#Calling narrowPeaks	
	shell("""
	{config[macs2]} callpeak \

macs2 From line 589 of modules/5_peak_calling_macs_paired

run:
	import os, sys, time,subprocess
	sum_len = 0
	n_files = 0
	for files in {input.cc_scores}:
		for file in files:
			n_files+=1
			with open(file,'r') as f:
				sum_len+=int(f.readlines()[0].split("\t")[2])
	fraglen=int(sum_len/n_files)

	#Calling narrowPeaks	
	shell("""
	{config[macs2]} callpeak \

macs2 From line 769 of modules/5_peak_calling_macs_paired

run:
	import os, sys, time,subprocess
	#Calling narrowPeaks	
	shell("""
	fraglen=`cat {input.cc_scores}| cut -f3`

	{config[macs2]} callpeak \

macs2 From line 33 of modules/5_peak_calling_macs_paired_matching

run:
	import os, sys, time,subprocess
	sum_len = 0
	n_files = 0
	for files in {input.cc_scores}:
		for file in files:
			n_files+=1
			with open(file,'r') as f:
				sum_len+=int(f.readlines()[0].split("\t")[2])
	fraglen=int(sum_len/n_files)

	#Calling narrowPeaks	
	shell("""
	{config[macs2]} callpeak \

macs2 From line 412 of modules/5_peak_calling_macs_paired_matching

run:
	import os, sys, time,subprocess
	#Calling narrowPeaks	
	shell("""
	fraglen=`cat {input.cc_scores} | cut -f3`

	{config[macs2]} callpeak \

macs2 From line 32 of modules/5_peak_calling_macs_single

run:
	import os, sys, time,subprocess
	sum_len = 0
	n_files = 0
	for files in {input.cc_scores}:
		for file in files:
			n_files+=1
			with open(file,'r') as f:
				sum_len+=int(f.readlines()[0].split("\t")[2])
	fraglen=int(sum_len/n_files)

	#Calling narrowPeaks	
	shell("""
	{config[macs2]} callpeak \

macs2 From line 435 of modules/5_peak_calling_macs_single

run:
	import os, sys, time,subprocess
	#Calling narrowPeaks	
	shell("""
	fraglen=`cat {input.cc_scores} | cut -f3`

	{config[macs2]} callpeak \

macs2 From line 34 of modules/5_peak_calling_macs_single_matching

run:
	import os, sys, time,subprocess
	sum_len = 0
	n_files = 0
	for files in {input.cc_scores}:
		for file in files:
			n_files+=1
			with open(file,'r') as f:
				sum_len+=int(f.readlines()[0].split("\t")[2])
	fraglen=int(sum_len/n_files)

	#Calling narrowPeaks	
	shell("""
	{config[macs2]} callpeak \

macs2 From line 413 of modules/5_peak_calling_macs_single_matching

	# Keeps the peaks that are supported by BOTH pseudoreplicates and converts each
	# surviving peak set to bigBed.
	# For narrow, broad and gapped peaks in turn:
	#   1. `bedtools intersect -f 0.50 -F 0.50 -e -u` against pseudoreplicate 1 and then
	#      pseudoreplicate 2: a peak is kept once if the overlap covers at least 50% of
	#      either interval.
	#   2. Coordinate-sort into a temporary file (bedToBigBed requires sorted input).
	#   3. bedToBigBed with the matching autoSql file, then delete the temporary file.
	# The -type value must match the column layout of each format:
	#   narrowPeak = bed6+4, broadPeak = bed6+3, gappedPeak = bed12+3.
	# Previously 'bed6+4' was used for all three conversions.
	# NOTE(review): the trailing `sleep 120` is kept as-is; presumably it waits for
	# filesystem latency before Snakemake checks outputs. Confirm it is still needed.
	shell:
		"""
		bedtools intersect \
                     -a {input.narrowPeak} -b {input.narrowPeak_psr1} -f 0.50 -F 0.50 -e -u |\
                     bedtools intersect \
                     -a stdin -b {input.narrowPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.narrow}

		sort -k1,1 -k2,2n -o finalPeaks/tmp_{params.prefix}_narrow_bb {output.narrow}
		{config[path]}/bedToBigBed -type='bed6+4' -as={config[path]}/narrowPeak.as finalPeaks/tmp_{params.prefix}_narrow_bb {config[chrom]} {output.narrowPeak_bb}
		rm -f finalPeaks/tmp_{params.prefix}_narrow_bb

		bedtools intersect \
                     -a {input.broadPeak} -b {input.broadPeak_psr1} -f 0.50 -F 0.50 -e -u |\
                     bedtools intersect \
                     -a stdin -b {input.broadPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.broad}

		sort -k1,1 -k2,2n -o finalPeaks/tmp_{params.prefix}_broad_bb {output.broad}
		{config[path]}/bedToBigBed -type='bed6+3' -as={config[path]}/broadPeak.as finalPeaks/tmp_{params.prefix}_broad_bb {config[chrom]} {output.broadPeak_bb}
		rm -f finalPeaks/tmp_{params.prefix}_broad_bb

		bedtools intersect \
                     -a {input.gappedPeak} -b {input.gappedPeak_psr1} -f 0.50 -F 0.50 -e -u |\
                     bedtools intersect \
                     -a stdin -b {input.gappedPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.gapped}

		sort -k1,1 -k2,2n -o finalPeaks/tmp_{params.prefix}_gapped_bb {output.gapped}
		{config[path]}/bedToBigBed -type='bed12+3' -as={config[path]}/gappedPeak.as finalPeaks/tmp_{params.prefix}_gapped_bb {config[chrom]} {output.gappedPeak_bb}
		rm -f finalPeaks/tmp_{params.prefix}_gapped_bb

		sleep 120
		"""

BEDTools From line 25 of modules/6_overlap

	# Keeps the peaks that are supported by BOTH pseudoreplicate peak sets and converts
	# each surviving peak set to bigBed.
	# For narrow, broad and gapped peaks in turn:
	#   1. `bedtools intersect -f 0.50 -F 0.50 -e -u` against the first and then the
	#      second comparison set: a peak is kept once if the overlap covers at least
	#      50% of either interval.
	#   2. Coordinate-sort into a temporary file (bedToBigBed requires sorted input).
	#   3. bedToBigBed with the matching autoSql file, then delete the temporary file.
	# The -type value must match the column layout of each format:
	#   narrowPeak = bed6+4, broadPeak = bed6+3, gappedPeak = bed12+3.
	# Previously 'bed6+4' was used for all three conversions.
	# NOTE(review): the trailing `sleep 120` is kept as-is; presumably it waits for
	# filesystem latency before Snakemake checks outputs. Confirm it is still needed.
	shell:
		"""
		bedtools intersect \
                     -a {input.narrowPeak} -b {input.narrowPeak_psr1} -f 0.50 -F 0.50 -e -u |\
                     bedtools intersect \
                     -a stdin -b {input.narrowPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.narrow}

		sort -k1,1 -k2,2n -o finalPeaks/tmp_{params.prefix}_narrow_bb {output.narrow}
		{config[path]}/bedToBigBed -type='bed6+4' -as={config[path]}/narrowPeak.as finalPeaks/tmp_{params.prefix}_narrow_bb {config[chrom]} {output.narrowPeak_bb}
		rm -f finalPeaks/tmp_{params.prefix}_narrow_bb

		bedtools intersect \
                     -a {input.broadPeak} -b {input.broadPeak_psr1} -f 0.50 -F 0.50 -e -u |\
                     bedtools intersect \
                     -a stdin -b {input.broadPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.broad}

		sort -k1,1 -k2,2n -o finalPeaks/tmp_{params.prefix}_broad_bb {output.broad}
		{config[path]}/bedToBigBed -type='bed6+3' -as={config[path]}/broadPeak.as finalPeaks/tmp_{params.prefix}_broad_bb {config[chrom]} {output.broadPeak_bb}
		rm -f finalPeaks/tmp_{params.prefix}_broad_bb

		bedtools intersect \
                     -a {input.gappedPeak} -b {input.gappedPeak_psr1} -f 0.50 -F 0.50 -e -u |\
                     bedtools intersect \
                     -a stdin -b {input.gappedPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.gapped}

		sort -k1,1 -k2,2n -o finalPeaks/tmp_{params.prefix}_gapped_bb {output.gapped}
		{config[path]}/bedToBigBed -type='bed12+3' -as={config[path]}/gappedPeak.as finalPeaks/tmp_{params.prefix}_gapped_bb {config[chrom]} {output.gappedPeak_bb}
		rm -f finalPeaks/tmp_{params.prefix}_gapped_bb

		sleep 120
		"""

BEDTools From line 81 of modules/6_overlap

	# Keeps the peaks that are supported by BOTH pseudoreplicate peak sets and converts
	# each surviving peak set to bigBed.
	# For narrow, broad and gapped peaks in turn:
	#   1. `bedtools intersect -f 0.50 -F 0.50 -e -u` against the first and then the
	#      second comparison set: a peak is kept once if the overlap covers at least
	#      50% of either interval.
	#   2. Coordinate-sort into a temporary file (bedToBigBed requires sorted input).
	#   3. bedToBigBed with the matching autoSql file, then delete the temporary file.
	# The -type value must match the column layout of each format:
	#   narrowPeak = bed6+4, broadPeak = bed6+3, gappedPeak = bed12+3.
	# Previously 'bed6+4' was used for all three conversions.
	# NOTE(review): the trailing `sleep 120` is kept as-is; presumably it waits for
	# filesystem latency before Snakemake checks outputs. Confirm it is still needed.
	shell:
		"""
		bedtools intersect \
                     -a {input.narrowPeak} -b {input.narrowPeak_psr1} -f 0.50 -F 0.50 -e -u |\
                     bedtools intersect \
                     -a stdin -b {input.narrowPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.narrow}

		sort -k1,1 -k2,2n -o finalPeaks/tmp_{params.prefix}_narrow_bb {output.narrow}
		{config[path]}/bedToBigBed -type='bed6+4' -as={config[path]}/narrowPeak.as finalPeaks/tmp_{params.prefix}_narrow_bb {config[chrom]} {output.narrowPeak_bb}
		rm -f finalPeaks/tmp_{params.prefix}_narrow_bb

		bedtools intersect \
                     -a {input.broadPeak} -b {input.broadPeak_psr1} -f 0.50 -F 0.50 -e -u |\
                     bedtools intersect \
                     -a stdin -b {input.broadPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.broad}

		sort -k1,1 -k2,2n -o finalPeaks/tmp_{params.prefix}_broad_bb {output.broad}
		{config[path]}/bedToBigBed -type='bed6+3' -as={config[path]}/broadPeak.as finalPeaks/tmp_{params.prefix}_broad_bb {config[chrom]} {output.broadPeak_bb}
		rm -f finalPeaks/tmp_{params.prefix}_broad_bb

		bedtools intersect \
                     -a {input.gappedPeak} -b {input.gappedPeak_psr1} -f 0.50 -F 0.50 -e -u |\
                     bedtools intersect \
                     -a stdin -b {input.gappedPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.gapped}

		sort -k1,1 -k2,2n -o finalPeaks/tmp_{params.prefix}_gapped_bb {output.gapped}
		{config[path]}/bedToBigBed -type='bed12+3' -as={config[path]}/gappedPeak.as finalPeaks/tmp_{params.prefix}_gapped_bb {config[chrom]} {output.gappedPeak_bb}
		rm -f finalPeaks/tmp_{params.prefix}_gapped_bb

		sleep 120
		"""

BEDTools From line 25 of modules/6_overlap_matching

shell:
	"""
	bwa mem -M -t {threads} {config[genome]} {input} | gzip -c > {output.sam}

	samtools view -@ {threads} -Su {output.sam} | samtools sort -@ {threads} -T {params.prefix} -o  {output.bam}
	"""

SAMtools BWA From line 11 of TF/0_map_bwa_single

	shell:
		"""
			samtools view -@ {threads} -F 1804 -q 20 \
                        -b {input} -o {output.flt}
		"""

SAMtools From line 9 of TF/1_filter_single

shell:
	"""
	java -Xmx4G -jar {config[path]}/picard-tools-1.141/picard.jar MarkDuplicates\
	INPUT={input} OUTPUT={output.temp} \
	METRICS_FILE={output.qc} ASSUME_SORTED=true \
	VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false

	"""

Picard From line 24 of TF/1_filter_single

		shell:
			"""
			samtools view -@ {threads} -h {input} |\
                        grep -vF -f {config[sponges]} - |\
                        samtools view -@ {threads} -b -o {output.flt} -

			"""

SAMtools From line 39 of TF/1_filter_single

		shell:
                        """
                        samtools view -@ {threads} -F1804 -b -o {output.final_bam} {input}
                        samtools index {output.final_bam} {output.final_bai}

			samtools sort -@ {threads} -n -T {params.prefix} -o {output.tmp_sort} {input}
                        """	

SAMtools From line 58 of TF/1_filter_single

shell:
	"""
	java -Xmx4G -jar {config[path]}/picard-tools-1.141/picard.jar MarkDuplicates\
	INPUT={input} OUTPUT={output.flt} \
	METRICS_FILE={output.qc} ASSUME_SORTED=true \
	VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false

	"""

Picard From line 75 of TF/1_filter_single

		shell:
                        """
                        samtools view -@ {threads} -F1804 -b -o {output.final_bam} {input}
                        samtools index {output.final_bam} {output.final_bai}

			samtools sort -@ {threads} -n -T {params.prefix} -o {output.tmp_sort} {input}
                        """	

SAMtools From line 95 of TF/1_filter_single

shell:
	"""
	bedtools bamtobed -i {input} | \
	awk 'BEGIN{{OFS="\t"}}{{$4="N";$5="1000";print $0}}' | \
	{config[sort]} --parallel={threads} -k1,1 -k2,2n | \
	gzip -nc > {output.tagAlign} 
	"""

BEDTools From line 110 of TF/1_filter_single

shell: "samtools flagstat {input.final} > {output.final_mapstats}"

SAMtools From line 9 of TF/2_qc_single

shell: "samtools flagstat {input.raw} > {output.raw_mapstats}"

SAMtools From line 18 of TF/2_qc_single

	shell:
		"""
		samtools sort -n -@ {threads} -T {params.prefix} -o {output.tmp_sort_bam} {input.flt}
		echo 1 | \
                     awk '{{print "#TotalReadPairs\tDistinctReadPairs\tOneReadPair\tTwoReadPairs\tNRF=Distinct/Total\tPBC1=OnePair/Distinct\tPBC2=OnePair/TwoPair"}}' \
                     > {output.lib_complexity}

		bedtools bamtobed -i {output.tmp_sort_bam} | \
                     awk 'BEGIN{{OFS="\t"}}{{print $1,$2,$3,$6}}' | \
                     grep -v 'chrM' | sort | uniq -c | \
                     awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0; OFS="\t"}}
                               ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}}
                          END{{if(m2>0) {{print mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}}else{{print mt,m0,m1,m2,m0/mt,m1/m0,"-INF"}}}}'\
                     >> {output.lib_complexity}
		"""

SAMtools BEDTools From line 32 of TF/2_qc_single

	shell:
		"""
		NREADS=15000000

		zcat {input.tagAlign} | grep -v 'chrM' |\
		shuf -n $NREADS | gzip -nc > {output.subsample} 

		Rscript {config[path]}/run_spp_nodups.R -c={output.subsample} -p={threads} -filtchr=chrM -savp={output.cc_plot} -out={output.cc_score} > /dev/null 2>&1
        	sed -r  's/,[^\\t]+//g' {output.cc_score} > {output.cc_score}.tmp 
		mv {output.cc_score}.tmp {output.cc_score}		
		sleep 120
		"""

From line 10 of TF/3_xcor_single

	shell:
		"""
		#If the file is too big '--random-source' option will fail. That's why the '40000000'
		zcat {input} | shuf --random-source=<(zcat {input} | head -40000000) > {output.shuf}
		{config[split]} -d -nl/2 --additional-suffix=\".tagAlign\" {output.shuf} bed_files/psr_{wildcards.sample}.

		{config[sort]} --parallel={threads} -S 2G \
                     -k1,1 -k2,2n {output.temp_psr1} | gzip -c > {output.psr1}
		{config[sort]} --parallel={threads} -S 2G \
                     -k1,1 -k2,2n {output.temp_psr2} | gzip -c > {output.psr2}
		"""

From line 13 of TF/4_pseudoreps

shell:
	"""
	zcat {input} | {config[sort]} --parallel={threads} -k1,1 -k2,2n | gzip -c > {output}
	"""

From line 32 of TF/4_pseudoreps

shell:
	"""
	zcat {input} | {config[sort]} --parallel={threads} -k1,1 -k2,2n | gzip -c > {output}
	"""

From line 43 of TF/4_pseudoreps

shell:
	"""
	fraglen=`cat {input.cc_scores} | cut -f3`

	{config[macs2]} callpeak \
	-t {input.ip} -c {input.control}\
	-f BED -n peaks/{params.prefix}\
	-g {config[macs_g]} -p 1e-2 --nomodel --shift 0 --extsize $fraglen \
	--keep-dup all -B --SPMR

	sort -k 8gr,8gr {output.narrowPeak} |\
	awk 'BEGIN{{OFS="\t"}}{{$4="Peak_"NR ; print $0}}'| \
	gzip -nc > {output.narrowPeak_compressed}

	rm -f peaks/{params.prefix}_peaks.xls peaks/{params.prefix}_summits.bed
	touch {output.checkpoint}
	"""

macs2 From line 17 of TF/5_peak_calling_macs_tf_single

shell:
	"""
	fraglen=`cat {input.cc_scores} | cut -f3`

	{config[macs2]} callpeak \
	-t {input.ip_psr1} -c {input.control}\
	-f BED -n peaks/{params.prefix}\
	-g {config[macs_g]} -p 1e-2 --nomodel --shift 0 --extsize $fraglen \
	--keep-dup all -B --SPMR

	sort -k 8gr,8gr {output.narrowPeak_psr1} |\
	awk 'BEGIN{{OFS="\t"}}{{$4="Peak_"NR ; print $0}}'| \
	gzip -nc > {output.narrowPeak_psr1_compressed}

	rm -f peaks/{params.prefix}_peaks.xls peaks/{params.prefix}_summits.bed

	{config[macs2]} callpeak \
	-t {input.ip_psr2} -c {input.control}\
	-f BED -n peaks/{params.prefix_2}\
	-g {config[macs_g]} -p 1e-2 --nomodel --shift 0 --extsize $fraglen \
	 --keep-dup all -B --SPMR

	sort -k 8gr,8gr {output.narrowPeak_psr2} |\
	awk 'BEGIN{{OFS="\t"}}{{$4="Peak_"NR ; print $0}}'| \
	gzip -nc > {output.narrowPeak_psr2_compressed}

	rm -f peaks/{params.prefix_2}_peaks.xls peaks/{params.prefix_2}_summits.bed
	touch {output.checkpoint}
	"""

macs2 From line 57 of TF/5_peak_calling_macs_tf_single

# Calls broad peaks with MACS2 for one replicate and writes ranked, gzipped peak files.
#   1. Read the fragment length from column 3 of the cross-correlation score file.
#   2. `macs2 callpeak --broad` with output name prefix peaks/<prefix>; MACS2 then
#      writes <prefix>_peaks.broadPeak and <prefix>_peaks.gappedPeak.
#   3. broadPeak: sort by column 8 (descending), rename peaks Peak_<rank>, gzip.
#   4. gappedPeak: sort by column 14 (descending), rename peaks Peak_<rank>, gzip.
# Step 4 previously sorted {output.broadPeak} again, which has no column 14 and
# left broadPeak records in the gappedPeak output. It now reads the gappedPeak file
# MACS2 writes next to the broadPeak file.
# NOTE(review): the gappedPeak path is derived from the `-n peaks/{params.prefix}`
# argument because this snippet does not show a named gappedPeak output; switch to
# the named output if the rule declares one.
shell:
	"""
	fraglen=`cat {input.cc_scores} | cut -f3`

	{config[macs2]} callpeak \
	-t {input.ip} -c {input.control}\
	-f BED -n peaks/{params.prefix}\
	-g {config[macs_g]} -p 1e-2 --broad --nomodel --shift 0 --extsize $fraglen \
	--keep-dup all

	sort -k 8gr,8gr {output.broadPeak} | \
	awk 'BEGIN{{OFS="\t"}}{{$4="Peak_"NR ; print $0}}'| \
	gzip -nc > {output.broadPeak_compressed}

	sort -k 14gr,14gr peaks/{params.prefix}_peaks.gappedPeak | \
	awk 'BEGIN{{OFS="\t"}}{{$4="Peak_"NR ; print $0}}'| \
	gzip -nc > {output.gappedPeak_compressed}


	rm -f peaks/{params.prefix}_peaks.xls peaks/{params.prefix}_summits.bed

	"""

macs2 From line 102 of TF/5_peak_calling_macs_tf_single

# Builds fold-enrichment and p-value signal tracks (bigWig) from MACS2 bedGraphs.
#   1. `bdgcmp -m FE`   -> clip to chromosome bounds -> bigWig (fold change).
#   2. Compute the scaling factor sval = min(ChIP reads, control reads) in millions.
#   3. `bdgcmp -m ppois -S sval` -> clip to chromosome bounds -> bigWig (p-value).
# Every external tool is resolved the same way in both halves ({config[macs2]},
# `bedtools slop`, {config[path]}/bedClip, {config[path]}/bedGraphToBigWig).
# Previously the p-value half used bare `macs2`, `slopBed`, `bedClip` and
# `bedGraphToBigWig`, which only works when those happen to be on PATH and may pick
# different binaries than the fold-enrichment half.
shell:
	"""
	{config[macs2]} bdgcmp  \
	-t {input.treat} -c {input.control_lambda} \
	--outdir peaks/signals -o {params.prefix}_FE.bdg -m FE

	bedtools slop -i {output.fe_bdg} -g {config[chrom]} -b 0 |\
	awk '{{ if($3!=-1) print $0}}' | \
	{config[path]}/bedClip stdin {config[chrom]} {output.fc_signal_bdg}


	{config[path]}/bedGraphToBigWig {output.fc_signal_bdg} {config[chrom]} {output.fc_signal}


	chipReads=$(zcat {input.ip} | wc -l | awk '{{printf "%f", $1/1000000}}');
	controlReads=$(zcat {input.control} | wc -l | awk '{{printf "%f", $1/1000000}}');

	sval=$(echo "${{chipReads}} ${{controlReads}}" | awk '$1>$2{{printf "%f",$2}} $1<=$2{{printf "%f",$1}}');

	{config[macs2]} bdgcmp -t {input.treat} -c {input.control_lambda} --outdir peaks/signals -o {params.prefix}_ppois.bdg -m ppois -S ${{sval}}

	bedtools slop -i {output.ppois} -g {config[chrom]} -b 0 | \
	awk '{{if ($3 != -1) print $0}}' |  \
	{config[path]}/bedClip stdin {config[chrom]} {output.pval_bdg}

	{config[path]}/bedGraphToBigWig {output.pval_bdg} {config[chrom]} {output.pval}

	"""

BEDTools macs2 bedGraphToBigWig ucsc-bedclip From line 145 of TF/5_peak_calling_macs_tf_single

# Calls broad peaks with MACS2 on each of the two pseudoreplicates and writes ranked,
# gzipped broadPeak and gappedPeak files for both.
# Per pseudoreplicate (name prefix {params.prefix}, then {params.prefix_2}):
#   1. `macs2 callpeak --broad` using the fragment length from column 3 of the
#      cross-correlation score file.
#   2. broadPeak: sort by column 8 (descending), rename peaks Peak_<rank>, gzip.
#   3. gappedPeak: sort by column 14 (descending), rename peaks Peak_<rank>, gzip.
# Step 3 previously re-sorted the broadPeak file, which has no column 14 and left
# broadPeak records in the gappedPeak outputs. It now reads the `_peaks.gappedPeak`
# file MACS2 writes for the same `-n` prefix.
# NOTE(review): the gappedPeak paths are derived from the `-n` arguments because
# this snippet shows no named gappedPeak outputs; switch to named outputs if the
# rule declares them.
shell:
	"""
	fraglen=`cat {input.cc_scores} | cut -f3`

	{config[macs2]} callpeak \
	  -t {input.ip_psr1} -c {input.control}\
	  -f BED -n peaks/{params.prefix}\
	  -g {config[macs_g]} -p 1e-2 --broad --nomodel --shift 0 --extsize $fraglen \
	  --keep-dup all

	sort -k 8gr,8gr {output.broadPeak_psr1} | \
	awk 'BEGIN{{OFS="\t"}}{{$4="Peak_"NR ; print $0}}'| \
	gzip -nc > {output.broadPeak_psr1_compressed}

	sort -k 14gr,14gr peaks/{params.prefix}_peaks.gappedPeak | \
	awk 'BEGIN{{OFS="\t"}}{{$4="Peak_"NR ; print $0}}'| \
	gzip -nc > {output.gappedPeak_psr1_compressed}


	rm -f peaks/{params.prefix}_peaks.xls peaks/{params.prefix}_summits.bed


	{config[macs2]} callpeak \
	-t {input.ip_psr2} -c {input.control}\
	-f BED -n peaks/{params.prefix_2}\
	-g {config[macs_g]} -p 1e-2 --broad --nomodel --shift 0 --extsize $fraglen \
	--keep-dup all

	sort -k 8gr,8gr {output.broadPeak_psr2} | \
	awk 'BEGIN{{OFS="\t"}}{{$4="Peak_"NR ; print $0}}'| \
	gzip -nc > {output.broadPeak_psr2_compressed}

	sort -k 14gr,14gr peaks/{params.prefix_2}_peaks.gappedPeak | \
	awk 'BEGIN{{OFS="\t"}}{{$4="Peak_"NR ; print $0}}'| \
	gzip -nc > {output.gappedPeak_psr2_compressed}


	rm -f peaks/{params.prefix_2}_peaks.xls peaks/{params.prefix_2}_summits.bed
	"""

macs2 From line 197 of TF/5_peak_calling_macs_tf_single

run:
	import os, sys, time,subprocess
	sum_len = 0
	n_files = 0
	for files in {input.cc_scores}:
		for file in files:
			n_files+=1
			with open(file,'r') as f:
				sum_len+=int(f.readlines()[0].split("\t")[2])
	fraglen=int(sum_len/n_files)

	shell("""
	{config[macs2]} callpeak \

macs2 From line 253 of TF/5_peak_calling_macs_tf_single

run:
	import os, sys, time,subprocess
	sum_len = 0
	n_files = 0
	for files in {input.cc_scores}:
		for file in files:
			n_files+=1
			with open(file,'r') as f:
				sum_len+=int(f.readlines()[0].split("\t")[2])
	fraglen=int(sum_len/n_files)

	shell("""

	{config[macs2]} callpeak \

macs2 From line 295 of TF/5_peak_calling_macs_tf_single

	# Builds fold-enrichment and p-value signal tracks (bigWig) from MACS2 bedGraphs.
	#   1. `bdgcmp -m FE`   -> clip to chromosome bounds -> bigWig (fold change).
	#   2. Compute the scaling factor sval = min(ChIP reads, control reads) in millions.
	#   3. `bdgcmp -m ppois -S sval` -> clip to chromosome bounds -> bigWig (p-value).
	# Every external tool is resolved the same way in both halves ({config[macs2]},
	# `bedtools slop`, {config[path]}/bedClip, {config[path]}/bedGraphToBigWig).
	# Previously the p-value half used bare `macs2`, `slopBed`, `bedClip` and
	# `bedGraphToBigWig`, which only works when those happen to be on PATH and may
	# pick different binaries than the fold-enrichment half.
	shell:
		"""
		{config[macs2]} bdgcmp  \
                 -t {input.treat} -c {input.control_lambda} \
                 --outdir peaks/signals -o {params.prefix}_FE.bdg -m FE

		bedtools slop -i {output.fe_bdg} -g {config[chrom]} -b 0 |\
		awk '{{ if($3!=-1) print $0}}' | \
                {config[path]}/bedClip stdin {config[chrom]} {output.fc_signal_bdg}


		{config[path]}/bedGraphToBigWig {output.fc_signal_bdg} {config[chrom]} {output.fc_signal}


		chipReads=$(zcat {input.ip} | wc -l | awk '{{printf "%f", $1/1000000}}');
		controlReads=$(zcat {input.control} | wc -l | awk '{{printf "%f", $1/1000000}}');

		sval=$(echo "${{chipReads}} ${{controlReads}}" | awk '$1>$2{{printf "%f",$2}} $1<=$2{{printf "%f",$1}}');

		{config[macs2]} bdgcmp -t {input.treat} -c {input.control_lambda} --outdir peaks/signals -o {params.prefix}_ppois.bdg -m ppois -S ${{sval}}

		bedtools slop -i {output.ppois} -g {config[chrom]} -b 0 | \
		awk '{{if ($3 != -1) print $0}}' |  \
		{config[path]}/bedClip stdin {config[chrom]} {output.pval_bdg}

		{config[path]}/bedGraphToBigWig {output.pval_bdg} {config[chrom]} {output.pval}

		"""

BEDTools macs2 bedGraphToBigWig ucsc-bedclip From line 347 of TF/5_peak_calling_macs_tf_single

shell:
	"""
	fraglen=`cat {input.cc_scores} | cut -f3`

	#REP
	Rscript {config[path]}/run_spp.R -c={input.ip} -i={input.control} -p={threads} -npeak=300000 -odir=peaks_spp/{params.prefix} -speak=$fraglen -savr -savp -rf -out=peaks_spp/{params.prefix}/{params.prefix}.tagAlign.ccscores 

	zcat {output.narrowPeak_raw} | \
	awk 'BEGIN{{OFS="\t"}}{{ if ($2<0) $2=0; print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}' | \
	gzip -f -nc > {output.narrowPeak_unfiltered}

	bedtools intersect -v -a <(zcat -f {output.narrowPeak_unfiltered}) -b <(zcat -f {config[path]}/blacklist_hg19.bed.gz) | \
	awk 'BEGIN{{OFS="\t"}} {{if ($5>1000) $5=1000; print $0}}' | \
	grep -P 'chr[\dXY]+[ \t]' | \
	gzip -nc > {output.narrowPeak_final}


	#SELF-PSEUDOREP1
	Rscript {config[path]}/run_spp.R -c={input.ip_psr1} -i={input.control} -p={threads} -npeak=300000 -odir=peaks_spp/psr00_{params.prefix} -speak=$fraglen -savr -savp -rf -out=peaks_spp/psr00_{params.prefix}/psr_{params.prefix}.00.tagAlign.ccscores 

	zcat {output.narrowPeak_raw_0} | \
	awk 'BEGIN{{OFS="\t"}}{{ if ($2<0) $2=0; print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}' | \
	gzip -f -nc > {output.narrowPeak_unfiltered_0}

	bedtools intersect -v -a <(zcat -f {output.narrowPeak_unfiltered_0}) -b <(zcat -f {config[path]}/blacklist_hg19.bed.gz) | \
	awk 'BEGIN{{OFS="\t"}} {{if ($5>1000) $5=1000; print $0}}' | \
	grep -P 'chr[\dXY]+[ \t]' | \
	gzip -nc > {output.narrowPeak_final_0}

	#SELF-PSEUDOREP2
	Rscript {config[path]}/run_spp.R -c={input.ip_psr2} -i={input.control} -p={threads} -npeak=300000 -odir=peaks_spp/psr01_{params.prefix} -speak=$fraglen -savr -savp -rf -out=peaks_spp/psr01_{params.prefix}/psr_{params.prefix}.01.tagAlign.ccscores 

	zcat {output.narrowPeak_raw_1} | \
	awk 'BEGIN{{OFS="\t"}}{{ if ($2<0) $2=0; print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}}' | \
	gzip -f -nc > {output.narrowPeak_unfiltered_1}

	bedtools intersect -v -a <(zcat -f {output.narrowPeak_unfiltered_1}) -b <(zcat -f {config[path]}/blacklist_hg19.bed.gz) | \
	awk 'BEGIN{{OFS="\t"}} {{if ($5>1000) $5=1000; print $0}}' | \
	grep -P 'chr[\dXY]+[ \t]' | \
	gzip -nc > {output.narrowPeak_final_1}
	"""

BEDTools From line 23 of TF/5_peak_calling_spp_single

# Runs IDR on the two pseudoreplicate peak sets and writes thresholded peak lists.
#   1. `idr` ranks peaks by signal.value, using {input.narrowPeak} as the peak list.
#   2. The threshold is converted to -log10 scale so it can be compared to column 12
#      of the IDR output.
#   3. Peaks passing the threshold are reduced to their first 10 columns,
#      de-duplicated, sorted by column 7 and gzipped into {output.idr_raw}.
#   4. Blacklisted regions are removed, only lines matching chr[0-9XY]+ followed by
#      whitespace are kept, column 5 is capped at 1000, and the result is gzipped
#      into {output.idr_final}.
# The awk program in step 4 was missing its closing single quote, so the rest of the
# script (including `| gzip` and `sleep`) was swallowed into the awk argument and the
# shell aborted on the unterminated quote.
# NOTE(review): NPEAKS_IDR is computed but not used in this snippet; kept in case a
# log or downstream step relies on it.
shell:
	"""
	idr --samples {input.narrowPeak_psr1} {input.narrowPeak_psr2} --peak-list {input.narrowPeak} --input-file-type narrowPeak --output-file idr/{params.prefix} --rank signal.value --soft-idr-threshold {params.threshold} --plot --use-best-multisummit-IDR

	IDR_THRESH_TRANSFORMED=`awk -v p={params.threshold} 'BEGIN{{print -log(p)/log(10)}}'`

	awk 'BEGIN{{OFS="\t"}} $12>='"$IDR_THRESH_TRANSFORMED"' {{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}}' idr/{params.prefix}.overlapped-peaks.txt | sort | uniq | sort -k7n,7n | gzip -nc > {output.idr_raw}

	NPEAKS_IDR=`zcat {output.idr_raw} | wc -l`

	bedtools intersect -v -a {output.idr_raw} -b {config[path]}/blacklist_hg19.bed.gz | grep -P 'chr[\dXY]+[ \t]' | awk 'BEGIN{{OFS="\t"}} {{if ($5>1000) $5=1000; print $0}}' | gzip -nc > {output.idr_final}

	sleep 120
	"""

BEDTools From line 17 of TF/6_idr

	shell:
		"""
		bedtools intersect \
                     -a {input.narrowPeak} -b {input.narrowPeak_psr1} -f 0.50 -F 0.50 -e -u|\
                     bedtools intersect \
                     -a stdin -b {input.narrowPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.narrow}

		sleep 120
		"""

BEDTools From line 14 of TF/6_overlap

	shell:
		"""
		bedtools intersect \
                     -a {input.broadPeak} -b {input.broadPeak_psr1} -f 0.50 -F 0.50 -e -u|\
                     bedtools intersect \
                     -a stdin -b {input.broadPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.broad}


		bedtools intersect \
                     -a {input.gappedPeak} -b {input.gappedPeak_psr1} -f 0.50 -F 0.50 -e -u|\
                     bedtools intersect \
                     -a stdin -b {input.gappedPeak_psr2} -f 0.50 -F 0.50 -e -u > {output.gapped}

		sleep 120
		"""