Merge pull request #33 from Juke34/salmon

Juke34 · web-flow · commit 0ae338a39230 · 2025-03-19T12:01:03.000+01:00
Salmon
diff --git a/README.md b/README.md
@@ -48,21 +48,22 @@ You can choose to run one or several aligner in parallel.
 | bbmap | ✅ | ✅ | ⚠️ | ⚠️ |
 | bowtie | ✅ | ✅ | ⚠️ | ⚠️ |
 | bowtie2 | ✅ | ✅ | ⚠️ | ⚠️ |
-| bwaaln | ✅ | ✅ R1 and R2 independently aligned then merged with bwa sampe | ✅ | ✅ |
-| bwamem | ✅ | ✅ | ⚠️ | ⚠️ |
-| bwamem2 | ✅ | ✅ | ⚠️ | ⚠️ |
+| bwaaln | ✅ | ✅ R1 and R2 independently aligned then merged with bwa sampe | ⚠️ | ⚠️ |
+| bwamem | ✅ | ✅ | ✅ | ✅ |
+| bwamem2 | ✅ | ✅ | ✅ | ✅ |
 | bwasw | ✅ | ✅ | ⚠️ | ⚠️ |
 | graphmap2 | ⚠️ | ⚠️ R1 and R2 independently aligned then merged with cat | ✅ | ✅ |
 | hisat2 | ✅ | ✅ | ⚠️ | ⚠️ |
 | kallisto | ✅ | ✅ | ⚠️ | ⚠️ |
 | minimap2 | ⚠️ | ⚠️ | ✅ | ✅ |
-| ngmlr | ⚠️ | 🚫 | ✅ | ✅ |
+| ngmlr | ⚠️ | ⚠️ R1 and R2 independently aligned then merged with cat | ✅ | ✅ |
 | novoalign | ✅ | ✅ | ✅ | ⚠️ |
 | nucmer | ✅ | ✅ R1 and R2 are concatenated then aligned | ⚠️ | ⚠️ |
+| salmon | ✅ | ✅ | ⚠️ | ⚠️ |
 | star | ✅ | ✅ | ✅ use STARlong | ✅ use STARlong |
 | star 2pass mode | ✅ | ✅ | ⚠️ | ⚠️ |
 | subread | ✅ | ✅ | ⚠️ | ⚠️ |
-| sublong | ⚠️ | 🚫 | ✅ | ✅ |
+| sublong | ⚠️ | ⚠️ R1 and R2 independently aligned then merged with cat | ✅ | ✅ |
 
 *Legend*  
 ✅ Recommended  
@@ -94,6 +95,7 @@ It is then translated to the correct option in the following aligners:
 | ngmlr | 🚫 | 🚫 | 🚫 |
 | novoalign | 🚫 | 🚫 | 🚫 |
 | nucmer | 🚫 | 🚫 | 🚫 |
+| salmon | U SR SF IU MU OU ISF ISR MSF MSR OSR OSF | identical | strand information and read orientation | 
 | star | 🚫 | 🚫 | 🚫 |
 | star 2pass mode | 🚫 | 🚫 | 🚫 |
 | subread | -S fr / -S rf / -S ff | ISF ISR IU / OSF OSR OU / MSF MSR MU | read orientation |
@@ -128,6 +130,7 @@ If you provide an annotation file the pipeline will pass automatically the file
 | ngmlr | 🚫 |
 | novoalign | 🚫 |
 | nucmer | 🚫 |
+| salmon | 🚫 |
 | star | GTF / GFF ( --sjdbGTFfile + --sjdbGTFtagExonParentTranscript Parent in case of GFF ) |
 | star 2pass mode | GTF / GFF (--sjdbGTFfile + --sjdbGTFtagExonParentTranscript Parent in case of GFF ) |
 | subread | GTF or compatible GFF format (-a) |
@@ -343,18 +346,22 @@ On success you should get a message looking like this:
         --bowtie2_options           additional options for bowtie2
         --bwaaln_options            additional options for bwaaln
         --bwamem_options            additional options for bwamem
-        --bwamem2_options            additional options for bwamem2
+        --bwamem2_options           additional options for bwamem2
         --bwasw_options             additional options for bwasw
         --graphmap2_options         additional options for graphmap2
         --hisat2_options            additional options for hisat2
         --kallisto_options          additional options for kallisto
+        --kallisto_index_options    additional options for kallisto index
         --minimap2_options          additional options for minimap2 (default: -a (to get sam output))
         --minimap2_index_options    additional options for minimap2 index
         --ngmlr_options             additional options for ngmlr
         --novoalign_options         additional options for novoalign
         --novoalign_license         license for novoalign. You can ask for one month free trial license at http://www.novocraft.com/products/novoalign/
         --nucmer_options            additional options for nucmer
+        --salmon_options            additional options for salmon
+        --salmon_index_options      additional options for salmon index
         --star_options              additional options for star
+        --star_index_options        additional options for star index
         --star_2pass                set to true to run STAR in 2pass mode (default: false)
         --read_length               [Optional][used by STAR] length of the reads, if none provided it is automatically deduced
         --subread_options           additional options for subread
diff --git a/aline.nf b/aline.nf
diff --git a/modules/graphmap2.nf b/modules/graphmap2.nf
@@ -44,15 +44,16 @@ process graphmap2 {
         path "*graphmap2.log",  emit: graphmap2_summary
 
     script:
+        // catch filename
         fileName = reads[0].baseName.replace('.fastq','')
-        read_file=reads[0]
+
         // Check if the owler option is set
         if ( params.graphmap2_options.contains("owler") ){
 
             if (params.read_type == "short_paired"){
                 // For paired-end we concat output 
                 """
-                graphmap2 ${params.graphmap2_options} -t ${task.cpus} -r ${read_file} -d ${read_file}  -o ${fileName}_graphmap2.mhap 2> ${fileName}_graphmap2.log
+                graphmap2 ${params.graphmap2_options} -t ${task.cpus} -r ${reads[0]} -d ${reads[0]}  -o ${fileName}_graphmap2.mhap 2> ${fileName}_graphmap2.log
                 graphmap2 ${params.graphmap2_options} -t ${task.cpus} -r ${reads[1]} -d ${reads[1]}  -o ${reads[1].baseName}_graphmap2.mhap 2> ${reads[1].baseName}_graphmap2.log
                 cat ${fileName}_graphmap2.mhap > ${fileName}_graphmap2_concatR1R2.mhap
                 rm ${fileName}_graphmap2.mhap
@@ -61,7 +62,7 @@ process graphmap2 {
                 """
             } else {
                 """
-                graphmap2 ${params.graphmap2_options} -t ${task.cpus} -r ${read_file} -d ${read_file}  -o ${fileName}_graphmap2.mhap 2> ${fileName}_graphmap2.log
+                graphmap2 ${params.graphmap2_options} -t ${task.cpus} -r ${reads[0]} -d ${reads[0]}  -o ${fileName}_graphmap2.mhap 2> ${fileName}_graphmap2.log
                 """
             }
         }
@@ -76,7 +77,7 @@ process graphmap2 {
             if (params.read_type == "short_paired"){
                 
                 """
-                graphmap2 ${graphmap2_options} -i ${graphmap2_index_files} -t ${task.cpus} -r ${genome} -d ${read_file}  -o ${fileName}_graphmap2.sam 2> ${fileName}_graphmap2.log
+                graphmap2 ${graphmap2_options} -i ${graphmap2_index_files} -t ${task.cpus} -r ${genome} -d ${reads[0]}  -o ${fileName}_graphmap2.sam 2> ${fileName}_graphmap2.log
                 graphmap2 ${graphmap2_options} -i ${graphmap2_index_files} -t ${task.cpus} -r ${genome} -d ${reads[1]}  -o ${reads[1].baseName}_graphmap2.sam 2> ${reads[1].baseName}_graphmap2.log
                 
                 # Merge sam
@@ -88,7 +89,7 @@ process graphmap2 {
             } else {
                 """
                 
-                graphmap2 ${graphmap2_options} -i ${graphmap2_index_files} -t ${task.cpus} -r ${genome} -d ${read_file}  -o ${fileName}_graphmap2.sam 2> ${fileName}_graphmap2.log
+                graphmap2 ${graphmap2_options} -i ${graphmap2_index_files} -t ${task.cpus} -r ${genome} -d ${reads[0]}  -o ${fileName}_graphmap2.sam 2> ${fileName}_graphmap2.log
                 """
             }
         }
diff --git a/modules/kallisto.nf b/modules/kallisto.nf
@@ -55,6 +55,7 @@ process kallisto {
             } 
         }
 
+        // For paired-end reads, Kallisto automatically estimates the fragment length distribution from the data and does not require you to specify it manually
         if (params.read_type == "short_paired"){
             """
             kallisto quant  ${read_orientation} ${params.kallisto_options} \
@@ -70,7 +71,7 @@ process kallisto {
         } else {
             
             // Use read length (-l) and sd (-s) from params?
-            def l_s_params = params.kallisto_options
+            def l_s_params = ""
             def read_length_copy = read_length // to avoid error "Variable read_length already defined in the process scope "
             if ( !params.kallisto_options.contains("-l ") ){
                 l_s_params += " -l ${read_length}"
@@ -82,7 +83,8 @@ process kallisto {
             }
 
             """
-            kallisto quant  ${read_orientation} ${l_s_params} \
+            kallisto quant  ${read_orientation} ${params.kallisto_options} \
+                ${l_s_params} \
                 -t ${task.cpus} \
                 --pseudobam \
                 -i ${kallisto_index} \
diff --git a/modules/ngmlr.nf b/modules/ngmlr.nf
@@ -19,11 +19,25 @@ process ngmlr {
         path "*.log",  emit: ngmlr_summary
 
     script:
-
+        // catch filename
         fileName = reads[0].baseName.replace('.fastq','')
 
-        """
-        ngmlr ${params.ngmlr_options} -t ${task.cpus} -r ${genome} -q ${reads} -o ${fileName}_ngmlr.sam 2> ${fileName}_ngmlr.log 
-        """
+        // For paired-end we concat output 
+        if (params.read_type == "short_paired"){
+            """
+            ngmlr ${params.ngmlr_options} -t ${task.cpus} -r ${genome} -q ${reads[0]} -o ${fileName}_ngmlr.sam 2> ${fileName}_ngmlr.log 
+            ngmlr ${params.ngmlr_options} -t ${task.cpus} -r ${genome} -q ${reads[1]} -o ${reads[1].baseName}_ngmlr.sam 2> ${fileName}_ngmlr.log 
+            
+            # Merge sam
+            cat ${fileName}_ngmlr.sam > ${fileName}_ngmlr_concatR1R2.sam
+            rm ${fileName}_ngmlr.sam
+            awk '!/^@HD/ && !/^@SQ/ && !/^@RG/ && !/^@PG/ && !/^@CO/ && NF' ${reads[1].baseName}_ngmlr.sam >> ${fileName}_ngmlr_concatR1R2.sam
+            rm ${reads[1].baseName}_ngmlr.sam
+            """
+        } else {
+            """
+            ngmlr ${params.ngmlr_options} -t ${task.cpus} -r ${genome} -q ${reads[0]} -o ${fileName}_ngmlr.sam 2> ${fileName}_ngmlr.log 
+            """
+        }
 
 }
diff --git a/modules/salmon.nf b/modules/salmon.nf
@@ -14,7 +14,7 @@ process salmon_index {
 
     script:
         """
-        salmon index -t ${genome_fasta} -i salmon_index --threads ${task.cpus}
+        salmon index ${params.salmon_index_options} -t ${genome_fasta} -i salmon_index --threads ${task.cpus}
         """
 }
 
@@ -65,4 +65,72 @@ process set_tuple_withUserLib{
 
         """
         """
+}
+
+//  Use salmon as aligner - mappings are written to stdout (--writeMappings) and captured as a sam file
+process salmon {
+    label 'salmon'
+    publishDir "${params.outdir}/${outpath}", pattern: "*/*.json", mode: 'copy'
+
+    input:
+        // sample id, fastq file(s) (fastq[0] = R1, fastq[1] = R2 when paired), library type handed to -l, read length
+        tuple val(sample), path(fastq), val(library), val(read_length)
+        path salmon_index
+        val outpath
+
+    output:
+        tuple val(sample), path ("*.sam"), emit: tuple_sample_sam
+        path "*.log",  emit: salmon_summary
+
+    script:
+
+        // set input according to read_type parameter: -1/-2 for paired reads, -r for everything else
+        def input =  "-r ${fastq[0]}"
+        if (params.read_type == "short_paired"){
+            input =  "-1 ${fastq[0]} -2 ${fastq[1]}"
+        }
+
+        // deal with library type: only passed when the user did not already set it in
+        // salmon_options and library usage has not been disabled
+        def read_orientation=""
+        if (! params.salmon_options.contains("-l ") && ! params.salmon_options.contains("--libType ") &&
+            ! params.skip_libray_usage){
+                read_orientation = "-l ${library}"
+        }
+
+        // catch filename (used for the salmon output directory, the sam file and the log file)
+        def filename = "${fastq[0].baseName.replace('.fastq','')}"
+
+        // Salmon automatically estimates the fragment length distribution for paired-end reads (like Kallisto),
+        // so --fldMean / --fldSD are only added for the other read types, and only when the user did not set them.
+        def l_s_params = ""
+        if (params.read_type != "short_paired"){
+            def read_length_copy = read_length // to avoid error "Variable read_length already defined in the process scope "
+            if ( !params.salmon_options.contains("--fldMean ") ){
+                l_s_params += " --fldMean ${read_length}"
+            }
+            if ( !params.salmon_options.contains("--fldSD ") ){
+                // 10% of read length will be used as Estimated standard deviation of fragment length
+                def tenPercent = (read_length_copy.toInteger() * 10 / 100) as int
+                l_s_params += " --fldSD ${tenPercent}"
+            }
+        }
+
+        // Single command for every read type: l_s_params is empty for paired-end reads.
+        // --threads is the documented option name (same spelling as in salmon_index).
+        """
+                salmon quant -i ${salmon_index} ${params.salmon_options} \
+                    ${l_s_params} \
+                    ${read_orientation} \
+                    ${input} \
+                    --threads ${task.cpus} \
+                    --writeMappings \
+                    --output ${filename} > ${filename}.sam 2> ${filename}.log
+        """
+}
diff --git a/modules/samtools.nf b/modules/samtools.nf
@@ -24,7 +24,6 @@ process samtools_sam2bam_nucmer {
 
 }
 
-
 process samtools_sam2bam {
     label 'samtools'
     tag "$sample"
@@ -41,6 +40,23 @@ process samtools_sam2bam {
             samtools view -@ ${task.cpus} ${sam} -b -o ${sam.baseName}.bam 
         """
 
+}
+// Merge all bam files staged for a sample into a single <first bam basename>_concatR1R2.bam
+process samtools_merge_bam {
+    label 'samtools'
+    tag "$sample"
+
+    input:
+        // sample id and the bam file(s) to merge
+        tuple val(sample), path(bam)
+
+    output:
+        tuple val(sample), path ("*.bam"), emit: tuple_sample_bam
+
+    script:
+        // name of the merged file, derived from the first bam of the list
+        def merged_bam = "${bam[0].baseName}_concatR1R2.bam"
+
+        """
+            samtools merge -@ ${task.cpus} ${merged_bam} *.bam
+        """
+
+}
 /*
 http://www.htslib.org/doc/samtools-sort.html
diff --git a/modules/subread.nf b/modules/subread.nf
@@ -108,7 +108,7 @@ process sublong {
     publishDir "${params.outdir}/${outpath}", pattern: "*.log", mode: 'copy'
 
     input:
-        tuple val(sample), path(fastq), val(library), val(read_length)
+        tuple val(sample), path(reads), val(library), val(read_length)
         path genome
         path index
         val outpath
@@ -120,12 +120,22 @@ process sublong {
     script:
 
         // remove fastq.gz
-        def fileName = fastq[0].baseName.replace('.fastq','') + "_sublong"
+        def fileName = reads[0].baseName.replace('.fastq','') + "_sublong"
         
         // prepare index name
         def index_prefix = genome.baseName + "_index"
 
-        """
-        sublong -T ${task.cpus} -i ${index_prefix} -r ${fastq} -o ${fileName}.bam ${params.sublong_options} > ${fileName}_sublong.log 
-        """
+
+
+        // For paired-end we concat output 
+        if (params.read_type == "short_paired"){
+            """
+            sublong -T ${task.cpus} -i ${index_prefix} -r ${reads[0]} -o ${fileName}.bam ${params.sublong_options} > ${fileName}_sublong.log 
+            sublong -T ${task.cpus} -i ${index_prefix} -r ${reads[1]} -o ${reads[1].baseName}.bam ${params.sublong_options} > ${fileName}_sublong.log 
+            """
+        } else {
+            """
+            sublong -T ${task.cpus} -i ${index_prefix} -r ${reads[0]} -o ${fileName}.bam ${params.sublong_options} > ${fileName}_sublong.log 
+            """
+        }
 }
diff --git a/profiles/test_illumina_paired.config b/profiles/test_illumina_paired.config
@@ -7,7 +7,7 @@
 params {
     reads = "$baseDir/test/illumina/"
     genome = "$baseDir/test/yeast.fa"
-    aligner = 'bbmap,bowtie,bowtie2,bwaaln,bwamem,bwamem2,bwasw,graphmap2,hisat2,minimap2,nucmer,star,subread'
+    aligner = 'bbmap,bowtie,bowtie2,bwaaln,bwamem,bwamem2,bwasw,graphmap2,hisat2,minimap2,ngmlr,nucmer,salmon,star,subread,sublong'
     star_options = "--genomeSAindexNbases 9" // the default 14 is too large for the genome size=1351857
     multiqc_config = "$baseDir/config/multiqc_conf.yml"
 }
diff --git a/profiles/test_illumina_single.config b/profiles/test_illumina_single.config
@@ -8,10 +8,11 @@ params {
     reads = "$baseDir/test/illumina/"
     genome = "$baseDir/test/yeast.fa"
     params.read_type = "short_single"
-    aligner = 'bbmap,bowtie,bowtie2,bwaaln,bwamem,bwamem2,bwasw,graphmap2,hisat2,kallisto,minimap2,ngmlr,nucmer,star,subread,sublong'
+    aligner = 'bbmap,bowtie,bowtie2,bwaaln,bwamem,bwamem2,bwasw,graphmap2,hisat2,kallisto,minimap2,ngmlr,nucmer,salmon,star,subread,sublong'
     trimming_fastp = true
     fastqc = true
     samtools_stats = true
+    salmon_options ="--minAssignedFrags 1"
     star_options = "--genomeSAindexNbases 9" // the default 14 is too large for the genome size=1351857
     multiqc_config = "$baseDir/config/multiqc_conf.yml"
 }
diff --git a/profiles/test_pacbio.config b/profiles/test_pacbio.config
@@ -8,7 +8,8 @@ params {
     reads = "$baseDir/test/pacbio/"
     genome = "$baseDir/test/yeast.fa"
     read_type = "pacbio"
-    aligner = 'bbmap,bowtie,bowtie2,,bwamem,bwamem2,graphmap2,hisat2,kallisto,minimap2,ngmlr,nucmer,star,subread,sublong'
+    aligner = 'bbmap,bowtie,bowtie2,,bwamem,bwamem2,graphmap2,hisat2,kallisto,minimap2,ngmlr,nucmer,salmon,star,subread,sublong'
+    salmon_options ="--minAssignedFrags 1"
     star_options = '--outFilterMismatchNmax 100 --seedSearchLmax 30   --seedSearchStartLmax 30 --seedPerReadNmax 100000 --seedPerWindowNmax 100 --alignTranscriptsPerReadNmax 100000 --alignTranscriptsPerWindowNmax 10000'
     star_index_options = '--genomeSAindexNbases 9'
     multiqc_config = "$baseDir/config/multiqc_conf.yml"

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`params {`
`8`	`8`	`reads = "$baseDir/test/illumina/"`
`9`	`9`	`genome = "$baseDir/test/yeast.fa"`
`10`		`- aligner = 'bbmap,bowtie,bowtie2,bwaaln,bwamem,bwamem2,bwasw,graphmap2,hisat2,minimap2,nucmer,star,subread'`
	`10`	`+ aligner = 'bbmap,bowtie,bowtie2,bwaaln,bwamem,bwamem2,bwasw,graphmap2,hisat2,minimap2,ngmlr,nucmer,salmon,star,subread,sublong'`
`11`	`11`	`star_options = "--genomeSAindexNbases 9" // the default 14 is too large for the genome size=1351857`
`12`	`12`	`multiqc_config = "$baseDir/config/multiqc_conf.yml"`
`13`	`13`	`}`