Newer
Older
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2
include { SHARED as SH } from './subworkflows/shared'
include { SHORT_READS as SR } from './subworkflows/short_reads'
include { DATABASES } from './subworkflows/00_databases'
include { FASTQC_HIFI as S04_HIFI_FASTQC } from './modules/fastqc'
include { HIFI_QUAST as S04_HIFI_QUAST } from './modules/metaquast'
include { GET_SOFTWARE_VERSIONS } from './modules/get_software_versions'
include { MULTIQC } from './modules/multiqc'
/*
* Define helpMessage
*/
def helpMessage() {
log.info"""
Usage:
The typical command for running the pipeline is as follows:
nextflow run -profile standard main.nf --input 'samplesheet.csv' --skip_host_filter --skip_kaiju
Mandatory arguments:
--input [path] Sample sheet: csv file with samples: sample,fastq_1,fastq_2,fasta[for HIFI]
Options:
MARTIN Pierre
committed
S01_CLEAN_QC options:
--stop_at_clean Stop the pipeline at this step
--skip_clean Skip this step.
--adapter1 Sequence of adapter 1. Default: Illumina TruSeq adapter.
--adapter2 Sequence of adapter 2. Default: Illumina TruSeq adapter.
--skip_sickle Skip sickle process.
--quality_type Type of quality values for sickle (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)). Default: 'sanger'.
--skip_host_filter Skip filter host reads process.
--host_fasta Full path to fasta of host genome ("PATH/name_genome.fasta").
--host_bwa_index Full path to directory containing BWA index including base name i.e ("PATH/name_genome.{amb,ann,bwt,pac,sa}").
You need to use --skip_host_filter or --host_fasta or --skip_01_clean_qc. If it is not the case, an error message will occur.
--skip_kaiju Skip taxonomic affiliation of reads with kaiju.
--kaiju_verbose Allow the generation of kaiju verbose output (file can be large)
--kaiju_db_dir Directory with kaiju database already built ("PATH/directory").
--kaiju_db_url Indicate kaiju database you want to build. Default: "https://kaiju.binf.ku.dk/database/kaiju_db_refseq_2021-02-26.tgz".
You need to use --kaiju_db_url or --kaiju_db_dir or --skip_kaiju. If it is not the case, an error message will occur.
MARTIN Pierre
committed
S02_ASSEMBLY options:
--stop_at_assembly Stop the pipeline at this step
--assembly Indicate the assembly tool ["metaspades" or "megahit" ]. Default: "metaspades".
--metaspades_mem [mem_value] Memory (in G) used by metaspades process. Default: 440.
MARTIN Pierre
committed
S03_FILTERING options:
--stop_at_filtering Stop the pipeline at this step
--skip_filtering Skip this step
--min_contigs_cpm [cutoff] CPM cutoff (Count Per Million) to filter contigs with low number of reads. Default: 10.
S04_STRUCTURAL_ANNOT options:
--stop_at_structural_annot Stop the pipeline at this step
MARTIN Pierre
committed
S05_ALIGNMENT options:
--diamond_bank Path to diamond bank used to align protein sequence of genes: "PATH/bank.dmnd".
This bank must be previously built with diamond makedb.
MARTIN Pierre
committed
S06_FUNC_ANNOT options:
--skip_func_annot Skip this step
--percentage_identity [nb] Sequence identity threshold. Default: 0.95 corresponding to 95%. Use a number between 0 and 1.
--eggnog_mapper_db_download Flag --eggnog_mapper_db_download to build the database. Default false: metagWGS didn't build this database.
--eggnog_mapper_db_dir Path to eggnog-mapper database "PATH/database_directory/" if it is already built. Default: false.
You need to use --eggnog_mapper_db_download or --eggnog_mapper_db_dir. If it is not the case, an error message will occur.
MARTIN Pierre
committed
S07_TAXO_AFFI options:
--skip_taxo_affi Skip this step
--accession2taxid FTP adress of file prot.accession2taxid.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz".
--taxdump FTP adress of file taxdump.tar.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz".
--taxonomy_dir Directory if taxdump and accession2taxid already downloaded ("PATH/directory").
Other options:
--outdir The output directory where the results will be saved: "dir_name". Default "results".
--help Show this message and exit.
--multiqc_config If you want to change the configuration of multiqc report: "PATH/multiqc.yaml". Default "$baseDir/assets/multiqc_config.yaml".
-profile Configuration profile to use.
Available: singularity (use of Singularity containers), conda (use of conda environments), genotoul (use of metagWGS on genologin cluster),
different test profiles (test_genotoul_workq, test_genotoul_testq, big_test_genotoul, test_local) and debug profile.
""".stripIndent()
}
// Show help message.
if (params.help){
helpMessage()
exit 0
}
def getAndCheckHeader() {
File file = new File(params.input)
assert file.exists() : "${params.input} file not found"
def line="";
file.withReader { reader ->
line = reader.readLine()
}
def tab = line.split(/,/)
if (! ((tab[0] == "sample") && (tab[1] == "fastq_1") )) {
MARTIN Pierre
committed
exit 1, 'Error 1 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line'
}
if (tab.size() == 3 ){
if (!((tab[2] == "fastq_2") || (tab[2] == "assembly"))) {
MARTIN Pierre
committed
exit 1, 'Error 2 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line'
}
}
if (tab.size() == 4) {
if ( ! ((tab[2] == "fastq_2") && (tab[3] == "assembly"))) {
MARTIN Pierre
committed
exit 1, 'Error 3 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line'
}
}
return tab
}
def returnFile(it) {
if (it == null) {
return null
} else {
if (!file(it).exists()) exit 1, "Missing file in CSV file: ${it}, see --help for more information"
}
return file(it)
}
def hasExtension(it, extension) {
it.toString().toLowerCase().endsWith(extension.toLowerCase())
}
// Check mandatory parameters
////////////
// Start check samplesheet
////////////
if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
skip_clean = params.skip_clean
if (params.type == 'SR') {
if (!['metaspades','megahit'].contains(params.assembly)){
exit 1, "Invalid short read aligner option: ${params.assembly}. Valid options: 'metaspades', 'megahit'"
}
if (!["solexa","illumina","sanger"].contains(params.quality_type)){
exit 1, "Invalid quality_type option: ${params.quality_type}. Valid options:'solexa','illumina','sanger'"
}
if (!(params.skip_host_filter) && !(params.host_fasta) && !(params.skip_clean)) {
exit 1, "You must specify --host_fasta or skip cleaning host step with option --skip_host_filter or skip all clean and qc modules with --skip_clean"
}
else {
skip_clean = true
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
if ( !(params.stop_at_structural_annot) && !(params.diamond_bank) ) {
exit 1, "You must specify --stop_at_structural_annot or specify a diamond bank with --diamond_bank"
}
header = getAndCheckHeader()
Channel.from(file(params.input)
.splitCsv ( header:true, sep:',' ) )
.map { row ->
def sample = row.sample
def paired = false
if (row.fastq_2 != null) {
paired = true
}
if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) {
exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint."
}
["sample":row.sample,
"fastq_1":returnFile(row.fastq_1),
"fastq_2":returnFile(row.fastq_2),
"paired": paired,
"assembly":returnFile(row.assembly) ]
}
.set { ch_inputs }
////////////
// End check samplesheet
////////////
// Databases
ch_host_fasta = Channel.empty()
ch_host_index = Channel.empty()
ch_kaiju_db = Channel.empty()
ch_eggnog_db = Channel.empty()
ch_taxonomy = Channel.empty()
DATABASES (skip_clean)
ch_host_fasta = DATABASES.out.host_fasta
ch_host_index = DATABASES.out.host_index
ch_eggnog_db = DATABASES.out.eggnog
ch_taxonomy = DATABASES.out.taxonomy
ch_multiqc_config = Channel.empty()
// SR only report
ch_cutadapt_report = Channel.empty()
ch_sickle_report = Channel.empty()
ch_before_filter_report = Channel.empty()
ch_after_filter_report = Channel.empty()
ch_fastqc_raw_report = Channel.empty()
ch_fastqc_clean_report = Channel.empty()
ch_kaiju_report = Channel.empty()
ch_dedup_report = Channel.empty()
ch_assembly_report = Channel.empty()
ch_filtered_report = Channel.empty()
// HIFI only report
ch_hifi_fastqc_report = Channel.empty()
ch_hifi_quast_report = Channel.empty()
// Shared report
ch_prokka_report = Channel.empty()
ch_quant_report = Channel.empty()
ch_v_eggnogmapper = Channel.empty()
ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true)
.map { item -> [ item.sample, item.fastq_1, item.fastq_2 ] }
ch_inputs
.map { item -> [ item.sample, item.paired ] }
.set { ch_paired }
//ch_reads.view{ it -> "${it}" }
//ch_paired.view{ it -> "${it}" }
SR (
ch_reads,
ch_paired,
ch_host_fasta,
ch_host_index,
ch_kaiju_db
)
ch_reads = SR.out.dedup
ch_assembly = SR.out.assembly
ch_cutadapt_report = SR.out.cutadapt_report
ch_sickle_report = SR.out.sickle_report
ch_before_filter_report = SR.out.before_filter_report
ch_after_filter_report = SR.out.after_filter_report
ch_fastqc_raw_report = SR.out.fastqc_raw_report
ch_fastqc_clean_report = SR.out.fastqc_clean_report
ch_kaiju_report = SR.out.kaiju_report
ch_dedup_report = SR.out.dedup_report
ch_assembly_report = SR.out.assembly_report
ch_filtered_report = SR.out.filtered_report
else if ( params.type.toUpperCase() == "HIFI" ) {
ch_multiqc_config = file(params.hifi_multiqc_config, checkIfExists: true)
ch_inputs.map { item -> [ item.sample, item.assembly ] } // [sample, assembly]
.set { ch_assembly }
ch_inputs.map { item -> [ item.sample, item.fastq_1 ] } // [sample, reads]
.set { ch_reads }
MARTIN Pierre
committed
S04_HIFI_FASTQC( ch_reads )
S04_HIFI_QUAST( ch_assembly )
ch_hifi_fastqc_report = S04_HIFI_FASTQC.out.zip
ch_hifi_quast_report = S04_HIFI_QUAST.out.report
else {
exit 1, "Invalid type option: ${params.type}. Valid options are 'HiFi' for long-read, 'SR' for short-read"
SH (
ch_reads,
ch_assembly,
ch_eggnog_db,
ch_taxonomy
)
ch_prokka_report = SH.out.prokka_report
ch_quant_report = SH.out.quant_report
ch_v_eggnogmapper = SH.out.v_eggnogmapper
GET_SOFTWARE_VERSIONS( ch_v_eggnogmapper.ifEmpty([]).first() )
ch_software_versions = GET_SOFTWARE_VERSIONS.out.yaml
MULTIQC (
ch_multiqc_config,
ch_software_versions,
ch_cutadapt_report.collect().ifEmpty([]),
ch_sickle_report.collect().ifEmpty([]),
ch_before_filter_report.collect().ifEmpty([]),
ch_after_filter_report.collect().ifEmpty([]),
ch_fastqc_raw_report.collect().ifEmpty([]),
ch_fastqc_clean_report.collect().ifEmpty([]),
ch_kaiju_report.collect().ifEmpty([]),
ch_dedup_report.collect().ifEmpty([]),
ch_assembly_report.collect().ifEmpty([]),
ch_filtered_report.collect().ifEmpty([]),
ch_hifi_fastqc_report.collect().ifEmpty([]),
ch_hifi_quast_report.collect().ifEmpty([]),
ch_prokka_report.collect().ifEmpty([]),
ch_quant_report.collect().ifEmpty([])
)
multiqc_report = MULTIQC.out.report