Newer
Older
#!/usr/bin/env nextflow
/*
========================================================================================
metagWGS
========================================================================================
metagWGS Analysis Pipeline.
#### Homepage / Documentation
https://forgemia.inra.fr/genotoul-bioinfo/metagwgs/
----------------------------------------------------------------------------------------
*/
nextflow.enable.dsl = 2
include { DATABASES } from './subworkflows/00_databases'
include { STEP_01_CLEAN_QC as S01_CLEAN_QC } from './subworkflows/01_clean_qc'
include { STEP_02_ASSEMBLY as S02_ASSEMBLY } from './subworkflows/02_assembly'
include { STEP_03_ASSEMBLY_FILTER as S03_FILTERING } from './subworkflows/03_filtering'
include { STEP_04_STRUCTURAL_ANNOT as S04_STRUCTURAL_ANNOT } from './subworkflows/04_structural_annot'
include { STEP_05_PROTEIN_ALIGNMENT as S05_PROTEIN_ALIGNMENT } from './subworkflows/05_protein_alignment'
include { STEP_06_FUNC_ANNOT as S06_FUNC_ANNOT } from './subworkflows/06_functionnal_annot'
include { STEP_07_TAXO_AFFI as S07_TAXO_AFFI } from './subworkflows/07_taxonomic_affi'
include { STEP_08_BINNING as S08_BINNING } from './subworkflows/08_binning'
include { GET_SOFTWARE_VERSIONS } from './modules/get_software_versions'
include { MULTIQC } from './modules/multiqc'
/*
* Define helpMessage
*/
def helpMessage() {
log.info"""
Usage:
The typical command for running the pipeline is as follows:
nextflow run -profile standard main.nf --input 'samplesheet.csv' --skip_host_filter --skip_kaiju
Mandatory arguments:
--input [path] Sample sheet: csv file with samples: sample,fastq_1,fastq_2,fasta[for HIFI]
--type Indicate the type of the sequencing data, "SR" : short-read Illumina data use, "HIFI" : long-read PacBio HiFi data
MARTIN Pierre
committed
S01_CLEAN_QC options:
--stop_at_clean Stop the pipeline at this step
--skip_clean Skip this step.
--adapter1 Sequence of adapter 1. Default: Illumina TruSeq adapter.
--adapter2 Sequence of adapter 2. Default: Illumina TruSeq adapter.
--use_sickle Use sickle process for high quality trimming.
--quality_type Type of quality values for sickle (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)). Default: 'sanger'.
--skip_host_filter Skip filter host reads process.
--host_fasta Full path to fasta of host genome ("PATH/name_genome.fasta").
--host_index Full path to directory containing BWA-MEM2 index including base name i.e ("PATH/name_genome.{0123,amb,ann,bwt.2bit.64,pac,sa}").
You need to use --skip_host_filter or --host_fasta or --skip_01_clean_qc. If it is not the case, an error message will occur.
--skip_kaiju Skip taxonomic affiliation of reads with kaiju.
--kaiju_verbose Allow the generation of kaiju verbose output (file can be large)
--kaiju_db_dir Directory with kaiju database already built ("PATH/directory").
--kaiju_db_url Indicate kaiju database you want to build. Default: "https://kaiju.binf.ku.dk/database/kaiju_db_refseq_2021-02-26.tgz".
You need to use --kaiju_db_url or --kaiju_db_dir or --skip_kaiju. If it is not the case, an error message will occur.
MARTIN Pierre
committed
S02_ASSEMBLY options:
--stop_at_assembly Stop the pipeline at this step
--assembly Indicate the assembly tool for short reads ["metaspades" or "megahit" ]. Default: "metaspades".
or for HiFi reads ["hifiasm-meta", "metaflye"]. Default: "hifiasm-meta".
--coassembly Assemble together samples labeled with the same group in the samplesheet
MARTIN Pierre
committed
S03_FILTERING options:
--stop_at_filtering Stop the pipeline at this step
--skip_filtering Skip this step
--min_contigs_cpm [cutoff] CPM cutoff (Count Per Million) to filter contigs with low number of reads. Default: 1.
S04_STRUCTURAL_ANNOT options:
--stop_at_structural_annot Stop the pipeline at this step
--diamond_bank Path to diamond bank used to align protein sequence of genes: "PATH/bank.dmnd".
This bank must be previously built with diamond makedb.
MARTIN Pierre
committed
S06_FUNC_ANNOT options:
--skip_func_annot Skip this step
--percentage_identity [nb] Sequence identity threshold. Default: 0.95 corresponding to 95%. Use a number between 0 and 1.
--eggnog_mapper_db_download Flag --eggnog_mapper_db_download to build the database. Default false: metagWGS didn't build this database.
--eggnog_mapper_db_dir Path to eggnog-mapper database "PATH/database_directory/" if it is already built. Default: false.
You need to use --eggnog_mapper_db_download or --eggnog_mapper_db_dir. If it is not the case, an error message will occur.
MARTIN Pierre
committed
S07_TAXO_AFFI options:
--skip_taxo_affi Skip this step
--accession2taxid Path or FTP adress of file prot.accession2taxid.FULL.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.gz".
--taxdump Path or FTP adress of file taxdump.tar.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz".
--gtdbtk_bank Path to the GTDBTK database
--skip_binning Skip this step
--binning_cross_alignment Mapping strategy to compute co-abundances for binning: .
'individual': Each sample is aligned against the assembly of the sample in question
'group': The samples labeled with the same group in the samplesheet will be aligned against each assembly within the group of samples.
'all': All the samples will be aligned against all the assembly (WARNING: It could take a long time.) (default: individual).
DARBOT Vincent
committed
--min_completeness [nb] Minimum % of bins completeness for the bins to be kept after bin_refinement step. Default: 0 (all bins are kept).
According to MIMAG standards, 'Medium-quality' genomes require to have at least completeness estimates of ≥50% and less than 10% contamination.
--max_contamination [nb] Maximum % of bins contamination for the bins to be kept after bin_refinement step. Default: 100.
associated with a maximum contamination of 10% to be considered as "Medium-quality".
--drep_threshold [nb] Average Nucleotide Identity (ANI) threshold used for bins de-replication. Default: 0.95 corresponding to 95%. Use a number between 0 and 1.
Most studies agree that 95% ANI is an appropriate threshold for species-level de-replicationton. If the goal of dereplication is to generate a set
of genomes that are distinct when mapping short reads, 98% (0.98) ANI is an appropriate threshold.
Other options:
--outdir The output directory where the results will be saved: "dir_name". Default "results".
--help Show this message and exit.
--multiqc_config If you want to change the configuration of multiqc report: "PATH/multiqc.yaml". Default "$baseDir/assets/multiqc_config.yaml".
-profile Configuration profile to use.
Available: singularity (use of Singularity containers), conda (use of conda environments), genotoul (use of metagWGS on genologin cluster),
// Show help message.
if (params.help){
helpMessage()
exit 0
}
def getAndCheckHeader() {
File file = new File(params.input)
assert file.exists() : "${params.input} file not found"
def line="";
file.withReader { reader ->
line = reader.readLine()
}
def tab = line.split(/,/)
def list = ['sample','flowcell','group','fastq_1','fastq_2', 'assembly']
for (i in tab) {
if (!list.contains(i)) {
exit 1, 'Error 1 while check samplesheet format please enter sample[,flowcell],fastq_1[,fastq_2][,assembly] with header line'
}
if (!tab.contains("sample") || !tab.contains("fastq_1")){
exit 1, 'Error 1 while check samplesheet format please enter at least sample,fastq_1 with header line'
def header_size = tab.size()
file.eachLine { row ->
if (row.size()!=0){
str = row.split(/,/)
if (str.size()!=header_size) {
exit 1, 'Error 1 while check samplesheet format, each line must have the same number of columns'
for (val in str){
if (val.isEmpty()){
exit 1, 'Error 1 while check samplesheet format, missing elements '
}
def returnFile(it) {
if (it == null) {
return null
} else {
if (!file(it).exists()) exit 1, "Missing file in CSV file: ${it}, see --help for more information"
}
return file(it)
}
def hasExtension(it, extension) {
it.toString().toLowerCase().endsWith(extension.toLowerCase())
}
// Check mandatory parameters
if (params.type.toUpperCase() == 'SR') {
if ( assembly_tool == null || assembly_tool == ''){
assembly_tool = 'metaspades'
}
else if (!['metaspades','megahit'].contains(assembly_tool)){
exit 1, "Invalid short read assembler option: ${assembly_tool}. Valid options for short reads: 'metaspades', 'megahit'"
if (!["solexa","illumina","sanger"].contains(params.quality_type)){
exit 1, "Invalid quality_type option: ${params.quality_type}. Valid options:'solexa','illumina','sanger'"
}
if (!(params.skip_host_filter) && !(params.host_fasta) && !(params.skip_clean)) {
exit 1, "You must specify --host_fasta or skip cleaning host step with option --skip_host_filter or skip all clean and qc modules with --skip_clean"
}
else if (params.type.toUpperCase() == 'HIFI') {
if ( assembly_tool == null || assembly_tool == ""){
assembly_tool = 'hifiasm-meta'
}
else if (!['hifiasm-meta','metaflye'].contains(assembly_tool)){
exit 1, "Invalid long read assembler option: ${assembly_tool}. Valid options for HiFi reads: 'hifiasm-meta', 'metaflye'"
if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.diamond_bank) ) {
exit 1, "You must specify --stop_at_structural_annot or specify a diamond bank with --diamond_bank"
}
if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.skip_binning ) &&!(params.gtdbtk_bank) ) {
exit 1, "You must specify --skip_binning or specify a GTDB-TK bank with --gtdbtk_bank"
}
if ( params.coassembly && params.binning_cross_alignment == 'group'){
exit 1, "--binning_cross_alignment group must not be use use --coassembly."
}
////////////
// Start check samplesheet
////////////
if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
header = getAndCheckHeader()
Channel.from(file(params.input)
.splitCsv ( header:true, sep:',' ) )
.map { row ->
def sample = row.sample
if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) {
exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint."
}
if ((params.binning_cross_alignment == 'group' || params.coassembly) && row.group == null){
exit 1, "You must specify groups in the samplesheet if you want to use --binning_cross_alignment 'group' or --coassembly"
["sample":row.sample,
"flowcell":row.flowcell,
"fastq_1":returnFile(row.fastq_1),
"fastq_2":returnFile(row.fastq_2),
"assembly":returnFile(row.assembly) ]
}
.set { ch_inputs }
ch_inputs
.map { item ->
def meta = [:]
meta.id = item.sample
if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
meta.sample = item.sample
meta.flowcell = item.flowcell
meta.group = item.group
meta.assembly = item.assembly!=null
meta.type = params.type.toUpperCase()
if (meta.type=="SR"){
return [meta,[item.fastq_1,item.fastq_2]]
}
else if (meta.type=="HIFI"){
return [meta,[item.fastq_1]]
}
}
.set{ch_reads}
ch_inputs
.map { item ->
def meta = [:]
meta.id = item.sample
if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
meta.sample = item.sample
meta.flowcell = item.flowcell
meta.group = item.group
meta.assembly = item.assembly!=null
meta.type = params.type.toUpperCase()
has_assembly = (file(params.input).splitCsv ( header:true, sep:',' ).assembly[0] != null)
has_flowcell = (file(params.input).splitCsv ( header:true, sep:',' ).flowcell[0] != null)
////////////
// End check samplesheet
////////////
// Databases
ch_host_fasta = Channel.empty()
ch_host_index = Channel.empty()
ch_kaiju_db = Channel.empty()
ch_eggnog_db = Channel.empty()
ch_taxonomy = Channel.empty()
ch_gtbdtk_db = Channel.empty()
ch_host_fasta = DATABASES.out.host_fasta
ch_host_index = DATABASES.out.host_index
ch_eggnog_db = DATABASES.out.eggnog
ch_taxonomy = DATABASES.out.taxonomy
ch_gtbdtk_db = DATABASES.out.gtdbtk
ch_multiqc_config = Channel.empty()
/////////
///report
/////////
ch_cutadapt_report = Channel.empty()
ch_sickle_report = Channel.empty()
ch_before_host_filter_report = Channel.empty()
ch_fastqc_raw_report = Channel.empty()
ch_fastqc_clean_report = Channel.empty()
ch_kaiju_report = Channel.empty()
ch_idxstats = Channel.empty()
ch_unfilter_assembly_flagstat_report = Channel.empty()
ch_final_assembly_flagstat_report = Channel.empty()
ch_assembly_report = Channel.empty()
ch_filtered_report = Channel.empty()
ch_bins_abundances_report = Channel.empty()
ch_bins_stats_report = Channel.empty()
ch_quast_before_filter_report = Channel.empty()
ch_quast_after_filter_report = Channel.empty()
ch_software_versions = Channel.empty()
ch_software_versions_S01 = Channel.empty()
ch_software_versions_S02 = Channel.empty()
ch_software_versions_S04 = Channel.empty()
ch_software_versions_S05 = Channel.empty()
ch_software_versions_S06 = Channel.empty()
ch_software_versions_S07 = Channel.empty()
ch_software_versions_S08 = Channel.empty()
ch_software_versions_total = Channel.empty()
ch_multiqc_config = file(params.multiqc_config, checkIfExists: true)
if ( params.type.toUpperCase() != "SR" && params.type.toUpperCase() != "HIFI" ) {
exit 1, "Invalid type option: ${params.type}. Valid options are 'HiFi' for long-read, 'SR' for short-read"
}
if ( !params.skip_clean ) {
S01_CLEAN_QC (
ch_reads,
ch_host_fasta,
ch_host_index,
)
ch_cutadapt_report = S01_CLEAN_QC.out.cutadapt_report
ch_sickle_report = S01_CLEAN_QC.out.sickle_report
ch_before_host_filter_report = S01_CLEAN_QC.out.before_filter_report
ch_fastqc_raw_report = S01_CLEAN_QC.out.fastqc_raw_report
ch_fastqc_clean_report = S01_CLEAN_QC.out.fastqc_clean_report
ch_kaiju_report = S01_CLEAN_QC.out.kaiju_report
ch_software_versions_S01 = S01_CLEAN_QC.out.software_versions
if ( !params.stop_at_clean ) {
S02_ASSEMBLY ( ch_reads, ch_assembly, has_assembly, assembly_tool, has_flowcell )
ch_assembly = S02_ASSEMBLY.out.assembly
ch_idxstats = S02_ASSEMBLY.out.idxstats
ch_unfilter_assembly_flagstat_report = S02_ASSEMBLY.out.flagstat
ch_quast_before_filter_report = S02_ASSEMBLY.out.assembly_report
ch_software_versions_S02 = S02_ASSEMBLY.out.software_versions
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.skip_filtering ) {
ch_min_contigs_cpm = Channel.value(params.min_contigs_cpm)
ch_idxstats,
ch_bam,
ch_min_contigs_cpm,
ch_assembly = S03_FILTERING.out.selected_contigs
ch_quast_after_filter_report = S03_FILTERING.out.quast_report
ch_sam_coverage = S03_FILTERING.out.sam_coverage
ch_final_assembly_flagstat_report = S03_FILTERING.out.sam_flagstat
ch_annotation_ffn = Channel.empty()
ch_annotation_faa = Channel.empty()
ch_annotation_gff = Channel.empty()
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering ) {
S04_STRUCTURAL_ANNOT ( ch_assembly )
ch_annotation_ffn = S04_STRUCTURAL_ANNOT.out.ffn
ch_annotation_faa = S04_STRUCTURAL_ANNOT.out.faa
ch_annotation_gff = S04_STRUCTURAL_ANNOT.out.gff
ch_annotation_report = S04_STRUCTURAL_ANNOT.out.report
ch_software_versions_S04 = S04_STRUCTURAL_ANNOT.out.software_versions
ch_diamond_result = Channel.empty()
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && (!params.skip_func_annot || !params.skip_taxo_affi)) {
S05_PROTEIN_ALIGNMENT (ch_annotation_faa, ch_diamon_db)
ch_diamond_result = S05_PROTEIN_ALIGNMENT.out.diamond_result
ch_software_versions_S05 = S05_PROTEIN_ALIGNMENT.out.software_versions
ch_quant_report = Channel.empty()
ch_v_eggnogmapper = Channel.empty()
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot) {
S06_FUNC_ANNOT ( ch_annotation_ffn, ch_annotation_faa, ch_annotation_gff, ch_bam, ch_diamond_result, ch_eggnog_db )
ch_quant_report = S06_FUNC_ANNOT.out.quant_report
ch_software_versions_S06 = S06_FUNC_ANNOT.out.software_versions
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi) {
S07_TAXO_AFFI ( ch_taxonomy, ch_diamond_result, ch_sam_coverage)
ch_software_versions_S07 = S07_TAXO_AFFI.out.software_versions
if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_binning ) {
S08_BINNING( ch_reads, ch_assembly, ch_bam, ch_gtbdtk_db, ch_quast )
ch_bins_abundances_report = S08_BINNING.out.bins_abundances_report
ch_bins_stats_report = S08_BINNING.out.bins_stats_report
ch_software_versions_S08 = S08_BINNING.out.software_versions
ch_software_versions_total = ch_software_versions_S01.mix(ch_software_versions_S02,
ch_software_versions_S04,
ch_software_versions_S05,
ch_software_versions_S06,
ch_software_versions_S08).unique { it.getBaseName() }.collect()
GET_SOFTWARE_VERSIONS(ch_software_versions_total) // ch_v_eggnogmapper.ifEmpty([]).first()
ch_software_versions = GET_SOFTWARE_VERSIONS.out.yaml
MULTIQC (
ch_multiqc_config,
ch_software_versions,
ch_cutadapt_report.collect().ifEmpty([]),
ch_sickle_report.collect().ifEmpty([]),
ch_before_host_filter_report.collect().ifEmpty([]),
ch_fastqc_raw_report.collect().ifEmpty([]),
ch_fastqc_clean_report.collect().ifEmpty([]),
ch_kaiju_report.collect().ifEmpty([]),
ch_unfilter_assembly_flagstat_report.collect().ifEmpty([]),
ch_quast_before_filter_report.collect().ifEmpty([]),
ch_quast_after_filter_report.collect().ifEmpty([]),
ch_final_assembly_flagstat_report.collect().ifEmpty([]),
ch_bins_abundances_report.collect().ifEmpty([]),
ch_bins_stats_report.collect().ifEmpty([])
)
multiqc_report = MULTIQC.out.report