#!/usr/bin/env nextflow
nextflow.enable.dsl = 2

include { SHARED as SH } from './subworkflows/shared'
include { SHORT_READS as SR } from './subworkflows/short_reads'
include { DATABASES } from './subworkflows/00_databases'
include { FASTQC_HIFI as S04_HIFI_FASTQC } from './modules/fastqc'
include { HIFI_QUAST as S04_HIFI_QUAST } from './modules/metaquast'
include { GET_SOFTWARE_VERSIONS } from './modules/get_software_versions'
include { MULTIQC } from './modules/multiqc'
/*
 * Define helpMessage
 */

def helpMessage() {

  log.info """
   Usage:

     The typical command for running the pipeline is as follows:
       nextflow run -profile standard main.nf --input 'samplesheet.csv' --skip_host_filter --skip_kaiju 
     Mandatory arguments:
       --input [path]                Sample sheet: CSV file with one line per sample: sample,fastq_1[,fastq_2][,assembly (for HiFi)]

     Options:
     
       --stop_at_clean               Stop the pipeline at this step
       --skip_clean                  Skip this step.
       --adapter1                    Sequence of adapter 1. Default: Illumina TruSeq adapter.
       --adapter2                    Sequence of adapter 2. Default: Illumina TruSeq adapter.
       --skip_sickle                 Skip sickle process.
       --quality_type                Type of quality values for sickle (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)). Default: 'sanger'.
       --skip_host_filter            Skip filter host reads process.
       --host_fasta                  Full path to fasta of host genome ("PATH/name_genome.fasta").
       --host_bwa_index              Full path to directory containing BWA index including base name i.e ("PATH/name_genome.{amb,ann,bwt,pac,sa}").
       You must use one of --skip_host_filter, --host_fasta or --skip_clean, otherwise an error will be raised.
       --skip_kaiju                  Skip taxonomic affiliation of reads with kaiju.
       --kaiju_verbose               Allow the generation of kaiju verbose output (file can be large)
       --kaiju_db_dir                Directory with kaiju database already built ("PATH/directory").
       --kaiju_db_url                Indicate kaiju database you want to build. Default: "https://kaiju.binf.ku.dk/database/kaiju_db_refseq_2021-02-26.tgz".
       You must use one of --kaiju_db_url, --kaiju_db_dir or --skip_kaiju, otherwise an error will be raised.
     
       --stop_at_assembly            Stop the pipeline at this step
       --assembly                    Indicate the assembly tool ["metaspades" or "megahit" ]. Default: "metaspades".
       --metaspades_mem [mem_value]  Memory (in GB) used by the metaspades process. Default: 440.
       --stop_at_filtering           Stop the pipeline at this step
       --skip_filtering              Skip this step
       --min_contigs_cpm [cutoff]    CPM cutoff (Count Per Million) to filter contigs with low number of reads. Default: 10.

     S04_STRUCTURAL_ANNOT options:
       --stop_at_structural_annot    Stop the pipeline at this step

       --diamond_bank                Path to diamond bank used to align protein sequence of genes: "PATH/bank.dmnd".
                                     This bank must be previously built with diamond makedb.
       
       --skip_func_annot             Skip this step
       --percentage_identity [nb]    Sequence identity threshold. Default: 0.95 corresponding to 95%. Use a number between 0 and 1.
       --eggnog_mapper_db_download   Download and build the eggNOG-mapper database. Default: false (metagWGS does not build this database by default).
       --eggnog_mapper_db_dir        Path to an already built eggNOG-mapper database ("PATH/database_directory/"). Default: false.
       You must use either --eggnog_mapper_db_download or --eggnog_mapper_db_dir, otherwise an error will be raised.
       
       --skip_taxo_affi              Skip this step
       --accession2taxid             FTP address of the file prot.accession2taxid.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz".
       --taxdump                     FTP address of the file taxdump.tar.gz. Default: "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz".
       --taxonomy_dir                Directory containing already downloaded taxdump and accession2taxid files ("PATH/directory").

     Other options:
       --outdir                      The output directory where the results will be saved: "dir_name". Default "results".
       --help                        Show this message and exit.
       --multiqc_config              If you want to change the configuration of multiqc report: "PATH/multiqc.yaml". Default "$baseDir/assets/multiqc_config.yaml".

       -profile                      Configuration profile to use.
                                     Available: singularity (use of Singularity containers), conda (use of conda environments), genotoul (use of metagWGS on genologin cluster),
                                     different test profiles (test_genotoul_workq, test_genotoul_testq, big_test_genotoul, test_local) and debug profile.
   """.stripIndent()
 }
// Show help message.

if (params.help){
  helpMessage()
  exit 0
}


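/*
 * Check that the samplesheet header matches sample,fastq_1[,fastq_2][,assembly].
 * A minimal, purely illustrative samplesheet (sample names and paths are hypothetical):
 *   sample,fastq_1,fastq_2
 *   sampleA,/path/to/sampleA_R1.fastq.gz,/path/to/sampleA_R2.fastq.gz
 */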
def getAndCheckHeader() {
    File file = new File(params.input)
    assert file.exists() : "${params.input} file not found"
    def line="";
    file.withReader { reader ->
        line = reader.readLine()
    }
    def tab = line.split(/,/)
    if (! ((tab[0] == "sample") && (tab[1] == "fastq_1") )) {
        exit 1, 'Error 1 while checking samplesheet format: the header line must be sample,fastq_1[,fastq_2][,assembly]'
    }
    if (tab.size() == 3 ){
        if (!((tab[2] == "fastq_2") || (tab[2] == "assembly")))  {
            exit 1, 'Error 2 while checking samplesheet format: the header line must be sample,fastq_1[,fastq_2][,assembly]'
        }
    }
    if (tab.size() == 4) {
        if ( ! ((tab[2] == "fastq_2") && (tab[3] == "assembly")))  {
            exit 1, 'Error 3 while checking samplesheet format: the header line must be sample,fastq_1[,fastq_2][,assembly]'
        }
    }
    return tab
}

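// Resolve a samplesheet path to a file object; return null when the column is empty
// and exit with an error if the referenced file does not exist.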
def returnFile(it) {
    if (it == null) {
        return null
    } else {
        if (!file(it).exists()) exit 1, "Missing file in CSV file: ${it}, see --help for more information"
    }
    return file(it)
}

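// Case-insensitive test of whether a path ends with the given extension.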
def hasExtension(it, extension) {
    it.toString().toLowerCase().endsWith(extension.toLowerCase())
}

  ////////////
  // Start check samplesheet
  ////////////
  if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
  skip_clean = params.skip_clean
  if (!['metaspades','megahit'].contains(params.assembly)){
    exit 1, "Invalid assembly tool option: ${params.assembly}. Valid options: 'metaspades', 'megahit'"
  }
  if (!["solexa","illumina","sanger"].contains(params.quality_type)){
    exit 1, "Invalid quality_type option: ${params.quality_type}. Valid options: 'solexa', 'illumina', 'sanger'"
  }
  if (!(params.skip_host_filter) && !(params.host_fasta) && !(params.skip_clean)) {
    exit 1, "You must specify --host_fasta, skip the host filtering step with --skip_host_filter, or skip all cleaning and QC modules with --skip_clean"
  }
  if ( !(params.stop_at_structural_annot) && !(params.diamond_bank) ) {
      exit 1, "You must specify --stop_at_structural_annot or specify a diamond bank with --diamond_bank"
  }
  header = getAndCheckHeader()

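  // Parse the samplesheet: each row becomes a map holding the sample name, the read
  // file(s), a 'paired' flag (true when fastq_2 is present) and an optional assembly.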
  Channel.from(file(params.input)
    .splitCsv ( header:true, sep:',' ) )
    .map { row ->
           def sample  = row.sample
           def paired    = false
           if (row.fastq_2 != null) {
              paired    = true
           }
           if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) {
              exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint."
           }
           ["sample":row.sample,
            "fastq_1":returnFile(row.fastq_1),
            "fastq_2":returnFile(row.fastq_2),
            "paired": paired,
            "assembly":returnFile(row.assembly) ]
          }
      .set  { ch_inputs }

  ////////////
  // End check samplesheet  
  ////////////

  // Databases
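  // The DATABASES subworkflow prepares the reference resources used downstream:
  // host genome fasta and BWA index, kaiju, eggNOG-mapper and taxonomy databases.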
  ch_host_fasta = Channel.empty()
  ch_host_index = Channel.empty()
  ch_kaiju_db = Channel.empty()
  ch_eggnog_db = Channel.empty()
  ch_taxonomy = Channel.empty()

  DATABASES (skip_clean)
  ch_host_fasta = DATABASES.out.host_fasta
  ch_host_index = DATABASES.out.host_index
  ch_kaiju_db = DATABASES.out.kaiju_db
  ch_eggnog_db = DATABASES.out.eggnog
  ch_taxonomy = DATABASES.out.taxonomy
  ch_multiqc_config = Channel.empty()

  // SR only report
  ch_cutadapt_report = Channel.empty()
  ch_sickle_report = Channel.empty()
  ch_before_filter_report = Channel.empty()
  ch_after_filter_report = Channel.empty()
  ch_fastqc_raw_report = Channel.empty()
  ch_fastqc_clean_report = Channel.empty()
  ch_kaiju_report = Channel.empty()
  ch_dedup_report = Channel.empty()
  ch_assembly_report = Channel.empty()
  ch_filtered_report = Channel.empty()

  // HIFI only report
  ch_hifi_fastqc_report = Channel.empty()
  ch_hifi_quast_report = Channel.empty()

  // Shared report
  ch_prokka_report = Channel.empty()
  ch_quant_report = Channel.empty()
  ch_v_eggnogmapper = Channel.empty()
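  // Branch on --type: short reads (SR) go through the SR subworkflow, while HiFi
  // assemblies and reads are QC'ed directly; each branch fills only its own report channels.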
  if ( params.type.toUpperCase() == "SR" ) {
    ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true)
  	println("Entering SR")
      .map { item -> [ item.sample, item.fastq_1, item.fastq_2 ] }
      .set { ch_reads }
    ch_inputs
      .map { item -> [ item.sample, item.paired ] }
      .set { ch_paired }
    //ch_reads.view{ it -> "${it}" }
    //ch_paired.view{ it -> "${it}" }
    SR (
      ch_reads,
      ch_paired,
      ch_host_fasta,
      ch_host_index,
      ch_kaiju_db
    )
    ch_reads = SR.out.dedup
    ch_assembly = SR.out.assembly

    ch_cutadapt_report = SR.out.cutadapt_report
    ch_sickle_report = SR.out.sickle_report
    ch_before_filter_report = SR.out.before_filter_report
    ch_after_filter_report = SR.out.after_filter_report
    ch_fastqc_raw_report = SR.out.fastqc_raw_report
    ch_fastqc_clean_report = SR.out.fastqc_clean_report
    ch_kaiju_report = SR.out.kaiju_report
    ch_dedup_report = SR.out.dedup_report
    ch_assembly_report = SR.out.assembly_report
    ch_filtered_report = SR.out.filtered_report
  } else if ( params.type.toUpperCase() == "HIFI" ) {

    ch_multiqc_config = file(params.hifi_multiqc_config, checkIfExists: true)
  	println("Entering HiFi")
    ch_inputs.map { item -> [ item.sample, item.assembly ] } // [sample, assembly] 
             .set { ch_assembly }

    ch_inputs.map { item -> [ item.sample, item.fastq_1 ] } // [sample, reads] 
         .set { ch_reads }
    S04_HIFI_FASTQC( ch_reads )
    S04_HIFI_QUAST( ch_assembly )
    ch_hifi_fastqc_report = S04_HIFI_FASTQC.out.zip
    ch_hifi_quast_report = S04_HIFI_QUAST.out.report
  } else {
    exit 1, "Invalid type option: ${params.type}. Valid options are 'HiFi' for long reads, 'SR' for short reads"
  }
  SH (
    ch_reads,
    ch_assembly,
    ch_eggnog_db,
    ch_taxonomy
  )

  ch_prokka_report = SH.out.prokka_report
  ch_quant_report = SH.out.quant_report
  ch_v_eggnogmapper = SH.out.v_eggnogmapper

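  // Collect tool versions (including eggNOG-mapper when available) into a YAML file for MultiQC.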
  GET_SOFTWARE_VERSIONS( ch_v_eggnogmapper.ifEmpty([]).first() )
  ch_software_versions = GET_SOFTWARE_VERSIONS.out.yaml

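  // Aggregate all per-tool reports into a single MultiQC report; ifEmpty([]) keeps
  // MULTIQC runnable when a branch left some of the report channels empty.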
  MULTIQC (
    ch_multiqc_config,
    ch_software_versions,
    ch_cutadapt_report.collect().ifEmpty([]),
    ch_sickle_report.collect().ifEmpty([]),
    ch_before_filter_report.collect().ifEmpty([]),
    ch_after_filter_report.collect().ifEmpty([]),
    ch_fastqc_raw_report.collect().ifEmpty([]),
    ch_fastqc_clean_report.collect().ifEmpty([]),
    ch_kaiju_report.collect().ifEmpty([]),
    ch_dedup_report.collect().ifEmpty([]),
    ch_assembly_report.collect().ifEmpty([]),
    ch_filtered_report.collect().ifEmpty([]),
    ch_hifi_fastqc_report.collect().ifEmpty([]),
    ch_hifi_quast_report.collect().ifEmpty([]),
    ch_prokka_report.collect().ifEmpty([]),
    ch_quant_report.collect().ifEmpty([])
  )
  multiqc_report = MULTIQC.out.report