From 0db84a697ef53ec2946e5ede0d7165d1521561e7 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Tue, 4 Feb 2025 13:36:09 +0100 Subject: [PATCH 01/10] remove unapropriate mdbg references --- README.md | 2 +- docs/source/output.md | 2 +- docs/source/usage.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 31a795d..f243e25 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Many of these steps are optional and their necessity depends on the desired anal * controls the quality of raw and cleaned data ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) * makes a taxonomic classification of cleaned reads ([Kaiju MEM](https://github.com/bioinformatics-centre/kaiju) + [kronaTools](https://github.com/marbl/Krona/wiki/KronaTools) + [plot_kaiju_stat.py](bin/plot_kaiju_stat.py) + [merge_kaiju_results.py](bin/merge_kaiju_results.py)) * `S02_ASSEMBLY` - * assembles reads ([metaSPAdes](https://github.com/ablab/spades) or [Megahit](https://github.com/voutcn/megahit) or [Hifiasm_meta](https://github.com/lh3/hifiasm-meta), [metaFlye](https://github.com/fenderglass/Flye) or [metaMDBG](https://github.com/GaetanBenoitDev/metaMDBG)) + * assembles reads ([metaSPAdes](https://github.com/ablab/spades) or [Megahit](https://github.com/voutcn/megahit) or [Hifiasm_meta](https://github.com/lh3/hifiasm-meta), [metaFlye](https://github.com/fenderglass/Flye)) * assesses the quality of assembly ([metaQUAST](http://quast.sourceforge.net/metaquast)) * reads deduplication, alignment against contigs for short reads ([BWA-MEM2](https://github.com/bwa-mem2/bwa-mem2) + [Samtools](http://www.htslib.org/)) * reads alignment against contigs for HiFi reads ([Minimap2](https://github.com/lh3/minimap2) + [Samtools](http://www.htslib.org/)) diff --git a/docs/source/output.md b/docs/source/output.md index cbea201..e9ca730 100644 --- a/docs/source/output.md +++ b/docs/source/output.md @@ -53,7 +53,7 @@ The `results/` directory contains a sub-directory for each step launched: ```{note} In this directory you have either the results of assembly with `metaspades` or `megahit` -if you analyse short read data and `hifiasm-meta`, `metaflye` or `metamdbg` +if you analyse short read data and `hifiasm-meta` or `metaflye`. if you analyse HiFi data. You have chosen your assembly tool with `--assembly` parameter. ``` diff --git a/docs/source/usage.md b/docs/source/usage.md index c49c5b2..5aae59f 100644 --- a/docs/source/usage.md +++ b/docs/source/usage.md @@ -351,7 +351,7 @@ See [II. Input files](usage.md#ii-input-files) and warnings. | Parameter | Description | | ------------ | ----------- | -| `--assembly` | allows to indicate the assembly tool.<br /> For short reads: `["metaspades" or "megahit"]`: Default: `metaspades`.<br /> For HiFi reads: `["hifiasm-meta", "metaflye", "metamdbg"]`. Default: `hifiasm-meta`.| +| `--assembly` | allows to indicate the assembly tool.<br /> For short reads: `["metaspades" or "megahit"]`: Default: `metaspades`.<br /> For HiFi reads: `["hifiasm-meta", "metaflye"]`. Default: `hifiasm-meta`.| | `--coassembly` | allows to assemble together the samples labeled with the same group in the samplesheet. <br /> It will generate one assembly for each group. 
To co-assemble all of your samples together, <br /> you must indicate a unique group for each sample in the samplesheet.| ```{warning} @@ -362,7 +362,7 @@ for each group co-assembled and automatically mapping with every sample of his g * For short reads, the user can choose between `metaspades` or `megahit` for `--assembly` parameter. The choice can be based on CPUs and memory availability: `metaspades` needs more CPUs and memory than `megahit` but our tests showed that assembly metrics are better for `metaspades` than `megahit`. For PacBio HiFi reads, the user -can choose between `hifiasm-meta`, `metaflye` or `metamdbg`. +can choose between `hifiasm-meta` or `metaflye`. ``` ```{note} -- GitLab From 8b2275fd6a044cb10def4b381a3d660db329a754 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Tue, 4 Feb 2025 13:37:00 +0100 Subject: [PATCH 02/10] use external gtdbtk image --- conf/base.config | 1 + env/binning.yml | 1 - modules/gtdbtk.nf | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/base.config b/conf/base.config index aa3d2cd..65e0d69 100644 --- a/conf/base.config +++ b/conf/base.config @@ -105,6 +105,7 @@ process { cpus = 10 } withName: GTDBTK { + container = 'quay.io/biocontainers/gtdbtk:2.4.0--pyhdfd78af_2' memory = { 64.GB * task.attempt } cpus = 16 } diff --git a/env/binning.yml b/env/binning.yml index 77d3f73..45a2405 100644 --- a/env/binning.yml +++ b/env/binning.yml @@ -26,6 +26,5 @@ dependencies: - metabat2=2.15 - maxbin2=2.2.7 - concoct=1.1.0 - - gtdbtk=2.1 - drep=3.0.0 - pprodigal diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf index 73499ba..ba9b86d 100644 --- a/modules/gtdbtk.nf +++ b/modules/gtdbtk.nf @@ -15,7 +15,7 @@ process GTDBTK { export GTDBTK_DATA_PATH=$gtdbtk_db - gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --pplacer_cpus ${task.cpus} --cpus ${task.cpus} + gtdbtk classify_wf --genome_dir $drep_bins_folder -x fa --out_dir ./ --skip_ani_screen --pplacer_cpus ${task.cpus} --cpus ${task.cpus} echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt """ } \ No newline at end of file -- GitLab From d6ea7ddd7f2b5f3ed952a9e016f5570a54909212 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Thu, 16 Jan 2025 13:42:10 +0100 Subject: [PATCH 03/10] calcul cpm for each sample one by one reorder kept_contigs and idxstat --- bin/filter_contig_per_cpm.py | 123 ++++++++++++++++++++++++++--------- 1 file changed, 91 insertions(+), 32 deletions(-) diff --git a/bin/filter_contig_per_cpm.py b/bin/filter_contig_per_cpm.py index fdea68d..21f2fe6 100755 --- a/bin/filter_contig_per_cpm.py +++ b/bin/filter_contig_per_cpm.py @@ -35,7 +35,7 @@ import pyfastx def parse_arguments(): """Parse script arguments.""" - parser = ArgumentParser(description="...", + parser = ArgumentParser(description="Calculate CPM per contig", formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument("-i", "--samtools_idxstats", nargs='+', required = True, @@ -59,36 +59,77 @@ def parse_arguments(): args = parser.parse_args() return args -def combine_idxstat_files(idxstat_files): +def combine_idxstat_files(idxstat_dfs): """ - Combine multiple idxstat files that have the same contigs. + Combine multiple idxstat df that have the same contigs. Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences. 
""" + #columns_names = ['reference_sequence_name', + # 'sequence_length', + # '#_mapped_read_segments', + # '#_unmapped_read-segments'] + + combined_df = idxstat_dfs[0].copy() + + for idxstat_df in idxstat_dfs[1:]: + combined_df['#_mapped_read_segments'] += idxstat_df['#_mapped_read_segments'] + combined_df['cpm_count'] += idxstat_df['cpm_count'] + + return combined_df + + #idxstat_df = pd.read_csv(idxstat_files[0], + # sep ='\t', + # names = columns_names, + # usecols = ['reference_sequence_name', + # 'sequence_length', + # '#_mapped_read_segments',], + # comment="*").set_index('reference_sequence_name') + + #for idxstat_file in idxstat_files[1:]: + # other_idxstat_df = pd.read_csv(idxstat_file, + # sep ='\t', + # names = columns_names, + # usecols = ['reference_sequence_name', + # '#_mapped_read_segments',], + # comment="*").set_index('reference_sequence_name') + + # idxstat_df['#_mapped_read_segments'] += other_idxstat_df['#_mapped_read_segments'] + + #return idxstat_df + +def filter_cpm(idxstat_file, cpm_cutoff): + """ + Calculate and filter byCPM for each contig. + """ columns_names = ['reference_sequence_name', 'sequence_length', - '#_mapped_read_segments', + '#_mapped_read_segments', '#_unmapped_read-segments'] - idxstat_df = pd.read_csv(idxstat_files[0], - sep ='\t', + idxstat_df = pd.read_csv(idxstat_file, sep ='\t', names = columns_names, usecols = ['reference_sequence_name', 'sequence_length', '#_mapped_read_segments',], comment="*").set_index('reference_sequence_name') - - for idxstat_file in idxstat_files[1:]: - other_idxstat_df = pd.read_csv(idxstat_file, - sep ='\t', - names = columns_names, - usecols = ['reference_sequence_name', - '#_mapped_read_segments',], - comment="*").set_index('reference_sequence_name') - - idxstat_df['#_mapped_read_segments'] += other_idxstat_df['#_mapped_read_segments'] + + + logging.info(f'idxstat_file: {idxstat_df}') + + + sum_reads = idxstat_df['#_mapped_read_segments'].sum() + idxstat_df['cpm_count'] = 1e6 * (idxstat_df['#_mapped_read_segments'] / sum_reads) + + # Apply CPM cutoff + #filtered_idxstat_df = idxstat_df[idxstat_df['cpm_count'] >= cpm_cutoff] + kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index + + logging.info(f'length of idxstat_file: {len(idxstat_df)}') + logging.info(f'length of kept_contigs: {len(kept_contigs)}') + + return kept_contigs, idxstat_df - return idxstat_df def main(): args = parse_arguments() @@ -103,28 +144,46 @@ def main(): cpm_cutoff = float(args.cutoff_cpm) # Read input tables - idxstat_df = combine_idxstat_files(args.samtools_idxstats) - + # idxstat_df = combine_idxstat_files(args.samtools_idxstats) + + #Â logging.info(f'idxstat file: {idxstat_file}') + idxstat_results = [filter_cpm(idxstat_file, cpm_cutoff) for idxstat_file in args.samtools_idxstats] + + # Separate idxstat and kept_contigs + kept_contigs, idxstat_dfs = zip(*idxstat_results) + + logging.info(f'number of kept_contigs: {len(kept_contigs)}') + logging.info(f'number of idxstat_file: {len(idxstat_dfs)}') + + # Combine idxstat dataframes + #combined_idxstat_df = combine_idxstat_files(idxstat_dfs) + #logging.info(f'length of list_idxstat_df: {len(combined_idxstat_df)}') + #logging.info(f'head of list_idxstat_df[0]: {combined_idxstat_df.head()}') + # Calculates cpm for each contig - sum_reads = idxstat_df['#_mapped_read_segments'].sum() - logging.info(f'Total number of mapped reads {sum_reads}') - - logging.info(f'With a cpm cutoff of {args.cutoff_cpm}, contigs with less than {(sum_reads*cpm_cutoff)/1e6} reads are removed.') - 
idxstat_df['cpm_count'] = 1e6 * idxstat_df['#_mapped_read_segments']/sum_reads + #sum_reads = idxstat_df['#_mapped_read_segments'].sum() + #logging.info(f'Total number of mapped reads {sum_reads}') + #logging.info(f'With a cpm cutoff of {args.cutoff_cpm}, contigs with less than {(sum_reads*cpm_cutoff)/1e6} reads are removed.') + #idxstat_df['cpm_count'] = 1e6 * idxstat_df['#_mapped_read_segments']/sum_reads + + #list_idxstat_df.append(idxstat_df) + #logging.info(f'head of list_idxstat_df[1]: {list_idxstat_df[1].head()}') + + # Contigs with nb reads > cutoff - kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index + #kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index - logging.info(f'{len(kept_contigs)}/{len(idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.') + #logging.info(f'{len(kept_contigs)}/{len(idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.') # Write new fasta files with kept and unkept contigs - with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle: + # with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle: - for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False): - if contig in kept_contigs: - out_select_handle.write(f'>{contig}\n{seq}\n') - else: - out_discard_handle.write(f'>{contig}\n{seq}\n') + # for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False): + # if contig in kept_contigs: + # out_select_handle.write(f'>{contig}\n{seq}\n') + # else: + # out_discard_handle.write(f'>{contig}\n{seq}\n') if __name__ == "__main__": main() -- GitLab From d8d8c4e1a3aad2621ae38bb04c2f35758746b72a Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Thu, 16 Jan 2025 15:31:35 +0100 Subject: [PATCH 04/10] combine output of filter_cpm --- bin/filter_contig_per_cpm.py | 65 ++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/bin/filter_contig_per_cpm.py b/bin/filter_contig_per_cpm.py index 21f2fe6..e7e72a4 100755 --- a/bin/filter_contig_per_cpm.py +++ b/bin/filter_contig_per_cpm.py @@ -59,7 +59,7 @@ def parse_arguments(): args = parser.parse_args() return args -def combine_idxstat_files(idxstat_dfs): +def combine_idxstat_files_old(idxstat_dfs): """ Combine multiple idxstat df that have the same contigs. @@ -98,6 +98,44 @@ def combine_idxstat_files(idxstat_dfs): #return idxstat_df +def combine_idxstat_files(idxstat_dfs): + """ + Combine multiple idxstat df that have the same contigs. + + Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences. 
+ """ + #columns_names = ['reference_sequence_name', + # 'sequence_length', + # '#_mapped_read_segments', + # '#_unmapped_read-segments'] + + common_contigs = set(idxstat_dfs[0].index) + for df in idxstat_dfs[1:]: + common_contigs = common_contigs.intersection(set(df.index)) + + # Convert set to list for pandas indexing + common_contigs_list = sorted(list(common_contigs)) + + # Filter dataframes to keep only common contigs + filtered_dfs = [df.loc[common_contigs_list] for df in idxstat_dfs] + + # Combine dataframes + combined_df = filtered_dfs[0].copy() + + for df in filtered_dfs[1:]: + combined_df['#_mapped_read_segments'] += df['#_mapped_read_segments'] + combined_df['cpm_count'] += df['cpm_count'] + + return combined_df + +def combine_kept_contigs(kept_contigs_list): + """Combine multiple lists of kept contigs.""" + # Get contigs that appear in all lists (intersection) + common_contigs = set(kept_contigs_list[0]) + for contigs in kept_contigs_list[1:]: + common_contigs = common_contigs.intersection(set(contigs)) + return pd.Index(list(common_contigs)) + def filter_cpm(idxstat_file, cpm_cutoff): """ Calculate and filter byCPM for each contig. @@ -152,14 +190,18 @@ def main(): # Separate idxstat and kept_contigs kept_contigs, idxstat_dfs = zip(*idxstat_results) + print(kept_contigs) + logging.info(f'number of kept_contigs: {len(kept_contigs)}') logging.info(f'number of idxstat_file: {len(idxstat_dfs)}') # Combine idxstat dataframes - #combined_idxstat_df = combine_idxstat_files(idxstat_dfs) - #logging.info(f'length of list_idxstat_df: {len(combined_idxstat_df)}') + combined_idxstat_df = combine_idxstat_files(idxstat_dfs) + logging.info(f'length of list_idxstat_df: {len(combined_idxstat_df)}') #logging.info(f'head of list_idxstat_df[0]: {combined_idxstat_df.head()}') + combined_kept_contigs = combine_kept_contigs(kept_contigs) + logging.info(f'length of combined_kept_contigs: {len(combined_kept_contigs)}') # Calculates cpm for each contig #sum_reads = idxstat_df['#_mapped_read_segments'].sum() #logging.info(f'Total number of mapped reads {sum_reads}') @@ -175,15 +217,18 @@ def main(): # Contigs with nb reads > cutoff #kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index - #logging.info(f'{len(kept_contigs)}/{len(idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.') + logging.info(f'{len(combined_kept_contigs)}/{len(combined_idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.') # Write new fasta files with kept and unkept contigs - # with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle: + with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle: - # for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False): - # if contig in kept_contigs: - # out_select_handle.write(f'>{contig}\n{seq}\n') - # else: - # out_discard_handle.write(f'>{contig}\n{seq}\n') + for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False): + logging.info(f'contig in last step: {contig}\n{seq}\n') + if contig in kept_contigs: + logging.info(f'contig in kept_contigs: {contig}') + #out_select_handle.write(f'>{contig}\n{seq}\n') + else: + logging.info(f'contig not in kept_contigs: {contig}') + #out_discard_handle.write(f'>{contig}\n{seq}\n') if __name__ == "__main__": main() -- GitLab From 319324d42f1e5d0678cdf182407b7abccd16b615 Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Fri, 17 Jan 2025 10:55:15 +0100 Subject: [PATCH 05/10] delete old code 
delete unused library --- bin/filter_contig_per_cpm.py | 82 ++++-------------------------------- 1 file changed, 8 insertions(+), 74 deletions(-) diff --git a/bin/filter_contig_per_cpm.py b/bin/filter_contig_per_cpm.py index e7e72a4..8963aa3 100755 --- a/bin/filter_contig_per_cpm.py +++ b/bin/filter_contig_per_cpm.py @@ -25,7 +25,6 @@ __status__ = 'dev' from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter import pandas as pd -import numpy as np import logging import pyfastx @@ -59,56 +58,13 @@ def parse_arguments(): args = parser.parse_args() return args -def combine_idxstat_files_old(idxstat_dfs): - """ - Combine multiple idxstat df that have the same contigs. - - Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences. - """ - #columns_names = ['reference_sequence_name', - # 'sequence_length', - # '#_mapped_read_segments', - # '#_unmapped_read-segments'] - - combined_df = idxstat_dfs[0].copy() - - for idxstat_df in idxstat_dfs[1:]: - combined_df['#_mapped_read_segments'] += idxstat_df['#_mapped_read_segments'] - combined_df['cpm_count'] += idxstat_df['cpm_count'] - - return combined_df - - #idxstat_df = pd.read_csv(idxstat_files[0], - # sep ='\t', - # names = columns_names, - # usecols = ['reference_sequence_name', - # 'sequence_length', - # '#_mapped_read_segments',], - # comment="*").set_index('reference_sequence_name') - - #for idxstat_file in idxstat_files[1:]: - # other_idxstat_df = pd.read_csv(idxstat_file, - # sep ='\t', - # names = columns_names, - # usecols = ['reference_sequence_name', - # '#_mapped_read_segments',], - # comment="*").set_index('reference_sequence_name') - - # idxstat_df['#_mapped_read_segments'] += other_idxstat_df['#_mapped_read_segments'] - - #return idxstat_df - def combine_idxstat_files(idxstat_dfs): """ Combine multiple idxstat df that have the same contigs. Sum the #_mapped_read_segments column over multiple idxstat files that have the same reference sequences. 
""" - #columns_names = ['reference_sequence_name', - # 'sequence_length', - # '#_mapped_read_segments', - # '#_unmapped_read-segments'] - + common_contigs = set(idxstat_dfs[0].index) for df in idxstat_dfs[1:]: common_contigs = common_contigs.intersection(set(df.index)) @@ -159,8 +115,7 @@ def filter_cpm(idxstat_file, cpm_cutoff): sum_reads = idxstat_df['#_mapped_read_segments'].sum() idxstat_df['cpm_count'] = 1e6 * (idxstat_df['#_mapped_read_segments'] / sum_reads) - # Apply CPM cutoff - #filtered_idxstat_df = idxstat_df[idxstat_df['cpm_count'] >= cpm_cutoff] + # Apply CPM cutoff kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index logging.info(f'length of idxstat_file: {len(idxstat_df)}') @@ -180,55 +135,34 @@ def main(): logging.basicConfig(format="%(levelname)s: %(message)s") cpm_cutoff = float(args.cutoff_cpm) - - # Read input tables - # idxstat_df = combine_idxstat_files(args.samtools_idxstats) #Â logging.info(f'idxstat file: {idxstat_file}') idxstat_results = [filter_cpm(idxstat_file, cpm_cutoff) for idxstat_file in args.samtools_idxstats] # Separate idxstat and kept_contigs kept_contigs, idxstat_dfs = zip(*idxstat_results) - - print(kept_contigs) - + logging.info(f'number of kept_contigs: {len(kept_contigs)}') logging.info(f'number of idxstat_file: {len(idxstat_dfs)}') # Combine idxstat dataframes combined_idxstat_df = combine_idxstat_files(idxstat_dfs) logging.info(f'length of list_idxstat_df: {len(combined_idxstat_df)}') - #logging.info(f'head of list_idxstat_df[0]: {combined_idxstat_df.head()}') combined_kept_contigs = combine_kept_contigs(kept_contigs) + combined_kept_contigs = set(combined_kept_contigs.tolist()) logging.info(f'length of combined_kept_contigs: {len(combined_kept_contigs)}') - # Calculates cpm for each contig - #sum_reads = idxstat_df['#_mapped_read_segments'].sum() - #logging.info(f'Total number of mapped reads {sum_reads}') - - #logging.info(f'With a cpm cutoff of {args.cutoff_cpm}, contigs with less than {(sum_reads*cpm_cutoff)/1e6} reads are removed.') - #idxstat_df['cpm_count'] = 1e6 * idxstat_df['#_mapped_read_segments']/sum_reads - - #list_idxstat_df.append(idxstat_df) - #logging.info(f'head of list_idxstat_df[1]: {list_idxstat_df[1].head()}') - - - # Contigs with nb reads > cutoff - #kept_contigs = idxstat_df.loc[idxstat_df["cpm_count"] >= cpm_cutoff].index - logging.info(f'{len(combined_kept_contigs)}/{len(combined_idxstat_df)} contigs are kept with a cpm cutoff of {cpm_cutoff}.') + # Write new fasta files with kept and unkept contigs with open(args.select, "w") as out_select_handle, open(args.discard, "w") as out_discard_handle: for contig, seq in pyfastx.Fasta(args.fasta_file, build_index=False): - logging.info(f'contig in last step: {contig}\n{seq}\n') - if contig in kept_contigs: - logging.info(f'contig in kept_contigs: {contig}') - #out_select_handle.write(f'>{contig}\n{seq}\n') + if contig in combined_kept_contigs: + out_select_handle.write(f'>{contig}\n{seq}\n') else: - logging.info(f'contig not in kept_contigs: {contig}') - #out_discard_handle.write(f'>{contig}\n{seq}\n') + out_discard_handle.write(f'>{contig}\n{seq}\n') if __name__ == "__main__": main() -- GitLab From a1f47bd4c78313f589e11c4565db74c87f56122b Mon Sep 17 00:00:00 2001 From: Philippe Ruiz <philippe.ruiz@inrae.fr> Date: Fri, 17 Jan 2025 15:36:45 +0100 Subject: [PATCH 06/10] use union in place of intersection to merge table --- bin/filter_contig_per_cpm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/filter_contig_per_cpm.py 
b/bin/filter_contig_per_cpm.py index 8963aa3..7ec0b9d 100755 --- a/bin/filter_contig_per_cpm.py +++ b/bin/filter_contig_per_cpm.py @@ -67,7 +67,7 @@ def combine_idxstat_files(idxstat_dfs): common_contigs = set(idxstat_dfs[0].index) for df in idxstat_dfs[1:]: - common_contigs = common_contigs.intersection(set(df.index)) + common_contigs = common_contigs.union(set(df.index)) # Convert set to list for pandas indexing common_contigs_list = sorted(list(common_contigs)) @@ -86,10 +86,10 @@ def combine_idxstat_files(idxstat_dfs): def combine_kept_contigs(kept_contigs_list): """Combine multiple lists of kept contigs.""" - # Get contigs that appear in all lists (intersection) + # Get contigs that appear in all lists (union) common_contigs = set(kept_contigs_list[0]) for contigs in kept_contigs_list[1:]: - common_contigs = common_contigs.intersection(set(contigs)) + common_contigs = common_contigs.union(set(contigs)) return pd.Index(list(common_contigs)) def filter_cpm(idxstat_file, cpm_cutoff): -- GitLab From ab9d99b20f59bda4a2b2cb5d716003262ba788e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Noirot=20C=C3=A9line?= <celine.noirot@inra.fr> Date: Mon, 10 Feb 2025 15:01:02 +0100 Subject: [PATCH 07/10] Upgrade multiQC --- conf/base.config | 1 + conf/test.config | 1 + modules/multiqc.nf | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 65e0d69..919746c 100644 --- a/conf/base.config +++ b/conf/base.config @@ -28,6 +28,7 @@ process { memory = { 8.GB * task.attempt } } withName: MULTIQC { + container = 'quay.io/biocontainers/multiqc:1.27.1--pyhdfd78af_0' memory = { 8.GB * task.attempt } } withName: HOST_FILTER { diff --git a/conf/test.config b/conf/test.config index 9bac780..cd3005c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -18,6 +18,7 @@ process { memory = { 2.GB * task.attempt } } withName: MULTIQC { + container = 'quay.io/biocontainers/multiqc:1.27.1--pyhdfd78af_0' executor = "local" memory = { 2.GB * task.attempt } } diff --git a/modules/multiqc.nf b/modules/multiqc.nf index d4581ad..68b8816 100644 --- a/modules/multiqc.nf +++ b/modules/multiqc.nf @@ -26,7 +26,7 @@ process MULTIQC { script: """ - multiqc . --config ${multiqc_config} -m custom_content -m fastqc -m cutadapt -m sickle -m kaiju -m quast -m prokka -m featureCounts -m samtools + multiqc . 
--config ${multiqc_config} -m custom_content -m fastqc -m cutadapt -m sickle -m kaiju -m quast -m prokka -m featurecounts -m samtools multiqc --version > v_multiqc.txt """ } -- GitLab From fd725c16488d537e07b6bbcb705b34e4219eae13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Noirot=20C=C3=A9line?= <celine.noirot@inra.fr> Date: Mon, 10 Feb 2025 15:08:41 +0100 Subject: [PATCH 08/10] remove multiQC version from process --- modules/get_software_versions.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/get_software_versions.nf b/modules/get_software_versions.nf index a60f217..97cbef9 100644 --- a/modules/get_software_versions.nf +++ b/modules/get_software_versions.nf @@ -17,7 +17,6 @@ process GET_SOFTWARE_VERSIONS { echo $workflow.manifest.version > v_pipeline.txt echo $workflow.nextflow.version > v_nextflow.txt python --version &> v_python.txt - multiqc --version &> v_multiqc.txt scrape_software_versions.py > software_versions_mqc.yaml """ } \ No newline at end of file -- GitLab From f8e8403aac248adc41ec8fc0f712745e9617ba68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Noirot=20C=C3=A9line?= <celine.noirot@inra.fr> Date: Mon, 10 Feb 2025 15:36:54 +0100 Subject: [PATCH 09/10] remove multiQC from yml --- env/metagWGS.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/env/metagWGS.yml b/env/metagWGS.yml index 463510d..dbde1c5 100644 --- a/env/metagWGS.yml +++ b/env/metagWGS.yml @@ -19,7 +19,6 @@ dependencies: - krona=2.8.1 - megahit=1.2.9 - minimap2=2.24 - - multiqc=1.14 - pandas=1.5.2 - plotly - prodigal=2.6.3 -- GitLab From 53045c1546a93ba9a21ee639f6705112a044e97c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Noirot=20C=C3=A9line?= <celine.noirot@inra.fr> Date: Mon, 10 Feb 2025 16:46:09 +0100 Subject: [PATCH 10/10] Upgrade cutadapt to 5.0 --- conf/base.config | 1 + conf/test.config | 1 - env/metagWGS.yml | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/conf/base.config b/conf/base.config index 919746c..061d014 100644 --- a/conf/base.config +++ b/conf/base.config @@ -17,6 +17,7 @@ process { maxErrors = '-1' withName: CUTADAPT { + container = 'quay.io/biocontainers/cutadapt:5.0--py310h1fe012e_0' cpus = 8 memory = { 8.GB * task.attempt } } diff --git a/conf/test.config b/conf/test.config index cd3005c..9bac780 100644 --- a/conf/test.config +++ b/conf/test.config @@ -18,7 +18,6 @@ process { memory = { 2.GB * task.attempt } } withName: MULTIQC { - container = 'quay.io/biocontainers/multiqc:1.27.1--pyhdfd78af_0' executor = "local" memory = { 2.GB * task.attempt } } diff --git a/env/metagWGS.yml b/env/metagWGS.yml index dbde1c5..cb02fd8 100644 --- a/env/metagWGS.yml +++ b/env/metagWGS.yml @@ -8,7 +8,6 @@ dependencies: - bcbio-gff=0.6.9 - bwa-mem2=2.2.1 - cd-hit=4.8.1 - - cutadapt=4.2 - diamond=2.0.15 - eggnog-mapper=2.1.9 - fastqc=0.11.9 -- GitLab
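
For reference, the net effect of patches 03–06 on `bin/filter_contig_per_cpm.py` — CPM computed per sample from each samtools idxstats table, then a union merge of the contigs that pass the cutoff in at least one sample — can be sketched roughly as below. This is an illustrative, simplified rewrite rather than code from the patches: the input file names are hypothetical, the FASTA select/discard writing step is omitted, and missing contigs are handled with `reindex` instead of the patches' `.loc` lookup.

```python
# Minimal sketch (assumed inputs, not part of the patch set) of per-sample CPM
# filtering with a union merge, mirroring bin/filter_contig_per_cpm.py after
# patches 03-06.
import pandas as pd

# Column layout of `samtools idxstats` output, as used by the real script.
COLS = ["reference_sequence_name", "sequence_length",
        "#_mapped_read_segments", "#_unmapped_read-segments"]

def cpm_per_sample(idxstat_file: str, cpm_cutoff: float):
    """Compute CPM per contig for one sample; return (kept contig set, table)."""
    df = pd.read_csv(idxstat_file, sep="\t", names=COLS, usecols=COLS[:3],
                     comment="*").set_index("reference_sequence_name")
    total_mapped = df["#_mapped_read_segments"].sum()
    df["cpm_count"] = 1e6 * df["#_mapped_read_segments"] / total_mapped
    kept = set(df[df["cpm_count"] >= cpm_cutoff].index)
    return kept, df

def union_filter(idxstat_files, cpm_cutoff=1.0):
    """Keep a contig if it passes the CPM cutoff in at least one sample (union)."""
    kept_sets, tables = zip(*(cpm_per_sample(f, cpm_cutoff) for f in idxstat_files))
    kept_contigs = set().union(*kept_sets)
    # Combined table: union of contigs, summing mapped reads and per-sample CPM.
    all_contigs = sorted(set().union(*(t.index for t in tables)))
    combined = pd.concat(
        [t.reindex(all_contigs)[["#_mapped_read_segments", "cpm_count"]].fillna(0)
         for t in tables]
    ).groupby(level=0).sum()
    return kept_contigs, combined

if __name__ == "__main__":
    # Hypothetical inputs; the pipeline passes one idxstats file per sample.
    kept, combined = union_filter(["sample1.idxstats", "sample2.idxstats"])
    print(f"{len(kept)}/{len(combined)} contigs kept")
```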