├── setup.cfg
├── tests
│   ├── test_bash
│   │   ├── test.sh
│   │   └── Snakefile
│   ├── test_executor
│   │   ├── test.sh
│   │   ├── test.py
│   │   └── Snakefile
│   └── test_spades
│       ├── test.sh
│       ├── metapi.yaml
│       └── Snakefile
├── docs
│   ├── mag_workflow.png
│   └── metapi.dio
├── metapi
│   ├── profiles
│   │   ├── lsf
│   │   │   ├── lsf_jobscript.sh
│   │   │   ├── config.yaml
│   │   │   ├── CookieCutter.py
│   │   │   ├── lsf_config.py
│   │   │   ├── OSLayer.py
│   │   │   └── memory_units.py
│   │   ├── slurm
│   │   │   ├── slurm-jobscript.sh
│   │   │   ├── settings.json
│   │   │   ├── config.yaml
│   │   │   ├── CookieCutter.py
│   │   │   ├── slurm-status.py
│   │   │   └── slurm-submit.py
│   │   ├── pbs-torque
│   │   │   ├── pbs-jobscript.sh
│   │   │   ├── config.yaml
│   │   │   └── pbs-status.py
│   │   ├── sge
│   │   │   ├── sge-jobscript.sh
│   │   │   ├── config.yaml
│   │   │   └── sge-status.py
│   │   └── generic
│   │       ├── config.yaml
│   │       ├── lsf_status.py
│   │       ├── pbs_status.py
│   │       ├── key_mapping.yaml
│   │       ├── cluster_config.yaml
│   │       ├── slurm_status.py
│   │       └── scheduler.py
│   ├── __about__.py
│   ├── envs
│   │   ├── drep.yaml
│   │   ├── kmcp.yaml
│   │   ├── blast.yaml
│   │   ├── cdhit.yaml
│   │   ├── checkv.yaml
│   │   ├── fastqc.yaml
│   │   ├── multiqc.yaml
│   │   ├── kneaddata.yaml
│   │   ├── plass.yaml
│   │   ├── quast.yaml
│   │   ├── virsorter2.yaml
│   │   ├── metabat2.yaml
│   │   ├── taxonkit.yaml
│   │   ├── gtdbtk.yaml
│   │   ├── idba.yaml
│   │   ├── simulate.yaml
│   │   ├── spades.yaml
│   │   ├── dastools.yaml
│   │   ├── galah.yaml
│   │   ├── predict.yaml
│   │   ├── maxbin2.yaml
│   │   ├── megahit.yaml
│   │   ├── report.yaml
│   │   ├── checkm.yaml
│   │   ├── trimming.yaml
│   │   ├── krakenuniq.yaml
│   │   ├── raw.yaml
│   │   ├── kraken2.yaml
│   │   ├── phamb.yaml
│   │   ├── align.yaml
│   │   ├── concoct.yaml
│   │   ├── deepvirfinder.yaml
│   │   ├── vamb.yaml
│   │   └── semibin.yaml
│   ├── wrappers
│   │   ├── concoct_postprocess.py
│   │   ├── maxbin2_postprocess.py
│   │   ├── dastools_postprocess.py
│   │   ├── hmmsearch_wrapper.py
│   │   ├── vamb
│   │   │   ├── write_abundances.py
│   │   │   ├── abundances_mask.py
│   │   │   ├── create_abundances.py
│   │   │   └── concatenate.py
│   │   ├── misc.py
│   │   ├── prokka_wrapper.py
│   │   ├── simulate_reads.py
│   │   ├── prodigal_wrapper.py
│   │   └── gtdbtk_postprocess.py
│   ├── snakefiles
│   │   ├── simulate_wf.smk
│   │   └── gene_wf.smk
│   ├── visualization
│   │   └── dada2_stats_barplot.R
│   ├── tooler.py
│   ├── rules
│   │   ├── qcreport.smk
│   │   ├── simulate.smk
│   │   ├── upload.smk
│   │   └── binning_report.smk
│   ├── predictor.py
│   ├── __init__.py
│   ├── simulator.py
│   └── checkmer.py
├── requirements.txt
├── run_metapi.py
├── environment.yml
├── scripts
│   ├── perl_test.pl
│   ├── cout_seq_by_line.py
│   ├── samples_validator.py
│   ├── job.py
│   ├── find_ATG.pl
│   ├── contigs_filter_by_len.py
│   ├── parse_mgs_profile.py
│   ├── merge_sig_csv.py
│   ├── get_bins_id.py
│   ├── print_reads_length.py
│   ├── batch_prokka.py
│   ├── checkm_link.py
│   ├── merge_fasta_by_len.py
│   ├── filter_pe_fastq_by_len.py
│   ├── fasta_length_tab.py
│   ├── kraken2_reads_merger.py
│   ├── animf_cluster.py
│   ├── find_path.py
│   ├── taxonomy_info_covert.py
│   ├── get_prodigal_gbk_result.py
│   ├── asm_status_wrapper.py
│   ├── cut_up_fasta_concoct.py
│   ├── aggregate_genomecov.py
│   ├── contigs_from_sample.py
│   ├── fastq_contig_size.py
│   ├── split_fx.py
│   ├── megahit_hadoop.sh
│   ├── get_bin_id_by_ccsh.py
│   ├── insert_size_ploter.py
│   ├── megahit_sge.py
│   ├── split_mummer.py
│   ├── assembly_info.r
│   ├── metapi_config_update.py
│   ├── merge_checkm_out.py
│   ├── clstr_szie_tab.py
│   ├── rename_fasta_id.py
│   ├── asub.py
│   ├── kraken2_demultiplex_summary.py
│   ├── qc_report.py
│   ├── clean_statout_to_matrix.py
│   ├── extract_bins_from_mgs_profile.py
│   ├── filter_pe_fastq_by_size.py
│   ├── post_assembly_binning.py
│   ├── estimate_T2T_data_size.py
│   ├── mapping_statistics.py
│   └── t2d_abundance_merger.py
├── MANIFEST.in
├── .gitignore
├── .circleci
│   └── config.yml
└── setup.py
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
--------------------------------------------------------------------------------
/tests/test_bash/test.sh:
--------------------------------------------------------------------------------
1 | snakemake --snakefile Snakefile -c 1 --until all
2 |
--------------------------------------------------------------------------------
/tests/test_executor/test.sh:
--------------------------------------------------------------------------------
1 |
2 | snakemake --snakefile Snakefile -c 1 --until all
--------------------------------------------------------------------------------
/docs/mag_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ohmeta/metapi/HEAD/docs/mag_workflow.png
--------------------------------------------------------------------------------
/metapi/profiles/lsf/lsf_jobscript.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # properties = {properties}
3 | {exec_job}
--------------------------------------------------------------------------------
/tests/test_spades/test.sh:
--------------------------------------------------------------------------------
1 | snakemake --snakefile Snakefile -c 1 --until all --use-conda
2 |
--------------------------------------------------------------------------------
/metapi/profiles/slurm/slurm-jobscript.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # properties = {properties}
3 | {exec_job}
4 |
--------------------------------------------------------------------------------
/metapi/profiles/pbs-torque/pbs-jobscript.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # properties = {properties}
3 | {exec_job}
4 |
--------------------------------------------------------------------------------
/metapi/__about__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | __version__ = '3.0.0'
4 | __author__ = "Jie Zhu, Fangming Yang, Ye Peng"
5 |
--------------------------------------------------------------------------------
/metapi/envs/drep.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - drep=3.5.0
7 | - pigz
8 | - jq
--------------------------------------------------------------------------------
/metapi/envs/kmcp.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - kmcp=0.9.4
7 | - pigz
8 | - jq
--------------------------------------------------------------------------------
/tests/test_spades/metapi.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - defaults
5 | dependencies:
6 | - metapi=2.3.0
7 |
--------------------------------------------------------------------------------
/metapi/envs/blast.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - blast=2.15.0
7 | - pigz
8 | - jq
--------------------------------------------------------------------------------
/metapi/envs/cdhit.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - cd-hit=4.8.1
7 | - pigz
8 | - jq
--------------------------------------------------------------------------------
/metapi/envs/checkv.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - checkv=1.0.1
7 | - pigz
8 | - jq
--------------------------------------------------------------------------------
/metapi/envs/fastqc.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - fastqc=0.12.1
7 | - pigz
8 | - jq
--------------------------------------------------------------------------------
/metapi/envs/multiqc.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - multiqc=1.21
7 | - pigz
8 | - jq
--------------------------------------------------------------------------------
/metapi/envs/kneaddata.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - kneaddata=0.12.0
7 | - pigz
8 | - jq
--------------------------------------------------------------------------------
/metapi/envs/plass.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - plass=4.687d7
7 | - pigz
8 | - jq
9 |
--------------------------------------------------------------------------------
/metapi/envs/quast.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - quast=5.2.0
7 | - pigz
8 | - jq
9 |
--------------------------------------------------------------------------------
/metapi/envs/virsorter2.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - virsorter=2.2.4
7 | - pigz
8 | - jq
--------------------------------------------------------------------------------
/metapi/profiles/sge/sge-jobscript.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # properties = {properties}
3 |
4 | # exit on first error
5 | set -o errexit
6 |
7 | {exec_job}
8 |
--------------------------------------------------------------------------------
/metapi/envs/metabat2.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - metabat2=2.15
7 | - pigz
8 | - jq
9 |
--------------------------------------------------------------------------------
/metapi/envs/taxonkit.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - taxonkit
7 | - csvtk
8 | - pigz
9 | - jq
--------------------------------------------------------------------------------
/metapi/envs/gtdbtk.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - gtdbtk=2.3.2
7 | - pandas
8 | - pigz
9 | - jq
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy
3 | ruamel.yaml
4 | snakemake >=7.0
5 | openpyxl
6 | natsort
7 | biopython >=1.73
8 | seaborn
9 | matplotlib
10 | executor
--------------------------------------------------------------------------------
/tests/test_executor/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
4 | from executor import execute
5 |
6 | output = str(snakemake.output)
7 | execute(f'''touch {output}''')
--------------------------------------------------------------------------------
/metapi/envs/idba.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - idba=1.1.3
7 | - seqtk
8 | - pigz
9 | - jq
10 |
--------------------------------------------------------------------------------
/metapi/envs/simulate.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - insilicoseq=2.0.1
7 | - pandas
8 | - biopython
9 | - pigz
--------------------------------------------------------------------------------
/metapi/envs/spades.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - spades=3.15.5
7 | - pigz
8 | - fd-find
9 | - jq
10 |
--------------------------------------------------------------------------------
/metapi/envs/dastools.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - python
7 | - das_tool=1.1.7
8 | - pigz
9 | - jq
10 |
--------------------------------------------------------------------------------
/metapi/profiles/slurm/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "SBATCH_DEFAULTS": "",
3 | "CLUSTER_NAME": "",
4 | "CLUSTER_CONFIG": "cluster.yaml",
5 | "ADVANCED_ARGUMENT_CONVERSION": "no"
6 | }
--------------------------------------------------------------------------------
/metapi/envs/galah.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - galah=0.4.0
7 | - dashing
8 | - fastani
9 | - pigz
10 | - jq
--------------------------------------------------------------------------------
/metapi/envs/predict.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - prodigal=2.6.3
7 | - prokka=1.14.6
8 | - pigz
9 | - jq
10 |
--------------------------------------------------------------------------------
/run_metapi.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import sys
5 |
6 | sys.path.insert(0, os.path.dirname(__file__))
7 |
8 | from metapi.corer import main
9 | main()
10 |
--------------------------------------------------------------------------------
/metapi/envs/maxbin2.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - maxbin2=2.2.7
7 | - fraggenescan=1.31
8 | - pigz
9 | - jq
10 |
--------------------------------------------------------------------------------
/metapi/envs/megahit.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - megahit=1.2.9
7 | - gfa1
8 | - pigz
9 | - fd-find
10 | - jq
11 |
--------------------------------------------------------------------------------
/metapi/envs/report.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - seqtk=1.4
7 | - seqkit=2.8.0
8 | - bioawk=1.0
9 | - pigz
10 | - jq
--------------------------------------------------------------------------------
/metapi/envs/checkm.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - checkm-genome=1.2.2
7 | - prodigal=2.6.3
8 | - pandas=1.5.1
9 | - pigz
10 | - jq
--------------------------------------------------------------------------------
/metapi/envs/trimming.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - sickle-trim=1.33
7 | - fastp=0.23.4
8 | - trimmomatic=0.39
9 | - pigz
10 | - jq
--------------------------------------------------------------------------------
/metapi/envs/krakenuniq.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - krakenuniq=1.0.4
7 | - bracken=2.9
8 | - jellyfish=1.0.3
9 | - pigz
10 | - jq
11 |
--------------------------------------------------------------------------------
/metapi/envs/raw.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - python
7 | - coreutils
8 | - seqkit
9 | - pigz
10 | - jq
11 | - executor
12 | - sra-tools=3.0.3
--------------------------------------------------------------------------------
/metapi/envs/kraken2.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - kraken2=2.1.3
7 | - krakentools=1.2
8 | - bracken=2.9
9 | - krona=2.8.1
10 | - pigz
11 | - jq
12 |
--------------------------------------------------------------------------------
/metapi/profiles/pbs-torque/config.yaml:
--------------------------------------------------------------------------------
1 | cluster: "pbs-submit.py --depend \"{dependencies}\""
2 | cluster-status: "pbs-status.py"
3 | jobscript: "pbs-jobscript.sh"
4 | jobs: 5000
5 | immediate-submit: False
6 | verbose: true
7 | notemp: true
8 |
--------------------------------------------------------------------------------
/metapi/profiles/sge/config.yaml:
--------------------------------------------------------------------------------
1 | restart-times: 3
2 | jobscript: sge-jobscript.sh
3 | cluster: "sge-submit.py"
4 | cluster-status: "sge-status.py"
5 | max-jobs-per-second: 1
6 | max-status-checks-per-second: 1
7 | latency-wait: 60
8 | local-cores: 1
9 |
--------------------------------------------------------------------------------
/metapi/envs/phamb.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - phamb=1.0.1
7 | - hmmer=3.4
8 | - pyhmmer=0.10.9
9 | - joblib
10 | - pandas
11 | - numpy
12 | - biopython
13 | - pigz
14 | - jq
--------------------------------------------------------------------------------
/metapi/profiles/slurm/config.yaml:
--------------------------------------------------------------------------------
1 | restart-times: 3
2 | jobscript: "slurm-jobscript.sh"
3 | cluster-config: "cluster.yaml"
4 | cluster: "slurm-submit.py"
5 | cluster-status: "slurm-status.py"
6 | max-jobs-per-second: 10
7 | max-status-checks-per-second: 10
8 | latency-wait: 80
--------------------------------------------------------------------------------
/metapi/envs/align.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - minimap2=2.27
7 | - samtools=1.19.2
8 | - bwa=0.7.17
9 | - bwa-mem2=2.2.1
10 | - bowtie2=2.5.3
11 | - seqtk
12 | - seqkit
13 | - pigz
14 | - jq
--------------------------------------------------------------------------------
/metapi/envs/concoct.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - concoct=1.1.0
7 | - libopenblas=*=openmp*
8 | - mkl
9 | - python>=3
10 | - samtools>=1.9
11 | - scikit-learn=1.1.*
12 | - pigz
13 | - jq
14 | variables:
15 | USE_OPENMP: 1
--------------------------------------------------------------------------------
/metapi/envs/deepvirfinder.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - nodefaults
5 | dependencies:
6 | - python=3.6
7 | - numpy
8 | - theano=1.0.3
9 | - keras=2.2.4
10 | - scikit-learn
11 | - Biopython
12 | - h5py=2.10.0
13 | - mkl-service=2.3.0
14 | - pigz
15 | - jq
--------------------------------------------------------------------------------
/metapi/envs/vamb.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - pytorch
3 | - conda-forge
4 | - bioconda
5 | - nodefaults
6 | dependencies:
7 | - pigz
8 | - pytorch=*=*cuda11.3*
9 | - pysam
10 | - numpy
11 | - pandas
12 | - jq
13 | - pip
14 | - pip:
15 | - git+https://github.com/RasmussenLab/vamb@v4.1.3
16 |
--------------------------------------------------------------------------------
/metapi/profiles/lsf/config.yaml:
--------------------------------------------------------------------------------
1 | latency-wait: "5"
2 | jobscript: "lsf_jobscript.sh"
3 | use-conda: "False"
4 | use-singularity: "False"
5 | printshellcmds: "False"
6 | restart-times: "0"
7 | jobs: "500"
8 | cluster: "lsf_submit.py"
9 | cluster-status: "lsf_status.py"
10 | max-jobs-per-second: "10"
11 | max-status-checks-per-second: "10"
--------------------------------------------------------------------------------
/metapi/envs/semibin.yaml:
--------------------------------------------------------------------------------
1 | channels:
2 | - pytorch
3 | - conda-forge
4 | - bioconda
5 | - nodefaults
6 | dependencies:
7 | - pytorch=*=*cuda11.3*
8 | - mkl=2023.2.0
9 | - pigz
10 | - jq
11 | - pandas
12 | - numexpr=2.9.0
13 | - mmseqs2
14 | - hmmer
15 | - prodigal
16 | - bedtools
17 | - samtools
18 | - fraggenescan
19 | - semibin=2.1.0
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: metapi
2 | channels:
3 | - conda-forge
4 | - bioconda
5 | - nodefaults
6 | dependencies:
7 | - setuptools
8 | - pandas
9 | - numpy
10 | - ruamel.yaml
11 | - snakemake >=7.0
12 | - openpyxl
13 | - natsort
14 | - biopython >=1.73
15 | - seaborn
16 | - matplotlib
17 | - seqtk
18 | - seqkit
19 | - pigz
20 | - fd-find
21 | - executor
--------------------------------------------------------------------------------
/tests/test_executor/Snakefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env snakemake
2 |
3 |
4 | rule touch_1:
5 | output:
6 | "done1"
7 | script:
8 | "test.py"
9 |
10 |
11 | rule touch_2:
12 | output:
13 | "done2"
14 | run:
15 | from executor import execute
16 | execute(f'''touch {output}''')
17 |
18 |
19 | rule all:
20 | input:
21 | "done1",
22 | "done2"
--------------------------------------------------------------------------------
/scripts/perl_test.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
2 | use strict;
3 | use warnings;
4 | my $a = "hello/world";
5 | my $b = join('', $a, "/summary");
6 | print "$a\n";
7 | print "$b\n";
8 | print "$ARGV[0]\n";
9 | print "$ARGV[1]\n";
10 | print "$ARGV[2]\n";
11 | print "$ARGV[3]\n";
12 |
13 | # eg : perl perl_test.pl a b c d
14 | # output:
15 | # hello/world
16 | # hello/world/summary
17 | # a
18 | # b
19 | # c
20 | # d
--------------------------------------------------------------------------------
/metapi/profiles/generic/config.yaml:
--------------------------------------------------------------------------------
1 | restart-times: 0
2 | cluster-config: "cluster_config.yaml" #abs path
3 | cluster: "scheduler.py" #
4 | cluster-status: "slurm_status.py" #
5 | max-jobs-per-second: 10
6 | max-status-checks-per-second: 10
7 | cores: 99 # how many jobs you want to submit to your cluster queue
8 | local-cores: 1
9 | rerun-incomplete: true # recomended for cluster submissions
10 | keep-going: false
11 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.md *.txt
2 | include LICENSE
3 |
4 | recursive-include metapi/ *
5 | recursive-include metapi/wrappers *
6 | recursive-include metapi/rules *
7 | recursive-include metapi/snakefiles *
8 | recursive-include metapi/envs *
9 | recursive-include metapi/config *
10 |
11 | global-exclude metapi/ *.pyc
12 | global-exclude metapi/__pycache__ *.pyc
13 | global-exclude metapi/wrappers/ *.pyc
14 | global-exclude metapi/wrappers/__pycache__ *.pyc
15 |
--------------------------------------------------------------------------------
/scripts/cout_seq_by_line.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
2 |
3 | import sys
4 |
5 | with open(sys.argv[1], 'r') as handle:
6 | num = 0
7 | for line in handle:
8 | num += 1
9 | if num == int(sys.argv[2]):
10 | print(str(num) + ":\t" + line)
11 | break
12 |
13 | # awk 'NR==YOUR_LINE{print}' file_name
14 | # sed -n -e YOUR_LINEp file_name
15 | # perl -wnl -e '$. == YOUR_LINE and print and exit;'
16 | # less -SN +TOUR_LINEg file_name (nice!)
--------------------------------------------------------------------------------
/scripts/samples_validator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import pandas as pd
3 | import sys
4 | import os
5 |
6 |
7 | def main():
8 | samples = pd.read_csv(sys.argv[1], sep='\t').set_index("id", drop=False)
9 | for i in samples.index:
10 | fq1, fq2 = samples.loc[i, ["fq1", "fq2"]]
11 | if (not os.path.exists(fq1)) or (not os.path.exists(fq2)):
12 | print("error: %s\t%s\t%s" % (i, fq1, fq2))
13 |
14 |
15 | if __name__ == '__main__':
16 | main()
17 |
--------------------------------------------------------------------------------
/scripts/job.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | import sys
4 |
5 | from snakemake.utils import read_job_properties
6 |
7 | jobscript = sys.argv[1]
8 | job_properties = read_job_properties(jobscript)
9 |
10 | # do something useful with the threads
11 | threads = job_properties["threads"]
12 |
13 | # access property defined in the cluster configuration file (snakemake >=3.6.0)
14 | job_properties["cluster"]["time"]
15 |
16 | os.system("qsub -t {threads} {script}".format(threads=threads, script=jobscript))
17 |
--------------------------------------------------------------------------------
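A minimal sketch of the contract behind scripts/job.py: Snakemake renders the "# properties = {properties}" placeholder in the jobscripts above into a JSON blob, and read_job_properties reads it back from the submitted script. The jobscript text and its property values below are hypothetical, purely to show what the submit wrapper gets to work with.

import json

jobscript_text = """#!/bin/sh
# properties = {"rule": "assembly_megahit", "threads": 8, "resources": {"mem_mb": 16000}, "cluster": {"queue": "normal", "time": "24:00:00"}}
sleep 1
"""

# find the "# properties = ..." line and parse the JSON payload,
# which is roughly what snakemake.utils.read_job_properties does
properties_line = next(
    line for line in jobscript_text.splitlines() if line.startswith("# properties =")
)
job_properties = json.loads(properties_line.split("=", 1)[1])

threads = job_properties["threads"]          # 8
queue = job_properties["cluster"]["queue"]   # "normal"
print(threads, queue)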
/metapi/wrappers/concoct_postprocess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import sys
5 | import subprocess
6 |
7 |
8 | with os.scandir(sys.argv[1]) as itr:
9 | i = 0
10 | for entry in itr:
11 | bin_id, suffix = os.path.splitext(entry.name)
12 | if suffix == ".fa":
13 | i += 1
14 | subprocess.run('''mv %s %s''' \
15 | % (os.path.join(sys.argv[1], entry.name),
16 | os.path.join(sys.argv[2] + "." + str(i) + ".fa")), shell=True)
--------------------------------------------------------------------------------
/scripts/find_ATG.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use strict;
3 | use warnings;
4 | # QQ group: perlchina
5 | # question: find longest ATG+ sequences
6 |
7 | my $seq = "ATGATGASFSAGATGATGATGSFAATGATGATGATGDSFS";
8 |
9 | my @atg = $seq =~ /((ATG)+)/g;
10 | my @atg_len = map { length($_) } @atg;
11 | print "@atg\n";
12 | print "@atg_len\n\n";
13 |
14 | print((sort {$b cmp $a} ($seq =~ /(?:ATG)+/g))[0]);
15 | print "\n\n";
16 |
17 | my @atg_2 = $seq =~ /(?:ATG)+/g;
18 | my @atg_len_2 = map { length($_) } @atg_2;
19 | print "@atg_2\n";
20 | print "@atg_len_2\n";
--------------------------------------------------------------------------------
/metapi/profiles/generic/lsf_status.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 |
4 | import os
5 | import sys
6 | import warnings
7 | import subprocess
8 |
9 |
10 | jobid = sys.argv[1]
11 |
12 | out = subprocess.run(['bjobs', '-noheader', jobid], stdout=subprocess.PIPE).stdout.decode('utf-8')
13 |
14 | state = out.strip().split()[2] if out.strip() else ""
15 |
16 |
17 | map_state={"PEND":'running',
18 | "RUN":'running',
19 | "PROV":"running",
20 | "WAIT":'running',
21 | "DONE":'success',
22 | "":'success'}
23 |
24 | print(map_state.get(state,'failed'))
25 |
--------------------------------------------------------------------------------
/metapi/snakefiles/simulate_wf.smk:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env snakemake
 2 |
 3 | import os
 4 | import sys
5 | from pprint import pprint
6 | import pandas as pd
7 |
8 | from snakemake.utils import min_version
9 | min_version("7.0")
10 | shell.executable("bash")
11 |
12 | import metapi
13 |
14 | METAPI_DIR = metapi.__path__[0]
15 | WRAPPER_DIR = os.path.join(METAPI_DIR, "wrappers")
16 | DATA_DIR = os.path.join(METAPI_DIR, "data")
17 |
18 |
19 | SAMPLES, DATA_TYPE = metapi.parse_samples(config["params"]["samples"])
20 |
21 |
22 | include: "../rules/simulate.smk"
23 |
24 |
25 | rule all:
26 | input:
27 | rules.simulate_all.input
--------------------------------------------------------------------------------
/metapi/wrappers/maxbin2_postprocess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | import subprocess
5 |
6 |
7 | with os.scandir(sys.argv[1]) as itr:
8 | for entry in itr:
9 | bin_id, bin_suffix = os.path.splitext(entry.name)
10 | bin_name, cluster_num = bin_id.rsplit(".", maxsplit=1)
11 | bin_id = bin_name + "." + cluster_num.lstrip("0")
12 | if bin_suffix == ".fasta":
13 | subprocess.run('''mv %s %s''' \
14 | % (os.path.join(sys.argv[1], entry.name),
15 | os.path.join(sys.argv[1], bin_id + ".fa")), shell=True)
16 |
17 |
18 |
--------------------------------------------------------------------------------
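The renaming logic from maxbin2_postprocess.py, applied to hypothetical MaxBin2 output names without touching the filesystem: the zero-padded cluster number is stripped and ".fasta" becomes ".fa".

examples = ["s1.maxbin2.bin.001.fasta", "s1.maxbin2.bin.012.fasta"]  # hypothetical names

for name in examples:
    bin_id, bin_suffix = name.rsplit(".", maxsplit=1)       # split off "fasta"
    bin_name, cluster_num = bin_id.rsplit(".", maxsplit=1)  # split off "001"
    new_name = f"{bin_name}.{cluster_num.lstrip('0')}.fa"
    print(name, "->", new_name)
# s1.maxbin2.bin.001.fasta -> s1.maxbin2.bin.1.fa
# s1.maxbin2.bin.012.fasta -> s1.maxbin2.bin.12.fa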
/metapi/wrappers/dastools_postprocess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import sys
5 | import glob
6 | import subprocess
7 |
8 |
9 | bins_prefix = sys.argv[1] #.replace("dastools.bin", "")
10 |
11 | mags_list = glob.glob(os.path.join(sys.argv[1] + "_DASTool_bins", "*.fa"))
12 |
13 | if len(mags_list) > 0:
14 | for bin_fa in mags_list:
15 | if (os.path.getsize(bin_fa) > 0) and (not "*" in bin_fa):
16 | binner = os.path.basename(bin_fa).split(".")[0]
17 | if (binner != "unbinned") and (binner != "*"):
18 | bin_fa_ = bins_prefix + "." + os.path.basename(bin_fa).replace(binner, f'''{binner}_dastools.bin''')
19 | subprocess.run(f'''mv {bin_fa} {bin_fa_}''', shell=True)
--------------------------------------------------------------------------------
/scripts/contigs_filter_by_len.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from contigs_to_gene import cut_fasta_by_len
3 | import argparse
4 |
5 | def main():
6 | parser = argparse.ArgumentParser(description="cut fasta by len")
7 | parser.add_argument('-fa', type=str, help='scaffolds or contigs file path')
8 | parser.add_argument('-sclen', type=int, help='scaffold or contigs length cutoff, default: 500', default=500)
9 | parser.add_argument('-outdir', type=str, help='output dir store gene prediction results')
10 | parser.add_argument('-prefix', type=str, help='prefix for file name')
11 | args = parser.parse_args()
12 | cut_fasta_by_len(args.fa, args.sclen, args.outdir, args.prefix, ".fa")
13 |
14 | if __name__ == '__main__':
15 | main()
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # vscode
2 | .vscode
3 |
4 | # pycharm
5 | .idea
6 |
7 | # kdevelop
8 | .kdev4
9 | metapi.kdev4
10 |
11 | # snakemake
12 | .snakemake
13 |
14 | metapi/__pycache__/
15 | metapi/*.pyc
16 |
17 | # pipenv
18 | build/
19 | dist/
20 | metapi.egg-info/
21 | release
22 |
23 | conda/*.gz
24 |
25 | notebooks/
26 | notebook
27 |
28 | # test
29 | test/
30 | test/simulation_test/metaconfig.yaml
31 |
32 | # examples
33 | example/basic_test/data/01.trimmed
34 | example/basic_test/data/02.assembly
35 | example/basic_test/data/03.alignment
36 | example/basic_test/data/04.binning
37 | example/basic_test/data/05.checkm
38 | example/basic_test/data/logs
39 | examples/simulation_test
40 | examples
41 |
42 | # docs
43 | docs/hello.py
44 |
45 | # others
46 | index.list
47 | data.tar.gz
--------------------------------------------------------------------------------
/scripts/parse_mgs_profile.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 | import sys
5 | from pprint import pprint
6 |
7 |
8 | def parse(mgs_profile):
9 | count = 0
10 | with open(mgs_profile, 'r') as ih:
11 | for line in ih:
12 | line_list = re.split(r"\s+|,", line)
13 | cag_id = line_list[0]
14 | seq_count = line_list[1]
15 | seq_id_list = line_list[2:]
16 | count += 1
17 | a = 0
18 | if count == 1:
19 | for i in seq_id_list:
20 | print(i)
21 | a += 1
22 | print(a)
23 | print(seq_count)
24 | break
25 |
26 |
27 | def main():
28 | parse(sys.argv[1])
29 |
30 |
31 | if __name__ == '__main__':
32 | main()
33 |
--------------------------------------------------------------------------------
/metapi/profiles/generic/pbs_status.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys
4 | import subprocess
5 | import xml.etree.cElementTree as ET
6 |
7 | jobid = sys.argv[1]
8 |
9 | try:
10 | res = subprocess.run("qstat -f -x {}".format(jobid), check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
11 |
12 | xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot()
13 | job_state = xmldoc.findall('.//job_state')[0].text
14 |
15 | if job_state == "C":
16 | exit_status = xmldoc.findall('.//exit_status')[0].text
17 | if exit_status == '0':
18 | print("success")
19 | else:
20 | print("failed")
21 | else:
22 | print("running")
23 |
24 | except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e:
25 | print("failed")
26 |
--------------------------------------------------------------------------------
/metapi/profiles/pbs-torque/pbs-status.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys
4 | import subprocess
5 | import xml.etree.cElementTree as ET
6 |
7 | jobid = sys.argv[1]
8 |
9 | try:
10 | res = subprocess.run("qstat -f -x {}".format(jobid), check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
11 |
12 | xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot()
13 | job_state = xmldoc.findall('.//job_state')[0].text
14 |
15 | if job_state == "C":
16 | exit_status = xmldoc.findall('.//exit_status')[0].text
17 | if exit_status == '0':
18 | print("success")
19 | else:
20 | print("failed")
21 | else:
22 | print("running")
23 |
24 | except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e:
25 | print("failed")
26 |
--------------------------------------------------------------------------------
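Both pbs status scripts map the qstat job state to the three words Snakemake's --cluster-status hook expects (success/failed/running). The same mapping is shown here against a hypothetical `qstat -f -x` XML snippet, so the logic can be checked without a live PBS/Torque cluster.

import xml.etree.ElementTree as ET

sample_xml = """<Data>
  <Job>
    <Job_Id>12345.pbs</Job_Id>
    <job_state>C</job_state>
    <exit_status>0</exit_status>
  </Job>
</Data>"""

root = ET.fromstring(sample_xml)
job_state = root.findall(".//job_state")[0].text

if job_state == "C":                      # completed: inspect the exit code
    exit_status = root.findall(".//exit_status")[0].text
    print("success" if exit_status == "0" else "failed")
else:                                     # Q, R, H, ... all count as "running"
    print("running")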
/scripts/merge_sig_csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import csv
4 | import argparse
5 | import pandas as pd
6 |
7 | def merge_csv(csvlist, output):
 8 |     df_list = []
 9 |     # collect one dataframe per csv file, then concatenate them
10 |     with open(csvlist, 'r') as csv_l:
11 |         for csv_f in csv_l:
12 |             df = pd.read_csv(csv_f.strip(), index_col=None, header=0)
13 |             df_list.append(df)
14 |     frame = pd.concat(df_list)
15 | frame.to_csv(output)
16 |
17 | def main():
18 | parser = argparse.ArgumentParser(description="merge sourmash sigs to a csv file")
19 | parser.add_argument('-csvlist', type=str, help='a file contain sig file path list')
20 | parser.add_argument('-output', type=str, help='output csv file')
21 | args = parser.parse_args()
22 | merge_csv(args.csvlist, args.output)
23 |
24 | if __name__ == '__main__':
25 | main()
26 |
--------------------------------------------------------------------------------
/metapi/wrappers/hmmsearch_wrapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import pyhmmer
3 | import sys
4 |
 5 | hmm_threads = int(sys.argv[1])
 6 | hmm_evalue = float(sys.argv[2])
 7 | hmm_tbl = sys.argv[3]
 8 | hmm_db = sys.argv[4]
 9 | hmm_seq = sys.argv[5]
10 |
11 | # reference
12 | # https://github.com/althonos/pyhmmer/issues/22
13 |
14 | alphabet = pyhmmer.easel.Alphabet.amino()
15 |
16 | with pyhmmer.easel.SequenceFile(hmm_seq, digital=True, alphabet=alphabet) as seq_file:
17 | sequences = list(seq_file)
18 |
19 | with open(hmm_tbl, "wb") as dst:
20 | with pyhmmer.plan7.HMMFile(hmm_db) as hmm_file:
21 | for i, hits in enumerate(pyhmmer.hmmsearch(hmm_file, sequences, cpus=hmm_threads, E=hmm_evalue)):
22 | hits.write(dst, format="targets", header=i==0)
23 |
24 | # example
25 | # python hmmsearch_wrapper.py 8 0.01 output.tbl virus.hmm test.faa
26 |
--------------------------------------------------------------------------------
/scripts/get_bins_id.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
2 | import glob
3 | import os
4 | import pprint
5 | import sys
6 |
7 | import pandas
8 |
9 |
10 | def parse_mags(mags_dir):
11 | bin_list = []
12 | pattern = mags_dir + "/*/*.fa"
13 | for bin in glob.glob(pattern):
14 | bin_dict = {}
15 | bin_fa = os.path.basename(bin)
16 |         bin_id = os.path.splitext(bin_fa)[0]
17 | id = ".".join(bin_fa.split(".")[:-3])
18 | bin_dict["bin_path"] = bin.strip()
19 | bin_dict["bin_id"] = bin_id
20 | bin_dict["id"] = id
21 | bin_list.append(bin_dict)
22 | pprint.pprint(bin_list)
23 | #bin_df = pandas.DataFrame(bin_list).set_index("bin_id", drop=False)
24 | #pprint.pprint(bin_df)
25 | # a = bin_df.loc["s1.bin.2", ["bin_path"]].dropna()[0]
26 | # print(a)
27 |
28 |
29 | parse_mags(sys.argv[1])
30 |
--------------------------------------------------------------------------------
/metapi/profiles/lsf/CookieCutter.py:
--------------------------------------------------------------------------------
1 | class CookieCutter:
2 | """
3 | Cookie Cutter wrapper
4 | """
5 |
6 | @staticmethod
7 | def get_default_threads() -> int:
8 | return int("1")
9 |
10 | @staticmethod
11 | def get_default_mem_mb() -> int:
12 | return int("1024")
13 |
14 | @staticmethod
15 | def get_log_dir() -> str:
16 | return "logs/cluster"
17 |
18 | @staticmethod
19 | def get_default_queue() -> str:
20 | return ""
21 |
22 | @staticmethod
23 | def get_lsf_unit_for_limits() -> str:
24 | return "KB"
25 |
26 | @staticmethod
27 | def get_unknwn_behaviour() -> str:
28 | return "wait"
29 |
30 | @staticmethod
31 | def get_zombi_behaviour() -> str:
32 | return "ignore"
33 |
34 | @staticmethod
35 | def get_latency_wait() -> float:
36 | return float("5")
37 |
--------------------------------------------------------------------------------
/metapi/profiles/slurm/CookieCutter.py:
--------------------------------------------------------------------------------
1 | #
2 | # Based on lsf CookieCutter.py
3 | #
4 | import os
5 | import json
6 |
7 | d = os.path.dirname(__file__)
8 | with open(os.path.join(d, "settings.json")) as fh:
9 | settings = json.load(fh)
10 |
11 |
12 | class CookieCutter:
13 |
14 | SBATCH_DEFAULTS = settings['SBATCH_DEFAULTS']
15 | CLUSTER_NAME = settings['CLUSTER_NAME']
16 | CLUSTER_CONFIG = settings['CLUSTER_CONFIG']
17 | ADVANCED_ARGUMENT_CONVERSION = settings['ADVANCED_ARGUMENT_CONVERSION']
18 |
19 | @staticmethod
20 | def get_cluster_option() -> str:
21 | cluster = CookieCutter.CLUSTER_NAME
22 | if cluster != "":
23 | return f"--cluster={cluster}"
24 | return ""
25 |
26 | @staticmethod
27 | def get_advanced_argument_conversion() -> bool:
28 | val = {"yes": True, "no": False}[
29 | CookieCutter.ADVANCED_ARGUMENT_CONVERSION
30 | ]
31 | return val
32 |
--------------------------------------------------------------------------------
/scripts/print_reads_length.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
 2 | "get each read's length from a fasta/fastq file"
3 | import argparse
4 | import gzip
5 | from Bio import SeqIO
6 |
7 | def print_len(infile, seqtype):
8 | '''print_len function'''
9 | if infile.endswith(".gz"):
10 | handle = gzip.open(infile, 'rt')
11 | else:
12 | handle = open(infile, 'rt')
13 | for reads in SeqIO.parse(handle, seqtype):
14 | print(reads.id, "\t", len(reads))
15 | handle.close()
16 |
17 |
18 | def main():
19 | '''main function'''
20 |     parser = argparse.ArgumentParser(description='print each read id and length from a fasta/fastq file')
21 | parser.add_argument('--infile', action='store', dest='infile', help='input fasta/fastq file')
22 | parser.add_argument('--seqtype', action='store', dest='seqtype', help='input file seq type, fasta or fastq')
23 | args = parser.parse_args()
24 | print_len(args.infile, args.seqtype)
25 |
26 | if __name__ == "__main__":
27 | main()
28 |
--------------------------------------------------------------------------------
/tests/test_bash/Snakefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env snakemake
2 |
3 | rule download:
4 | output:
5 | r1 = "ecoli_1K.1.fq.gz",
6 | r2 = "ecoli_1K.2.fq.gz"
7 | threads:
8 | 1
9 | shell:
10 | '''
11 | curl -o ecoli_1K.1.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_1.fq.gz
12 | curl -o ecoli_1K.2.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_2.fq.gz
13 | '''
14 |
15 |
16 | rule decompress:
17 | input:
18 | r1 = "ecoli_1K.1.fq.gz",
19 | r2 = "ecoli_1K.2.fq.gz"
20 | output:
21 | r1 = "ecoli_1K.1.fq",
22 | r2 = "ecoli_1K.2.fq"
23 | shell:
24 | '''
25 | R1={input.r1}
26 | R2={input.r2}
27 | pigz -dc {input.r1} > ${{R1%.gz}}
28 | pigz -dc {input.r2} > ${{R2%.gz}}
29 | '''
30 |
31 |
32 | rule all:
33 | input:
34 | "ecoli_1K.1.fq",
35 | "ecoli_1K.2.fq"
36 |
--------------------------------------------------------------------------------
/scripts/batch_prokka.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import glob
4 | import os
5 | import pprint
6 | import sys
7 |
8 |
9 | def run(dir_list, outdir, logdir):
10 | cmd_list = []
11 | count = 1
12 | with open(dir_list) as f:
13 | for dir in f:
14 | count += 1
15 | bin_list = glob.glob(dir.strip() + "/*.fa")
16 | for bin in bin_list:
17 |                 bin_id = os.path.splitext(os.path.basename(bin))[0]
18 | prokka_dir = os.path.join(outdir,
19 | os.path.basename(dir.strip()))
20 | log = os.path.join(logdir, bin_id + ".prokka.log")
21 | cmd = "prokka %s --outdir %s --prefix %s --kingdom Bacteria --cpus 8 2> %s" % (
22 | bin.strip(), prokka_dir, bin_id, log)
23 | cmd_list.append(cmd)
24 | if count == 2:
25 | break
26 | return cmd_list
27 |
28 |
29 | cmd_list = run(sys.argv[1], sys.argv[2], sys.argv[3])
30 | pprint.pprint(cmd_list)
31 |
--------------------------------------------------------------------------------
/metapi/profiles/generic/key_mapping.yaml:
--------------------------------------------------------------------------------
1 | # only parameters defined in key_mapping (see below) are passed to the command in the order specified.
2 | system: "slurm" #check if system is defined below
3 |
4 | slurm:
5 | command: "sbatch --parsable"
6 | key_mapping:
7 | name: "--job-name={}"
8 | threads: "-n {}"
9 | mem: "--mem={}g"
10 | account: "--account={}"
11 | queue: "--partition={}"
12 | time: "--time={}"
13 | nodes: "-N {}"
14 | pbs:
15 | command: "qsub"
16 | key_mapping:
17 | name: "-N {}"
18 | account: "-A {}"
19 | queue: "-q {}"
20 | threads: "-l nodes=1:ppn={}" # always use 1 node
21 | mem: "-l mem={}gb"
22 | time: "-l walltime={}00" #min= seconds x 100
23 | lsf:
24 | command: "bsub -e lsf_%J.log -o lsf_%J.log"
25 | key_mapping:
26 | queue: "-q {}"
27 | name: "-J {}"
28 | threads: "-n {}"
29 | mem: '-R "rusage[mem={}000]"'
30 | account: "-P {}"
31 | nodes: "-C {}"
32 |
33 |
34 |
35 | # for other cluster systems see: https://slurm.schedmd.com/rosetta.pdf
36 |
--------------------------------------------------------------------------------
/metapi/profiles/generic/cluster_config.yaml:
--------------------------------------------------------------------------------
1 | ## This is a yaml file, defining options for specific rules or by default.
2 | ## The '#' defines a comment.
3 | ## the two spaces at the beginning of rows below rulenames are important.
4 | ## For more information see https://snakemake.readthedocs.io/en/stable/executing/cluster-cloud.html#cluster-execution
5 |
6 | # default parameter for all rules
7 | __default__:
8 | #queue: normal
9 | nodes: 1
10 |
11 |
12 | # The following rules in atlas need more time/memory.
13 | # If you need to submit them to different queues you can configure this as outlined.
14 |
15 | # run_megahit:
16 | # queue: bigmem
17 | # run_spades:
18 | # queue: bigmem
19 |
20 | #gtdb-tk classify uses 'large_mem' and a long run time
21 | # classify:
22 | # queue: bigmem-long
23 |
24 | # run_checkm_lineage_wf:
25 | # queue: long
26 |
27 | # run_all_checkm_lineage_wf:
28 | # queue: long
29 |
30 | # You can overwrite values for specific rules
31 | # rulename:
32 | # queue: long
33 | # account: ""
34 | # time: # h
35 | # threads:
36 |
--------------------------------------------------------------------------------
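scheduler.py itself is not included in this dump, so the following is only a sketch, under the assumption that it combines the __default__/per-rule values from cluster_config.yaml with the entries in key_mapping.yaml to build the submit command. The per-rule settings below (run_megahit on a bigmem queue) are hypothetical.

# pick the scheduler block from key_mapping.yaml, then format each known key
# from the per-rule settings in the order the mapping lists them
key_mapping = {
    "command": "sbatch --parsable",
    "key_mapping": {
        "name": "--job-name={}",
        "threads": "-n {}",
        "mem": "--mem={}g",
        "queue": "--partition={}",
        "time": "--time={}",
        "nodes": "-N {}",
    },
}

# __default__ plus a hypothetical per-rule override, as cluster_config.yaml suggests
job_settings = {"name": "run_megahit", "threads": 8, "mem": 50, "queue": "bigmem", "nodes": 1}

args = [
    template.format(job_settings[key])
    for key, template in key_mapping["key_mapping"].items()
    if key in job_settings
]
print(key_mapping["command"], " ".join(args), "jobscript.sh")
# sbatch --parsable --job-name=run_megahit -n 8 --mem=50g --partition=bigmem -N 1 jobscript.sh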
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | # Check https://circleci.com/docs/2.0/language-python/ for more details
2 |
3 | version: 2.1
4 |
5 | orbs:
6 | python: circleci/python@0.2.1
7 |
8 | jobs:
9 | build-and-test:
10 | executor: python/default
11 | steps:
12 | - run:
13 | name: conda create
14 | command: |
15 | ls $HOME
16 | if [ ! -d "/home/circleci/conda" ]; then
17 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
18 | /bin/bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/conda
19 | else
20 | echo "Miniconda is already installed, continuing to build."
21 | fi
22 | - save_cache:
23 | paths:
24 | - /home/circleci/conda
25 | key: v2-dependencies
26 |
27 | - run:
28 | name: conda build
29 | command: |
30 | cd ~/metapi
31 | /bin/bash ~/metapi/conda
32 | conda build ./
33 | - store_artifacts:
34 | path: ~/repo/build
35 | destination: singularity-containers
36 |
37 | workflows:
38 | main:
39 | jobs:
40 | - build-and-test
41 |
--------------------------------------------------------------------------------
/metapi/wrappers/vamb/write_abundances.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | import vamb
4 | from pathlib import Path
5 |
6 |
7 | def write_abundances(
8 | mask_refhash: Path, bampath: Path, min_identity: float, outfile: Path
9 | ):
10 | """For every sample, compute the abundances given the mask and refhashes"""
11 | loadnpz = np.load(mask_refhash)
12 | refhash = loadnpz["refhash"]
13 | mask = loadnpz["mask"]
14 | refhash = refhash.reshape(1)[0]
15 | (abundance, _) = vamb.parsebam.Abundance.run_pycoverm(
16 | paths=[bampath],
17 | minid=min_identity,
18 | target_refhash=refhash,
19 | target_identifiers=None,
20 | mask=mask,
21 | )
22 | vamb.vambtools.write_npz(outfile, abundance.ravel())
23 |
24 |
25 | if __name__ == "__main__":
26 | parser = argparse.ArgumentParser()
27 | parser.add_argument("--msk", type=Path, help="mask refhash")
28 | parser.add_argument("--b", type=Path, help=" bam path")
29 | parser.add_argument("--min_id", type=float, help="min identity for alignment")
30 | parser.add_argument("--out", type=Path, help="abundances outfile")
31 |
32 | opt = parser.parse_args()
33 |
34 | write_abundances(opt.msk, opt.b, opt.min_id, opt.out)
35 |
--------------------------------------------------------------------------------
/docs/metapi.dio:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/metapi/wrappers/vamb/abundances_mask.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | from vamb.vambtools import RefHasher
4 | from pathlib import Path
5 |
6 |
7 | def abundances_mask(headers: Path, mask_refhash: Path, min_contig_size: int):
8 | """# Using the headers above, compute the mask and the refhash"""
9 |
10 | mask = []
11 | identifiers = []
12 |
13 | with open(headers) as file:
14 | for line in file:
15 | # SN:S27C112075 LN:2239
16 | (sn, ln) = line.split("\t")
17 | if sn[:3] != "SN:" or ln[:3] != "LN:":
18 | raise ValueError("Unknown format")
19 | passed = int(ln[3:]) >= min_contig_size
20 | mask.append(passed)
21 | if passed:
22 | identifiers.append(sn[3:])
23 |
24 | np.savez_compressed(
25 | mask_refhash,
26 | mask=np.array(mask, dtype=bool),
27 | refhash=RefHasher.hash_refnames(identifiers),
28 | )
29 |
30 |
31 | if __name__ == "__main__":
32 | parser = argparse.ArgumentParser()
33 | parser.add_argument("--h", type=Path, help=" Headers file")
34 | parser.add_argument("--msk", type=Path, help="mask refhash")
35 |
36 | parser.add_argument("--minsize", type=int, help="min contig size")
37 |
38 | opt = parser.parse_args()
39 |
40 | abundances_mask(opt.h, opt.msk, opt.minsize)
41 |
--------------------------------------------------------------------------------
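The headers file abundances_mask.py expects holds "SN:<contig>\tLN:<length>" pairs, as the in-code comment shows. Here is the same filtering step on a few hypothetical header lines, without vamb or an .npz output, to make the expected input format and the resulting mask concrete.

min_contig_size = 2000
header_lines = ["SN:S27C112075\tLN:2239", "SN:S27C000017\tLN:815", "SN:S27C004242\tLN:5120"]

mask, identifiers = [], []
for line in header_lines:
    sn, ln = line.split("\t")
    if sn[:3] != "SN:" or ln[:3] != "LN:":
        raise ValueError("Unknown format")
    passed = int(ln[3:]) >= min_contig_size   # keep contigs at or above the cutoff
    mask.append(passed)
    if passed:
        identifiers.append(sn[3:])

print(mask)          # [True, False, True]
print(identifiers)   # ['S27C112075', 'S27C004242']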
/scripts/checkm_link.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import argparse
5 |
6 |
7 | def link(link_dir, batch_num, bin_list):
8 | bins = []
9 | with open(bin_list, "r") as ih:
10 | for line in ih:
11 | bins.append(os.path.abspath(line.strip()))
12 |
13 | os.makedirs(link_dir, exist_ok=True)
14 |
15 | if len(bins) > 0:
16 | for batch_id in range(0, len(bins), batch_num):
17 | batch_dir = os.path.join(link_dir, "bins_%d" % batch_id)
18 | os.makedirs(batch_dir, exist_ok=True)
19 |
20 | for bin_fa in bins[batch_id : batch_id + batch_num]:
21 | os.symlink(bin_fa, os.path.join(batch_dir, os.path.basename(bin_fa)))
22 | else:
23 | os.makedirs(os.path.join(link_dir, "bins_0"), exist_ok=True)
24 |
25 |
26 | def main():
27 | parser = argparse.ArgumentParser("checkm link")
28 | parser.add_argument("--link_dir", help="a dir contains checkm input link")
29 | parser.add_argument(
30 | "--batch_num",
31 | type=int,
32 | default=500,
33 |         help="how many bins per checkm run, default: 500",
34 | )
35 | parser.add_argument("--bin_list", help="a file contains all bin path")
36 | args = parser.parse_args()
37 |
38 | link(args.link_dir, args.batch_num, args.bin_list)
39 |
40 |
41 | if __name__ == "__main__":
42 | main()
43 |
--------------------------------------------------------------------------------
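checkm_link.py groups bins into bins_<offset> subdirectories of symlinks so each CheckM run sees at most batch_num genomes. The batching arithmetic alone is shown below on hypothetical bin paths.

batch_num = 500
bins = [f"/path/to/mags/s{i}.bin.1.fa" for i in range(1, 1201)]   # 1200 hypothetical bins

for batch_id in range(0, len(bins), batch_num):
    batch = bins[batch_id : batch_id + batch_num]
    print(f"bins_{batch_id}: {len(batch)} bins")
# bins_0: 500 bins
# bins_500: 500 bins
# bins_1000: 200 bins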
/metapi/wrappers/vamb/create_abundances.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import argparse
3 | import vamb
4 | from pathlib import Path
5 |
6 |
7 | def create_abundances(
8 | abundances: list[Path], mask_refhash: Path, min_id: float, outfile: Path
9 | ):
10 | """Merge the abundances to a single Abundance object and save it"""
11 | refhash = np.load(mask_refhash)["refhash"]
12 |
13 | n_samples = len(abundances)
14 | first = vamb.vambtools.read_npz(abundances[0])
15 | print(len(first), n_samples)
16 | print(first.shape)
17 | matrix = np.empty((len(first), n_samples), dtype=np.float32)
18 | matrix[:, 0] = first
19 | for i, path in enumerate(abundances[1:]):
20 | matrix[:, i + 1] = vamb.vambtools.read_npz(path)
21 | abundance = vamb.parsebam.Abundance(
22 | matrix, [str(i) for i in abundances], min_id, refhash
23 | )
24 | abundance.save(outfile)
25 |
26 |
27 | if __name__ == "__main__":
28 | parser = argparse.ArgumentParser()
29 | parser.add_argument("--msk", type=Path, help="mask refhash")
30 |     parser.add_argument("--ab", type=Path, nargs="+", help="abundances list of files")
31 | parser.add_argument("--min_id", type=float, help="min identity for alignment")
32 | parser.add_argument("--out", type=Path, help="abundances outfile")
33 |
34 | opt = parser.parse_args()
35 |
36 | create_abundances(opt.ab, opt.msk, opt.min_id, opt.out)
37 |
--------------------------------------------------------------------------------
/scripts/merge_fasta_by_len.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import gzip
4 | import os
5 |
6 | from Bio import SeqIO
7 |
8 |
9 | def merge_fa_by_len(falist, minlen, maxlen, outfa):
10 | with open(falist, 'r') as falist_h, open(outfa, 'w') as out_h:
11 | for fa_file in falist_h:
12 | fa_file = fa_file.rstrip()
13 | if fa_file.endswith(".gz"):
14 | fa_h = gzip.open(fa_file, 'rt')
15 | else:
16 | fa_h = open(fa_file, 'r')
17 | for record in SeqIO.parse(fa_h, 'fasta'):
18 | if (len(record.seq) >= minlen) and (len(record.seq) <= maxlen):
19 | SeqIO.write(record, out_h, 'fasta')
20 | fa_h.close()
21 |
22 |
23 | def main():
24 | parser = argparse.ArgumentParser(description="merge many fasta file to a fasta file by length cutoff")
25 | parser.add_argument('--falist', type=str, help='input file contain fasta path list')
26 | parser.add_argument('--minlen', type=int, help='sequences min length cutoff', default=1)
27 | parser.add_argument('--maxlen', type=int, help='sequences max length cutoff', default=10000000000)
28 | parser.add_argument('--outfa', type=str, help='output fasta contain sequences which length between [minlen, maxlen]')
29 | args = parser.parse_args()
30 |
31 | merge_fa_by_len(args.falist, args.minlen, args.maxlen, args.outfa)
32 |
33 |
34 | if __name__ == "__main__":
35 | main()
36 |
--------------------------------------------------------------------------------
/metapi/wrappers/misc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import argparse
5 | import subprocess
6 | import sys
7 |
8 |
9 | def link_or_cat(args):
10 | fq_gz = os.path.join(args.output_dir, args.basename + ".fq.gz")
11 |
12 | if (not os.path.exists(fq_gz)) or (os.path.getsize(fq_gz) == 0):
13 | subprocess.call(
14 | f'''rm -rf {fq_gz}''',
15 | shell=True, stdout=sys.stdout, stderr=sys.stderr)
16 |
17 | if len(args.input_file) == 1:
18 | reads = os.path.realpath(args.input_file[0])
19 | subprocess.call(
20 | f'''
21 | pushd {args.output_dir} && \
22 | ln -s {reads} {args.basename}.fq.gz && \
23 | popd
24 | ''', shell=True, stdout=sys.stdout, stderr=sys.stderr)
25 | else:
26 | reads = " ".join(args.input_file)
27 | subprocess.call(
28 | f'''
29 | cat {reads} > {args.output_dir}/{args.basename}.fq.gz
30 | ''', shell=True, stdout=sys.stdout, stderr=sys.stderr)
31 |
32 |
33 | def main():
34 | parser = argparse.ArgumentParser("metapi misc")
35 | parser.add_argument("--basename", dest="basename")
36 | parser.add_argument("--input-file", dest="input_file", nargs="+")
37 | parser.add_argument("--output-dir", dest="output_dir")
38 |
39 | args = parser.parse_args()
40 |
41 | link_or_cat(args)
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
46 |
--------------------------------------------------------------------------------
/scripts/filter_pe_fastq_by_len.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import gzip
4 | from Bio import SeqIO, bgzf
5 |
6 | def filter_pe_fasq_by_len(fq_1, fq_2, minlen, prefix):
7 | '''filter pe reads by min length'''
8 | fq_1_ = prefix + ".gt" + str(minlen) + ".1.fq.gz"
9 | fq_2_ = prefix + ".gt" + str(minlen) + ".2.fq.gz"
10 | with bgzf.BgzfWriter(fq_1_, 'wb') as out_1, bgzf.BgzfWriter(fq_2_, 'wb') as out_2:
11 | with gzip.open(fq_1, 'rt') as in_1, gzip.open(fq_2, 'rt') as in_2:
12 | for rec_a, rec_b in zip(SeqIO.parse(in_1, 'fastq'), SeqIO.parse(in_2, 'fastq')):
13 | if (len(rec_a.seq) > minlen) and (len(rec_b.seq) > minlen):
14 | SeqIO.write(rec_a, out_1, 'fastq')
15 | SeqIO.write(rec_b, out_2, 'fastq')
16 |
17 | def main():
18 | '''main function'''
19 | parser = argparse.ArgumentParser(
20 | description='filter fastq file by reads length')
21 | parser.add_argument('-1', '--read1', help='paired-end fastq file one')
22 | parser.add_argument('-2', '--read2', help='paired-end fastq file two')
23 | parser.add_argument('-l', '--minlen', type=int, default=80,
24 | help='remove reads if length < min-len')
25 | parser.add_argument('-p', '--prefix',
26 | help='output prefix')
27 | args = parser.parse_args()
28 |
29 | filter_pe_fasq_by_len(args.read1, args.read2, args.minlen, args.prefix)
30 |
31 | if __name__ == '__main__':
32 | main()
33 |
--------------------------------------------------------------------------------
/metapi/visualization/dada2_stats_barplot.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | library(ggplot2)
4 | library(dplyr)  # needed for the %>% pipe; other dplyr/tidyr/ggpubr calls are namespaced
5 |
6 | dada2_stats_barplot <- function(df, stack=FALSE, pretty=FALSE)
7 | {
8 | df <- df %>% dplyr::arrange(`non-chimeric`)
9 | df_l <- df %>%
10 | dplyr::select("sample-id", "input", "filtered", "denoised", "non-chimeric") %>%
11 | tidyr::pivot_longer(!"sample-id", names_to="step", values_to="count") %>%
12 | dplyr::mutate(step=factor(step,
13 | levels=c("input", "filtered", "denoised", "non-chimeric")),
14 | `sample-id`=factor(`sample-id`,
15 | levels=df$`sample-id`))
16 |
17 | position = position_dodge(0.8)
18 | if (stack) { position = "stack" }
19 |
20 | if (pretty) {
21 | p <-
22 | ggpubr::ggbarplot(df_l, x="sample-id", y="count",
23 | fill="step", color="step", x.text.angle=90,
24 | stat="identity", position=position)
25 | } else {
26 | p <-
27 | ggplot(df_l, aes(x=`sample-id`, y=count)) +
28 | geom_bar(aes(color=step, fill=step),
29 | stat="identity", position=position, width=0.7) +
30 | theme_classic() +
31 | theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5,
32 | size=12, color="black"),
33 | axis.text.y=element_text(size=12, color="black"))
34 | }
35 |
36 | print(p)
37 | return(p)
38 | }
--------------------------------------------------------------------------------
/scripts/fasta_length_tab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from Bio.SeqIO.FastaIO import SimpleFastaParser
3 | import argparse
4 | """get each sequence length from a fasta file and pring it to a file, then plot"""
5 |
6 | def gen_fa_len_tab(fa_file, len_out):
7 | with open(len_out, 'w') as out_handle:
8 | with open(fa_file, 'r') as in_handle:
9 | for title, seq in SimpleFastaParser(in_handle):
10 | #out_handle.write(title + "\t" + str(len(seq)))
11 | # just print id and seq length
12 | out_handle.write(title.split(' ')[0] + "\t" + str(len(seq)) + "\n")
13 |
14 | # megahit contigs header contains contigs length info
15 | def gen_fa_len_tab_megahit(fa_file, len_out):
16 | with open(len_out, 'w') as out_handle:
17 | with open(fa_file, 'r') as in_handle:
18 | for title, seq in SimpleFastaParser(in_handle):
19 |             # megahit headers end with "len=<length>", e.g. ">k119_1 flag=1 multi=2.0000 len=430"
20 |             seq_len = title.split(' ')[-1].split('=')[-1]
21 |             out_handle.write(title + "\t" + seq_len + "\n")
22 |
23 | def main():
24 | parser = argparse.ArgumentParser(description='get fasta length info')
25 | parser.add_argument('--fasta', type=str, help='fasta file')
26 | parser.add_argument('--out', type=str, help='fasta length output file')
27 | args = parser.parse_args()
28 |
29 | gen_fa_len_tab(args.fasta, args.out)
30 |
31 |     # if the input fasta was assembled by megahit, use the header-based variant instead:
32 |     # gen_fa_len_tab_megahit(args.fasta, args.out)
33 |
34 | if __name__ == '__main__':
35 | main()
--------------------------------------------------------------------------------
/scripts/kraken2_reads_merger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import os
5 | import sys
6 | from glob import glob
7 |
8 |
9 | def merger_reads(inputdir, outputdir):
10 | merger = {}
11 | for i in glob(inputdir.rstrip("/") + "/*/*.1.fq.gz"):
12 | taxid = int(os.path.basename(i).split(".")[1])
13 | if taxid in merger:
14 | merger[taxid].append(i)
15 | else:
16 | merger[taxid] = [i]
17 |
18 | for taxid in merger:
19 | r1_str = " ".join(merger[taxid])
20 | r2_str = r1_str.replace("1.fq.gz", "2.fq.gz")
21 | r1 = os.path.join(outputdir, "%d.1.fq.gz" % taxid)
22 | r2 = os.path.join(outputdir, "%d.2.fq.gz" % taxid)
23 | if len(merger[taxid]) > 1:
24 | cmd = 'cat %s > %s && rm -rf %s && cat %s > %s && rm -rf %s' % (r1_str, r1, r1_str, r2_str, r2, r2_str)
25 | print(cmd)
26 | else:
27 | cmd = 'mv %s %s && mv %s %s' % (r1_str, r1, r2_str, r2)
28 | print(cmd)
29 |
30 |
31 | def main(args_):
32 | parser = argparse.ArgumentParser("merge kraken2 partition reads of many samples")
33 | parser.add_argument(
34 | '-i',
35 | '--input_dir',
36 | help='a directory contains many sample-specific directory'
37 | )
38 | parser.add_argument(
39 | '-o',
40 | '--output_dir',
41 | help='output directory'
42 | )
43 |
44 | args = parser.parse_args(args_)
45 | merger_reads(args.input_dir, args.output_dir)
46 |
47 |
48 | if __name__ == '__main__':
49 | main(sys.argv[1:])
50 |
--------------------------------------------------------------------------------
/scripts/animf_cluster.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import pandas as pd
5 | import drep
6 |
7 | def check_drep_exists():
8 | try:
9 | from drep import argumentParser
10 | print("drep version: %s" % argumentParser.version())
11 | except ImportError:
12 | print("drep doesn't exists")
13 |
14 |
15 | def cluster(Bdb, Cdb, work_dir):
16 | Ndb = pd.DataFrame()
17 |     for bdb, name in drep.d_cluster.iterate_clusters(Bdb, Cdb):
18 | genome_list = bdb["location"].tolist()
19 | anin_folder = os.path.join(work_dir, "ANImf_files")
20 |
21 | org_lengths = {}
22 | files = []
23 | deltafiles = []
24 |
25 | # genome1_vs_genome2.delta
26 | # genome1_vs_genome2.filtered.delta
27 | for g1 in genome_list:
28 | cur_folder = os.path.join(anin_folder, os.path.basename(g1))
29 | org_lengths[os.path.basename(g1)] = \
30 | drep.d_filter.calc_fasta_length(g1)
31 | for g2 in genome_list:
32 | file_name = "{0}/{1}_vs_{2}".format(
33 | cur_folder,
34 | os.path.basename(g1),
35 | os.path.basename(g2)
36 | )
37 | deltafiles.append(file_name + ".filtered.delta")
38 | df = drep.d_cluster.process_deltafiles(deltafiles,
39 | org_lengths,
40 | coverage_method="larger")
41 |
42 |
43 |
44 |
45 | def main():
46 | pass
47 |
48 |
49 | if __name__ == '__main__':
50 | main()
--------------------------------------------------------------------------------
/scripts/find_path.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 |
4 | def find_path(dir, suffix):
5 | path = {}
6 | for f in os.listdir(dir):
7 | if f.endswith(suffix):
8 |             key = f[: -len(suffix)]  # note: rstrip() strips a character set, not a suffix
9 | path[key] = os.path.join(dir, f)
10 | return path
11 |
12 | def find_path_tag(dir, tag):
13 | if tag == "raw":
14 | r1 = {}
15 | r2 = {}
16 | for f in os.listdir(dir):
17 |             if f.endswith("1.fq.gz"):
18 |                 key = f[: -len("1.fq.gz")].rstrip("._-")
19 |                 r1[key] = os.path.join(dir, f)
20 |             if f.endswith("2.fq.gz"):
21 |                 key = f[: -len("2.fq.gz")].rstrip("._-")
22 |                 r2[key] = os.path.join(dir, f)
23 | return (r1, r2)
24 | elif tag == "clean" or tag == "rmhost":
25 | r1 = {}
26 | r2 = {}
27 | rs = {}
28 | rt = {}
29 | for f in os.listdir(dir):
30 |             if f.endswith(tag + ".1.fq.gz"):
31 |                 key = f[: -len(tag + ".1.fq.gz")].rstrip("._-")
32 |                 r1[key] = os.path.join(dir, f)
33 |             if f.endswith(tag + ".2.fq.gz"):
34 |                 key = f[: -len(tag + ".2.fq.gz")].rstrip("._-")
35 |                 r2[key] = os.path.join(dir, f)
36 |             if f.endswith(tag + ".single.fq.gz"):
37 |                 key = f[: -len(tag + ".single.fq.gz")].rstrip("._-")
38 |                 rs[key] = os.path.join(dir, f)
39 |             if f.endswith(tag + ".stat_out"):
40 |                 key = f[: -len(tag + ".stat_out")].rstrip("._-")
41 |                 rt[key] = os.path.join(dir, f)
42 | return (r1, r2, rs, rt)
43 |
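A quick illustration of the suffix stripping used above (no real files needed):

    name = "sampleA_rmhost.1.fq.gz"
    suffix = "rmhost.1.fq.gz"
    # slice off the literal suffix, then trim any leftover separator characters
    key = name[: -len(suffix)].rstrip("._-")
    print(key)  # -> "sampleA"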
--------------------------------------------------------------------------------
/metapi/wrappers/prokka_wrapper.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import time
4 | import subprocess
5 |
6 |
7 | PROKKA_SUFFIX = ["err", "log", "faa", "ffn", "fna", "fsa",
8 | "gbk", "gff", "sqn", "tbl", "tsv", "txt"]
9 |
10 | bin_list = glob.glob(snakemake.input["mags_dir"] + "/*.fa.gz")
11 | gff_count = 0
12 |
13 | for bin_fa in bin_list:
14 | bin_id = os.path.basename(os.path.splitext(os.path.splitext(bin_fa)[0])[0])
15 | output_dir = os.path.join(snakemake.params["output_dir"], bin_id)
16 | gff_file = os.path.join(output_dir, bin_id + ".gff")
17 |
18 |     subprocess.run(f'''echo "\nProcessing {bin_fa}\n" >> {snakemake.log}''', shell=True)
19 |
20 | # https://github.com/tseemann/prokka/pull/130
21 | # Uncompressing 1000's of gzip'ed fasta files just to run them through prokka can be a bit of pain.
22 |     subprocess.run(
23 | f'''
24 | prokka <(zcat {bin_fa}) \
25 | --force \
26 | --centre X \
27 | --compliant \
28 | --cpus {snakemake.threads} \
29 | --outdir {output_dir} \
30 | --locustag {bin_id} \
31 | --prefix {bin_id} \
32 | --kingdom {snakemake.params["kingdom"]} \
33 | 2>> {snakemake.log}
34 |         ''', shell=True, executable="/bin/bash")  # process substitution <() requires bash
35 |
36 |     if os.path.exists(gff_file):
37 |         gff_count += 1
38 |
39 |     # compress this bin's prokka outputs before moving on to the next bin
40 |     for suffix in PROKKA_SUFFIX:
41 |         prokka_f = os.path.join(output_dir, f'''{bin_id}.{suffix}''')
42 |         if os.path.exists(prokka_f):
43 |             subprocess.run(f'''pigz -f {prokka_f}''', shell=True)
44 |
45 | if gff_count == len(bin_list):
46 |     subprocess.run(f'''touch {snakemake.output["done"]}''', shell=True)
--------------------------------------------------------------------------------
/scripts/taxonomy_info_covert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import csv
4 | import os
5 |
6 | def parse_lca_classify(taxonomy_csv, output):
7 |     # taxonomy = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
8 | headers = ["ID", "status", "lineage"]
9 | rows = []
10 | with open(taxonomy_csv, 'r') as csv_h:
11 | f_csv = csv.DictReader(csv_h)
12 | # print(type(f_csv))
13 | for row in f_csv:
14 | row_dict = {}
15 | row_dict["ID"] = os.path.basename(row["ID"])
16 | row_dict["status"] = row["status"]
17 | row_dict["lineage"] = row["superkingdom"] + ";" + \
18 | row["phylum"] + ";" + \
19 | row["order"] + ";" + \
20 | row["class"] + ";" + \
21 | row["family"] + ";" + \
22 | row["genus"] + ";" + \
23 | row["species"]
24 | rows.append(row_dict)
25 |
26 | with open(output, 'w') as csv_out:
27 | csv_f = csv.DictWriter(csv_out, headers)
28 | csv_f.writeheader()
29 | csv_f.writerows(rows)
30 |
31 | def main():
32 | parser = argparse.ArgumentParser(description="convert sourmash lca classify resuts to metacoder input")
33 | parser.add_argument('-csv', type=str, help="sourmash lca classify results csv file")
34 |     parser.add_argument('-out', type=str, help='converted csv file')
35 | args = parser.parse_args()
36 | parse_lca_classify(args.csv, args.out)
37 |
38 | if __name__ == '__main__':
39 | main()
40 |
--------------------------------------------------------------------------------
/metapi/tooler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import concurrent.futures
5 | import pandas as pd
6 |
7 |
8 | def parse(stats_file):
9 | if os.path.exists(stats_file):
10 | try:
11 | df = pd.read_csv(stats_file, sep="\t")
12 | except pd.errors.EmptyDataError:
13 | print("%s is empty, please check" % stats_file)
14 | return None
15 |
16 | if not df.empty:
17 | return df
18 | else:
19 | return None
20 | else:
21 | print("%s is not exists" % stats_file)
22 | return None
23 |
24 |
25 | def merge(input_list, func, workers, **kwargs):
26 | df_list = []
27 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
28 | for df in executor.map(func, input_list):
29 | if df is not None:
30 | df_list.append(df)
31 |
32 | df_ = pd.concat(df_list)
33 |
34 | if "output" in kwargs:
35 | df_.to_csv(kwargs["output"], sep="\t", index=False)
36 | return df_
37 |
38 |
39 | def merge2(input_list, func, workers, **kwargs):
40 | df1_list = []
41 | df2_list = []
42 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
43 | for df1, df2 in executor.map(func, input_list):
44 | if df1 is not None:
45 | df1_list.append(df1)
46 | if df2 is not None:
47 | df2_list.append(df2)
48 |
49 | df_1 = pd.concat(df1_list)
50 | df_2 = pd.concat(df2_list)
51 |
52 | if "output_1" in kwargs:
53 | df_1.to_csv(kwargs["output_1"], sep="\t", index=False)
54 | if "output_2" in kwargs:
55 | df_2.to_csv(kwargs["output_2"], sep="\t", index=False)
56 |
57 | return df_1, df_2
58 |
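A minimal sketch of how parse() and merge() above fit together; the stats file names are hypothetical:

    if __name__ == "__main__":
        stats_files = ["sample1.stats.tsv", "sample2.stats.tsv"]
        # parse() runs on each file in a process pool; missing or empty files
        # are skipped and the remaining data frames are concatenated
        merged = merge(stats_files, parse, workers=2, output="all_samples.stats.tsv")
        print(merged.shape)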
--------------------------------------------------------------------------------
/metapi/snakefiles/gene_wf.smk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env snakemake
2 |
3 | import os
4 | import metapi
5 | import pandas as pd
6 | from snakemake.utils import min_version
7 |
8 | min_version("7.0")
9 |
10 | shell.executable("bash")
11 |
12 | METAPI_DIR = metapi.__path__[0]
13 | WRAPPER_DIR = os.path.join(METAPI_DIR, "wrappers")
14 |
15 |
16 | RMHOST_DO = any([
17 | config["params"]["rmhost"]["bwa"]["do"],
18 | config["params"]["rmhost"]["bowtie2"]["do"]])
19 |
20 |
21 | TRIMMING_DO = any([
22 | config["params"]["trimming"]["sickle"]["do"],
23 | config["params"]["trimming"]["fastp"]["do"],
24 | config["params"]["trimming"]["trimmomatic"]["do"]])
25 |
26 |
27 | ASSEMBLERS = []
28 | if config["params"]["assembly"]["megahit"]["do"]:
29 | ASSEMBLERS += ["megahit"]
30 | if config["params"]["assembly"]["idba_ud"]["do"]:
31 | ASSEMBLERS += ["idba_ud"]
32 | if config["params"]["assembly"]["metaspades"]["do"]:
33 | ASSEMBLERS += ["metaspades"]
34 | if config["params"]["assembly"]["spades"]["do"]:
35 | ASSEMBLERS += ["spades"]
36 |
37 |
38 | SAMPLES, DATA_TYPE = metapi.parse_samples(config["params"]["samples"])
39 |
40 |
41 | include: "../rules/raw.smk"
42 | include: "../rules/trimming.smk"
43 | include: "../rules/rmhost.smk"
44 | include: "../rules/qcreport.smk"
45 | include: "../rules/assembly.smk"
46 | include: "../rules/predict_scaftigs.smk"
47 | include: "../rules/dereplicate_cds.smk"
48 | include: "../rules/upload.smk"
49 |
50 |
51 | rule all:
52 | input:
53 | rules.raw_all.input,
54 | rules.trimming_all.input,
55 | rules.rmhost_all.input,
56 | rules.qcreport_all.input,
57 | rules.assembly_all.input,
58 | rules.predict_scaftigs_gene_all.input,
59 | rules.dereplicate_gene_all.input,
60 | rules.upload_all.input
61 |
--------------------------------------------------------------------------------
/tests/test_spades/Snakefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env snakemake
2 |
3 | rule download_reads:
4 | output:
5 | r1 = "test/reads/ecoli_1K.1.fq.gz",
6 | r2 = "test/reads/ecoli_1K.2.fq.gz"
7 | threads:
8 | 1
9 | shell:
10 | '''
11 |         curl --create-dirs -o test/reads/ecoli_1K.1.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_1.fq.gz
12 |         curl --create-dirs -o test/reads/ecoli_1K.2.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_2.fq.gz
13 | '''
14 |
15 |
16 | rule prepare_samples_tsv:
17 | input:
18 | r1 = "test/reads/ecoli_1K.1.fq.gz",
19 | r2 = "test/reads/ecoli_1K.2.fq.gz"
20 | output:
21 | "test/samples.tsv"
22 | threads:
23 | 1
24 | shell:
25 | '''
26 | fd -t f fq.gz $(pwd)/test/reads | \
27 | sort | uniq | paste - - | \
28 | awk 'BEGIN{{print "sample_id\tassembly_group\tbinning_group\tfq1\tfq2"}};{{print "ecoli_1K\tecoli_1K\tecoli_1K\t" $0}}' \
29 | > {output}
30 | '''
31 |
32 |
33 | rule metapi_init:
34 | input:
35 | "test/samples.tsv"
36 | output:
37 | "test/config.yaml"
38 | conda:
39 | "metapi.yaml"
40 | shell:
41 | '''
42 | pushd test
43 | metapi init -d . -s $(basename {input}) -b assembly --assembler spades
44 | popd
45 | '''
46 |
47 |
48 | rule metapi_run_assembly:
49 | input:
50 | "test/config.yaml"
51 | output:
52 | "test/results/04.assembly/report/assembly_stats_spades.tsv"
53 | conda:
54 | "metapi.yaml"
55 | shell:
56 | '''
57 | pushd test
58 | metapi mag_wf assembly_all --run-local --use-conda
59 | popd
60 | '''
61 |
62 |
63 | rule all:
64 | input:
65 | "test/results/04.assembly/report/assembly_stats_spades.tsv"
66 |
--------------------------------------------------------------------------------
/metapi/rules/qcreport.smk:
--------------------------------------------------------------------------------
1 | STEPS = ["raw"]
2 | if TRIMMING_DO:
3 | STEPS += ["trimming"]
4 | if RMHOST_DO:
5 | STEPS += ["rmhost"]
6 |
7 | SAMPLESDIR = os.path.join(config["output"][STEPS[-1]])
8 |
9 | if config["params"]["qcreport"]["do"]:
10 | rule qcreport_summary:
11 | input:
12 | expand(os.path.join(config["output"]["qcreport"], "{step}_stats.tsv"),
13 | step=STEPS)
14 | output:
15 | summary_l = os.path.join(config["output"]["qcreport"], "qc_stats_l.tsv"),
16 | summary_w = os.path.join(config["output"]["qcreport"], "qc_stats_w.tsv")
17 | priority:
18 | 30
19 | threads:
20 | config["params"]["qcreport"]["seqkit"]["threads"]
21 | run:
22 | df = metapi.merge(input, metapi.parse, threads)
23 | df = metapi.compute_host_rate(df, STEPS, SAMPLES_ID_LIST, allow_miss_samples=True, output=output.summary_l)
24 | metapi.qc_summary_merge(df, output=output.summary_w)
25 |
26 |
27 | rule qcreport_plot:
28 | input:
29 | rules.qcreport_summary.output
30 | output:
31 | os.path.join(config["output"]["qcreport"], "qc_reads_num_barplot.pdf")
32 | priority:
33 | 30
34 | run:
35 | df = metapi.parse(input[0])
36 | metapi.qc_bar_plot(df, "seaborn", output=output[0])
37 |
38 |
39 | rule qcreport_all:
40 | input:
41 | os.path.join(config["output"]["qcreport"], "qc_stats_l.tsv"),
42 | os.path.join(config["output"]["qcreport"], "qc_stats_w.tsv"),
43 | os.path.join(config["output"]["qcreport"], "qc_reads_num_barplot.pdf")
44 |
45 | else:
46 | rule qcreport_summary:
47 | input:
48 |
49 |
50 | rule qcreport_plot:
51 | input:
52 |
53 |
54 | rule qcreport_all:
55 | input:
56 |
57 |
58 | localrules:
59 | qcreport_summary,
60 | qcreport_plot,
61 | qcreport_all
--------------------------------------------------------------------------------
/metapi/profiles/generic/slurm_status.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import re
3 | import subprocess as sp
4 | import shlex
5 | import sys
6 | import time
7 | import logging
8 | logger = logging.getLogger("__name__")
9 |
10 | STATUS_ATTEMPTS = 20
11 |
12 | jobid = sys.argv[1]
13 |
14 | for i in range(STATUS_ATTEMPTS):
15 | try:
16 | sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid)))
17 | res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")}
18 | break
19 | except sp.CalledProcessError as e:
20 | logger.error("sacct process error")
21 | logger.error(e)
22 | except IndexError as e:
23 | pass
24 | # Try getting job with scontrol instead in case sacct is misconfigured
25 | try:
26 | sctrl_res = sp.check_output(shlex.split("scontrol -o show job {}".format(jobid)))
27 | m = re.search("JobState=(\w+)", sctrl_res.decode())
28 | res = {jobid: m.group(1)}
29 | break
30 | except sp.CalledProcessError as e:
31 | logger.error("scontrol process error")
32 | logger.error(e)
33 | if i >= STATUS_ATTEMPTS - 1:
34 | print("failed")
35 | exit(0)
36 | else:
37 | time.sleep(1)
38 |
39 | status = res[jobid]
40 |
41 | if (status == "BOOT_FAIL"):
42 | print("failed")
43 | elif (status == "OUT_OF_MEMORY"):
44 | print("failed")
45 | elif (status.startswith("CANCELLED")):
46 | print("failed")
47 | elif (status == "COMPLETED"):
48 | print("success")
49 | elif (status == "DEADLINE"):
50 | print("failed")
51 | elif (status == "FAILED"):
52 | print("failed")
53 | elif (status == "NODE_FAIL"):
54 | print("failed")
55 | elif (status == "PREEMPTED"):
56 | print("failed")
57 | elif (status == "TIMEOUT"):
58 | print("failed")
59 | # Unclear whether SUSPENDED should be treated as running or failed
60 | elif (status == "SUSPENDED"):
61 | print("failed")
62 | else:
63 | print("running")
64 |
--------------------------------------------------------------------------------
/scripts/get_prodigal_gbk_result.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # email: zhujie@genomics.cn
3 | # license: GPL V3
4 | import re
5 |
6 | gbklist = "./gene.coordinate.gbk.pathlist.new"
7 | out = open("./gene.coordinate.stat.out.new", 'w')
8 | out.write("ID\tpartial=00\tpartial=01\tpartial=10\tpartial=11\ttotal_len\ttotal_num\tavg_length\n")
9 |
10 | with open(gbklist, 'r') as path_handler:
11 | for gbkpath in path_handler:
12 | genenum = {}
13 | gene_total_len = 0
14 | gene_total_num = 0
15 | gene_avg_len = 0
16 | partial = ['partial=00', 'partial=01', 'partial=10', 'partial=11']
17 | genenum['partial=00'] = 0
18 | genenum['partial=01'] = 0
19 | genenum['partial=10'] = 0
20 | genenum['partial=11'] = 0
21 |
22 | with open(gbkpath.strip(), 'r') as gbk_handler:
23 | first = next(gbk_handler)
24 | id = re.search(r'(.*?)seqhdr="(CL\d+_L\d+_\d+)_scaffold(.*)', first).group(2)
25 | genenum['id'] = id
26 | for line in gbk_handler:
27 | for tag in partial:
28 | if re.search(tag, line):
29 | genenum[tag] += 1
30 | gene_total_num += 1
31 | if re.search("CDS\s+(complement\()?(<)?(\d+)\.\.(>)?(\d+)(\))", line):
32 | len = re.search("CDS\s+(complement\()?(<)?(\d+)\.\.(>)?(\d+)(\))", line)
33 | gene_total_len += int(len.group(5)) - int(len.group(3)) + 1
34 |
35 | gene_avg_len = round(float(gene_total_len) / float(gene_total_num), 6)
36 |
37 | out.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%f\n" % (
38 | genenum['id'],
39 | genenum['partial=00'],
40 | genenum['partial=01'],
41 | genenum['partial=10'],
42 | genenum['partial=11'],
43 | gene_total_len,
44 | gene_total_num,
45 | gene_avg_len))
46 |
47 | out.close()
48 |
--------------------------------------------------------------------------------
/metapi/wrappers/vamb/concatenate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import os
5 | import argparse
6 | import gzip
7 | import vamb
8 |
9 | parser = argparse.ArgumentParser(
10 | description="""Creates the input FASTA file for Vamb.
11 | Input should be one or more FASTA files, each from a sample-specific assembly.
12 | If keepnames is False, resulting FASTA can be binsplit with separator 'C'.""",
13 | formatter_class=argparse.RawDescriptionHelpFormatter,
14 | add_help=False,
15 | )
16 |
17 | parser.add_argument("outpath", help="Path to output FASTA file")
18 | parser.add_argument("inpaths", help="Paths to input FASTA file(s)", nargs="+")
19 | parser.add_argument(
20 | "-m",
21 | dest="minlength",
22 | metavar="",
23 | type=int,
24 | default=2000,
25 | help="Discard sequences below this length [2000]",
26 | )
27 | parser.add_argument(
28 | "--keepnames", action="store_true", help="Do not rename sequences [False]"
29 | )
30 | parser.add_argument("--nozip", action="store_true", help="Do not gzip output [False]")
31 |
32 | if len(sys.argv) == 1 or sys.argv[1] in ("-h", "--help"):
33 | parser.print_help()
34 | sys.exit()
35 |
36 | args = parser.parse_args()
37 |
38 | # Check inputs
39 | for path in args.inpaths:
40 | if not os.path.isfile(path):
41 | raise FileNotFoundError(path)
42 |
43 | if os.path.exists(args.outpath):
44 | raise FileExistsError(args.outpath)
45 |
46 | parent = os.path.dirname(args.outpath)
47 | if parent != "" and not os.path.isdir(parent):
48 | raise NotADirectoryError(
49 | f'Output file cannot be created: Parent directory "{parent}" is not an existing directory'
50 | )
51 |
52 | # Run the code. Compressing DNA is easy, this is not much bigger than level 9, but
53 | # many times faster
54 | filehandle = (
55 | open(args.outpath, "w")
56 | if args.nozip
57 | else gzip.open(args.outpath, "wt", compresslevel=1)
58 | )
59 | vamb.vambtools.concatenate_fasta(
60 | filehandle, args.inpaths, minlength=args.minlength, rename=(not args.keepnames)
61 | )
62 | filehandle.close()
63 |
--------------------------------------------------------------------------------
/metapi/rules/simulate.smk:
--------------------------------------------------------------------------------
1 | if config["params"]["simulate"]["do"]:
2 | rule simulate_short_reads:
3 | input:
4 | genomes = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "genome")
5 | output:
6 | r1 = os.path.join(config["output"]["simulate"],
7 | "short_reads/{sample}.simulate.1.fq.gz"),
8 | r2 = os.path.join(config["output"]["simulate"],
9 | "short_reads/{sample}.simulate.2.fq.gz"),
10 | abunf = os.path.join(config["output"]["simulate"],
11 | "abundance/{sample}.simulate.abundance.txt")
12 | log:
13 | os.path.join(config["output"]["simulate"], "logs/{sample}.iss.log")
14 | benchmark:
15 | os.path.join(config["output"]["simulate"], "benchmark/iss/{sample}.iss.benchmark.txt")
16 | conda:
17 | config["envs"]["simulate"]
18 | params:
19 | output_prefix = os.path.join(config["output"]["simulate"],
20 | "short_reads/{sample}"),
21 | model = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "model")[0],
22 | reads_num = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "reads_num")[0],
23 | abundance = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "abundance")
24 | threads:
25 | config["params"]["simulate"]["threads"]
26 | script:
27 | "../wrappers/simulate_reads.py"
28 |
29 |
30 | rule simulate_all:
31 | input:
32 | expand([
33 | os.path.join(config["output"]["simulate"],
34 | "short_reads/{sample}.simulate.{read}.fq.gz"),
35 | os.path.join(config["output"]["simulate"],
36 | "abundance/{sample}.simulate.abundance.txt")],
37 | read=["1", "2"],
38 | sample=SAMPLES.index.unique())
39 |
40 | else:
41 | rule simulate_all:
42 | input:
43 |
44 |
45 | localrules:
46 | simulate_all
--------------------------------------------------------------------------------
/metapi/profiles/slurm/slurm-status.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import re
3 | import subprocess as sp
4 | import shlex
5 | import sys
6 | import time
7 | import logging
8 | from CookieCutter import CookieCutter
9 |
10 | logger = logging.getLogger("__name__")
11 |
12 | STATUS_ATTEMPTS = 20
13 |
14 | jobid = sys.argv[1]
15 |
16 | cluster = CookieCutter.get_cluster_option()
17 |
18 | for i in range(STATUS_ATTEMPTS):
19 | try:
20 | sacct_res = sp.check_output(shlex.split(f"sacct {cluster} -P -b -j {jobid} -n"))
21 | res = {
22 | x.split("|")[0]: x.split("|")[1]
23 | for x in sacct_res.decode().strip().split("\n")
24 | }
25 | break
26 | except sp.CalledProcessError as e:
27 | logger.error("sacct process error")
28 | logger.error(e)
29 | except IndexError as e:
30 | logger.error(e)
31 | pass
32 | # Try getting job with scontrol instead in case sacct is misconfigured
33 | try:
34 | sctrl_res = sp.check_output(
35 | shlex.split(f"scontrol {cluster} -o show job {jobid}")
36 | )
37 | m = re.search(r"JobState=(\w+)", sctrl_res.decode())
38 | res = {jobid: m.group(1)}
39 | break
40 | except sp.CalledProcessError as e:
41 | logger.error("scontrol process error")
42 | logger.error(e)
43 | if i >= STATUS_ATTEMPTS - 1:
44 | print("failed")
45 | exit(0)
46 | else:
47 | time.sleep(1)
48 |
49 | status = res[jobid]
50 |
51 | if status == "BOOT_FAIL":
52 | print("failed")
53 | elif status == "OUT_OF_MEMORY":
54 | print("failed")
55 | elif status.startswith("CANCELLED"):
56 | print("failed")
57 | elif status == "COMPLETED":
58 | print("success")
59 | elif status == "DEADLINE":
60 | print("failed")
61 | elif status == "FAILED":
62 | print("failed")
63 | elif status == "NODE_FAIL":
64 | print("failed")
65 | elif status == "PREEMPTED":
66 | print("failed")
67 | elif status == "TIMEOUT":
68 | print("failed")
69 | elif status == "SUSPENDED":
70 | print("running")
71 | else:
72 | print("running")
73 |
--------------------------------------------------------------------------------
/scripts/asm_status_wrapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import shutil
5 | import argparse
6 |
7 | STATSWRAPPER_TEMPLATE = '''{stats} \
8 | in={input_list} \
9 | minscaf={minscaf} > {output}'''
10 |
11 |
12 | class statswrapper:
13 | def __init__(self, input_list, minscaf, output):
14 | self.stats = shutil.which("statswrapper.sh")
15 | self.input_list = ",".join(input_list)
16 | self.minscaf = minscaf
17 | self.output = output
18 |
19 |
20 | def gen_shell(ilist, mlen, split, prefix, output):
21 | files = open(ilist, 'r').readlines()
22 | total = len(files)
23 | assert total >= split, "can't split"
24 | step = total // split
25 | m = total % split
26 | count = 0
27 | sub_files = []
28 | cmds = []
29 | for i in range(0, total, step):
30 | count += 1
31 | if count <= split:
32 | sub_files = [f.strip() for f in files[i:(i + step)]]
33 | output_ = "%s.%d.tsv" % (prefix, count)
34 | cmd = STATSWRAPPER_TEMPLATE.format_map(
35 | vars(statswrapper(sub_files, mlen, output_)))
36 | cmds.append(cmd)
37 |
38 | if (count > split) and (m > 0):
39 | sub_files += [f.strip() for f in files[(total - m):total]]
40 | output_ = "%s.%d.tsv" % (prefix, split)
41 | cmd = STATSWRAPPER_TEMPLATE.format_map(
42 | vars(statswrapper(sub_files, mlen, output_)))
43 | cmds[split - 1] = cmd
44 |
45 |     oh = output if output is sys.stdout else open(output, 'w')
46 |     for cmd in cmds:
47 |         oh.write(cmd + "\n")
48 |
49 |
50 | def main():
51 | parser = argparse.ArgumentParser("assembler status wrapper")
52 | parser.add_argument('-l', '--list', type=str, help='input assembly file list')
53 | parser.add_argument('-m', '--min_len', type=int, default=0, help='minimal contig/scaffold length')
54 | parser.add_argument('-s', '--split', type=int, default=1, help='split input file')
55 | parser.add_argument('-p', '--prefix', type=str, default="asm_stats", help="assembly status output prefix")
56 | parser.add_argument('-o', '--output', type=str, default=sys.stdout, help='write cmd to file, default: stdout')
57 | args = parser.parse_args()
58 |
59 | gen_shell(args.list, args.min_len, args.split, args.prefix, args.output)
60 |
61 |
62 | if __name__ == '__main__':
63 | main()
64 |
--------------------------------------------------------------------------------
/scripts/cut_up_fasta_concoct.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """
4 | This script comes from CONCOCT,
5 | adapted to support Python 3.
6 | Cut up fasta file in non-overlapping or overlapping parts of equal length.
7 | """
8 | import argparse
9 | from Bio import SeqIO
10 | import gzip
11 |
12 | def cut_up_fasta(fastfiles, chunk_size, overlap, merge_last):
13 | for ff in fastfiles:
14 | if ff.strip().endswith(".gz"):
15 | fa_handle = gzip.open(ff.strip(), 'rt')
16 | else:
17 | fa_handle = open(ff.strip(), 'r')
18 | for record in SeqIO.parse(fa_handle, "fasta"):
19 | if (not merge_last and len(record.seq) > chunk_size) or (merge_last and len(record.seq) >= 2 * chunk_size):
20 | i = 0
21 | for split_seq in chunks(record.seq, chunk_size, overlap, merge_last):
22 | print(">%s.%i\n%s" % (record.id, i, split_seq))
23 | i = i + 1
24 | else:
25 | print(">%s\n%s" % (record.id, record.seq))
26 |
27 |
28 | def chunks(l, n, o, merge_last):
29 | """ Yield successive n-sized chunks from l with given overlap o between the
30 | chunks.
31 | """
32 | assert n > o
33 |
34 | if not merge_last:
35 | for i in range(0, len(l), n - o):
36 | yield l[i:i + n]
37 | else:
38 | for i in range(0, len(l) - n + 1, n - o):
39 | yield l[i:i + n] if i + n + n - o <= len(l) else l[i:]
40 |
41 |
42 | if __name__ == "__main__":
43 | parser = argparse.ArgumentParser(description=__doc__,
44 | formatter_class=argparse.RawDescriptionHelpFormatter)
45 | parser.add_argument(
46 | "contigs", nargs="+", help="Fasta files with contigs\n")
47 | parser.add_argument("-c", "--chunk_size", default=1999, type=int, help="Chunk size\n")
48 | parser.add_argument("-o", "--overlap_size", default=1900, type=int, help="Overlap size\n")
49 | parser.add_argument("-m", "--merge_last", default=False, action="store_true", help="Concatenate final part to last contig\n")
50 | args = parser.parse_args()
51 | cut_up_fasta(args.contigs, args.chunk_size, args.overlap_size, args.merge_last)
52 |
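A toy check of the chunks() helper above, using a plain string instead of a parsed sequence record:

    pieces = list(chunks("A" * 10000, 1999, 1900, merge_last=True))
    print(len(pieces), len(pieces[0]), len(pieces[-1]))
    # every piece is 1999 bp long except the last one, which absorbs the remainder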
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | from setuptools import setup
5 |
6 | exec(open("metapi/__about__.py").read())
7 |
8 | if sys.argv[-1] == "publish":
9 | os.system("python setup.py sdist upload")
10 | sys.exit()
11 |
12 | with open("README.md") as f:
13 | long_description = f.read()
14 |
15 | packages = ["metapi"]
16 | package_data = {
17 | "metapi": [
18 | "metapi/config/*.yaml",
19 | "metapi/envs/*.yaml",
20 | "metapi/snakefiles/*.smk",
21 | "metapi/rules/*.smk",
22 | "metapi/wrappers/*.py",
23 | "metapi/data/*",
24 | "metapi/*.py",
25 | ]
26 | }
27 | data_files = [(".", ["LICENSE", "README.md"])]
28 |
29 | entry_points = {"console_scripts": ["metapi=metapi.corer:main"]}
30 |
31 | requires = [
32 | req.strip()
33 | for req in open("requirements.txt", "r").readlines()
34 | if not req.startswith("#")
35 | ]
36 |
37 | classifiers = [
38 | "Development Status :: 3 - Alpha",
39 | "Environment :: Console",
40 | "Intended Audience :: Developers",
41 | "Intended Audience :: Science/Research",
42 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
43 | "Natural Language :: English",
44 | "Operating System :: OS Independent",
45 | "Programming Language :: Python :: 3.7",
46 | "Programming Language :: Python :: 3.8",
47 | "Programming Language :: Python :: 3.9",
48 | "Programming Language :: Python :: 3.10",
49 | "Topic :: Scientific/Engineering :: Bio-Informatics",
50 | ]
51 |
52 | setup(
53 | name="metapi",
54 | version=__version__,
55 | author=__author__,
56 | author_email="alienchuj@gmail.com",
57 | url="https://github.com/ohmeta/metapi",
58 | description="a pipeline to construct a genome catalogue from metagenomics data",
59 | long_description_content_type="text/markdown",
60 | long_description=long_description,
61 | entry_points=entry_points,
62 | packages=packages,
63 | package_data=package_data,
64 | data_files=data_files,
65 | include_package_data=True,
66 | install_requires=requires,
67 | license="GPLv3+",
68 | classifiers=classifiers,
69 | )
70 |
--------------------------------------------------------------------------------
/metapi/profiles/lsf/lsf_config.py:
--------------------------------------------------------------------------------
1 | import shlex
2 | from collections import OrderedDict
3 | from itertools import chain
4 | from typing import TextIO, Union, List, Any, Dict
5 |
6 | import yaml
7 |
8 |
9 | class Config:
10 | def __init__(self, data: Union[dict, None] = None):
11 | self._data = dict()
12 | if data is not None:
13 | for key, value in data.items():
14 | self._data[key] = self.concatenate_params(value)
15 |
16 | def __bool__(self) -> bool:
17 | return bool(self._data)
18 |
19 | def __contains__(self, item) -> bool:
20 | return item in self._data
21 |
22 | def get(self, key: str, default: Any = None) -> Any:
23 | return self._data.get(key, default)
24 |
25 | @staticmethod
26 | def args_to_dict(args: str) -> Dict[str, str]:
27 | """Converts a string into a dictionary where key/value pairs are consecutive
28 | elements of the string.
29 | Eg '-J "2" -q 3' --> {'-J': '2', '-q': '3'}
30 | """
31 | args_iter = shlex.shlex(args, posix=True)
32 | args_iter.whitespace_split = True
33 | return OrderedDict(zip(args_iter, args_iter))
34 |
35 | @staticmethod
36 | def concatenate_params(params: Union[List[str], str]) -> str:
37 | if isinstance(params, str):
38 | return params
39 | return " ".join(filter(None, params))
40 |
41 | def default_params(self) -> str:
42 | return self.get("__default__", "")
43 |
44 | def params_for_rule(self, rulename: str) -> str:
45 | """Loads default + rule-specific arguments.
46 | Arguments specified for a rule override default-specified arguments.
47 | Shlex-joining is required to properly pass quoted escapes in yaml
48 | to the shell.
49 | """
50 | default_params = self.args_to_dict(self.default_params())
51 | rule_params = self.args_to_dict(self.get(rulename, ""))
52 | default_params.update(rule_params)
53 | return " ".join(map(shlex.quote, chain.from_iterable(default_params.items())))
54 |
55 | @staticmethod
56 | def from_stream(stream: TextIO) -> "Config":
57 | data = yaml.safe_load(stream)
58 | return Config(data)
59 |
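A short sketch of how Config merges default and per-rule bsub arguments; the YAML content below is hypothetical:

    import io

    cfg = Config.from_stream(io.StringIO(
        '__default__: "-q normal -n 1"\n'
        'assembly: "-q big_mem -n 8"\n'))

    print(cfg.params_for_rule("assembly"))  # -> -q big_mem -n 8 (rule-specific values override defaults)
    print(cfg.params_for_rule("trimming"))  # -> -q normal -n 1 (falls back to __default__ only)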
--------------------------------------------------------------------------------
/metapi/profiles/slurm/slurm-submit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Snakemake SLURM submit script.
4 | """
5 | from snakemake.utils import read_job_properties
6 |
7 | import slurm_utils
8 | from CookieCutter import CookieCutter
9 |
10 | # cookiecutter arguments
11 | SBATCH_DEFAULTS = CookieCutter.SBATCH_DEFAULTS
12 | CLUSTER = CookieCutter.get_cluster_option()
13 | CLUSTER_CONFIG = CookieCutter.CLUSTER_CONFIG
14 | ADVANCED_ARGUMENT_CONVERSION = CookieCutter.get_advanced_argument_conversion()
15 |
16 | RESOURCE_MAPPING = {
17 | "time": ("time", "runtime", "walltime"),
18 | "mem": ("mem", "mem_mb", "ram", "memory"),
19 | "mem-per-cpu": ("mem-per-cpu", "mem_per_cpu", "mem_per_thread"),
20 | "nodes": ("nodes", "nnodes"),
21 | }
22 |
23 | # parse job
24 | jobscript = slurm_utils.parse_jobscript()
25 | job_properties = read_job_properties(jobscript)
26 |
27 | sbatch_options = {}
28 | cluster_config = slurm_utils.load_cluster_config(CLUSTER_CONFIG)
29 |
30 | # 1) sbatch default arguments and cluster
31 | sbatch_options.update(slurm_utils.parse_sbatch_defaults(SBATCH_DEFAULTS))
32 | sbatch_options.update(slurm_utils.parse_sbatch_defaults(CLUSTER))
33 |
34 | # 2) cluster_config defaults
35 | sbatch_options.update(cluster_config["__default__"])
36 |
37 | # 3) Convert resources (no unit conversion!) and threads
38 | sbatch_options.update(
39 | slurm_utils.convert_job_properties(job_properties, RESOURCE_MAPPING)
40 | )
41 |
42 | # 4) cluster_config for particular rule
43 | sbatch_options.update(cluster_config.get(job_properties.get("rule"), {}))
44 |
45 | # 5) cluster_config options
46 | sbatch_options.update(job_properties.get("cluster", {}))
47 |
48 | # 6) Advanced conversion of parameters
49 | if ADVANCED_ARGUMENT_CONVERSION:
50 | sbatch_options = slurm_utils.advanced_argument_conversion(sbatch_options)
51 |
52 | # 7) Format pattern in snakemake style
53 | sbatch_options = slurm_utils.format_values(sbatch_options, job_properties)
54 |
55 | # ensure sbatch output dirs exist
56 | for o in ("output", "error"):
57 | slurm_utils.ensure_dirs_exist(sbatch_options[o]) if o in sbatch_options else None
58 |
59 | # submit job and echo id back to Snakemake (must be the only stdout)
60 | print(slurm_utils.submit_job(jobscript, **sbatch_options))
61 |
--------------------------------------------------------------------------------
/scripts/aggregate_genomecov.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 |
4 | def aggregate(cov):
5 | '''
6 | bedtools genomecov -ibam sample.mapped.sorted.bam -g contigs_c10K.len > sample_cov.txt
7 | produce a histogram of coverage of the exons throughout the genome
8 |
9 | output format explain:
10 | 1. chromosome(or entire genome)
11 | 2. depth of coverage from features in input file
12 | 3. number of bases on chromosome(or genome) with depth equal to column 2
13 | 4. size of chromosome(or entire genome) in base pairs
14 |     5. fraction of bases on chromosome(or entire genome) with depth equal to column 2
15 | so column_5 = column_3 / column_4
16 | all sum(column_3{column_1}) = column_4{column_1}
17 | all sum(column_5{column_1}) = 1
18 |
19 | k119_2 1 30 399 0.075188
20 | k119_2 2 27 399 0.0676692
21 | k119_2 3 151 399 0.378446
22 | k119_2 4 79 399 0.197995
23 | k119_2 5 54 399 0.135338
24 | k119_2 6 39 399 0.0977444
25 | k119_2 7 19 399 0.047619
26 | k119_3 0 387 473 0.818182
27 | k119_3 1 86 473 0.181818
28 | k119_4 4 1 340 0.00294118
29 | '''
30 | with open(cov, 'r') as in_handle:
31 | cov_num = {}
32 | chr_len = {}
33 | chr_list = []
34 | for line in in_handle:
35 | chr, depth, num, len, frac = line.strip().split('\t')
36 | if chr not in chr_len:
37 | chr_len[chr] = int(len)
38 | cov_num[chr] = int(depth) * int(num)
39 | chr_list.append(chr)
40 | else:
41 | cov_num[chr] += int(depth) * int(num)
42 | for chr_name in chr_list:
43 | print("%s,%f" % (chr_name, cov_num[chr_name] / chr_len[chr_name]))
44 |
45 | def main():
46 | parser = argparse.ArgumentParser(description='aggregate the output of bedtools')
47 | parser.add_argument('-cov', type=str, help='input coverage file')
48 | args = parser.parse_args()
49 |
50 | aggregate(args.cov)
51 |
52 | if __name__ == '__main__':
53 | main()
54 |
55 | # awk -F'\t' '{l[$1]=l[$1]+($2*$3);r[$1]=$4} END {for (i in l){print i","(l[i]/r[i])}}'
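The same aggregation expressed with pandas, as a sketch mirroring the awk one-liner above; sample_cov.txt is the bedtools genomecov histogram described in the docstring:

    import pandas as pd

    cov = pd.read_csv("sample_cov.txt", sep="\t", header=None,
                      names=["chr", "depth", "num_bases", "chr_len", "fraction"])
    # total covered bases per chromosome divided by chromosome length = mean depth
    covered = (cov["depth"] * cov["num_bases"]).groupby(cov["chr"]).sum()
    mean_depth = covered / cov.groupby("chr")["chr_len"].first()
    print(mean_depth)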
--------------------------------------------------------------------------------
/metapi/profiles/sge/sge-status.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import re
3 | import subprocess as sp
4 | import shlex
5 | import sys
6 | import time
7 | import logging
8 |
9 | logger = logging.getLogger("__name__")
10 | logger.setLevel(40)
11 |
12 | STATUS_ATTEMPTS = 20
13 |
14 | jobid = int(sys.argv[1])
15 | job_status = "running"
16 |
17 | # WARNING this currently has no support for task array jobs
18 |
19 | for i in range(STATUS_ATTEMPTS):
20 | # first try qstat to see if job is running
21 | # we can use `qstat -s pr -u "*"` to check for all running and pending jobs
22 | try:
23 | qstat_res = sp.check_output(shlex.split(f"qstat -s pr")).decode().strip()
24 |
25 | # skip the header using [2:]
26 | res = {
27 | int(x.split()[0]) : x.split()[4] for x in qstat_res.splitlines()[2:]
28 | }
29 |
30 | # job is in an unspecified error state
31 | if "E" in res[jobid]:
32 | job_status = "failed"
33 | break
34 |
35 | job_status = "running"
36 | break
37 |
38 | except sp.CalledProcessError as e:
39 | logger.error("qstat process error")
40 | logger.error(e)
41 | except KeyError as e:
42 | # if the job has finished it won't appear in qstat and we should check qacct
43 | # this will also provide the exit status (0 on success, 128 + exit_status on fail)
44 |         # the job no longer appears in qstat, so query the accounting database
45 | try:
46 | qacct_res = sp.check_output(shlex.split(f"qacct -j {jobid}"))
47 |
48 | exit_code = int(re.search("exit_status ([0-9]+)", qacct_res.decode()).group(1))
49 |
50 | if exit_code == 0:
51 | job_status = "success"
52 | break
53 |
54 | if exit_code != 0:
55 | job_status = "failed"
56 | break
57 |
58 | except sp.CalledProcessError as e:
59 | logger.warning("qacct process error")
60 | logger.warning(e)
61 | if i >= STATUS_ATTEMPTS - 1:
62 | job_status = "failed"
63 | break
64 | else:
65 | # qacct can be quite slow to update on large servers
66 | time.sleep(5)
67 | pass
68 |
69 | print(job_status)
70 |
--------------------------------------------------------------------------------
/scripts/contigs_from_sample.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import os
4 | from Bio.SeqIO.FastaIO import SimpleFastaParser
5 |
6 | def contigs_from_sample(contigs_len, sc_out):
7 | info = {}
8 | #count = 0
9 | with open(contigs_len, 'r') as handle:
10 | for line in handle:
11 | key = '_'.join(line.split("_")[:3])
12 | len = int(line.split("\t")[-1])
13 | if key not in info:
14 | info[key] = {}
15 | info[key]["num"] = 1
16 | info[key]["len"] = len
17 | else:
18 | info[key]["num"] += 1
19 | info[key]["len"] += len
20 | #count += 1
21 | #if count == 10000:
22 | # break
23 | with open(sc_out, 'w') as out:
24 | out.write("sample_name\ttotal_contigs_num\ttotal_contigs_len\n")
25 | for key in info:
26 | out.write(key + "\t" + str(info[key]["num"]) + "\t" +
27 | str(info[key]["len"]) + "\n")
28 |
29 | def contigs_from_sample_list(contigs_list, sc_out):
30 | info = {}
31 | with open(contigs_list, 'r') as contigs_handle:
32 | for contigs_path in contigs_handle:
33 | key = os.path.basename(contigs_path.strip()).split(".")[0]
34 | if key not in info:
35 | info[key] = {}
36 | info[key]["num"] = 0
37 | info[key]["num_gt2kb"] = 0
38 | info[key]["len"] = 0
39 | info[key]["len_gt2kb"] = 0
40 | with open(contigs_path.strip(), 'r') as contigs_fa:
41 | for title, seq in SimpleFastaParser(contigs_fa):
42 | info[key]["num"] += 1
43 | info[key]["len"] += len(seq)
44 | if len(seq) >= 2000:
45 | info[key]["num_gt2kb"] += 1
46 | info[key]["len_gt2kb"] += len(seq)
47 | with open(sc_out, 'w') as out:
48 | out.write("sample_name\ttotal_contigs_num\ttotal_contigs_num_gt2kb\ttotal_contigs_len\ttotal_contigs_len_gt2kb\n")
49 | for key in info:
50 | out.write("%s\t%d\t%d\t%d\t%d\n" % (key, info[key]["num"], info[key]["num_gt2kb"], info[key]["len"], info[key]["len_gt2kb"]))
51 |
52 | def main():
53 | #contigs_from_sample(sys.argv[1], sys.argv[2])
54 | contigs_from_sample_list(sys.argv[1], sys.argv[2])
55 |
56 |
57 | if __name__ == '__main__':
58 | main()
59 |
--------------------------------------------------------------------------------
/metapi/profiles/lsf/OSLayer.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import sys
3 | import time
4 | import uuid
5 | from pathlib import Path
6 | from typing import Tuple, List
7 |
8 | if not __name__.startswith("tests.src."):
9 | sys.path.append(str(Path(__file__).parent.absolute()))
10 | from CookieCutter import CookieCutter
11 | else:
12 | from .CookieCutter import CookieCutter
13 |
14 | stdout = str
15 | stderr = str
16 |
17 |
18 | class TailError(Exception):
19 | pass
20 |
21 |
22 | class OSLayer:
23 | """
24 | This class provides an abstract layer to communicating with the OS.
25 | Its main purpose is to enable OS operations mocking, so we don't actually need to
26 | make file operations or create processes.
27 | """
28 |
29 | @staticmethod
30 | def mkdir(directory: Path):
31 | directory.mkdir(parents=True, exist_ok=True)
32 |
33 | @staticmethod
34 | def remove_file(file: Path):
35 | if file.is_file():
36 | file.unlink()
37 |
38 | @staticmethod
39 | def run_process(cmd: str) -> Tuple[stdout, stderr]:
40 | completed_process = subprocess.run(
41 | cmd, check=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
42 | )
43 | return (
44 | completed_process.stdout.decode().strip(),
45 | completed_process.stderr.decode().strip(),
46 | )
47 |
48 | @staticmethod
49 | def print(string: str):
50 | print(string)
51 |
52 | @staticmethod
53 | def get_uuid4_string() -> str:
54 | return str(uuid.uuid4())
55 |
56 | @staticmethod
57 | def tail(path: str, num_lines: int = 10) -> List[bytes]:
58 | if not Path(path).exists():
59 | # allow for filesystem latency
60 | time.sleep(CookieCutter.get_latency_wait())
61 | if not Path(path).exists():
62 | raise FileNotFoundError("{} does not exist.".format(path))
63 |
64 | process = subprocess.Popen(
65 | ["tail", "-n", str(num_lines), path],
66 | stdout=subprocess.PIPE,
67 | stderr=subprocess.PIPE,
68 | )
69 | exit_code = process.wait()
70 | if exit_code != 0:
71 | raise TailError(
72 | "Failed to execute the tail command on the file {} due to the "
73 | "following error:\n{}".format(path, process.stderr.read().decode())
74 | )
75 | return process.stdout.readlines()
76 |
--------------------------------------------------------------------------------
/metapi/wrappers/simulate_reads.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import sys
5 | import gzip
6 | import subprocess
7 | from Bio import SeqIO
8 |
9 |
10 | def simulate_short_reads(
11 | genomes, output_prefix, r1, r2, abunf, model, reads_num, abundance, threads, logf,
12 | ):
13 | if len(abundance) != 0:
14 | with open(abunf, "w") as outh:
15 | for (g, a) in zip(genomes, abundance):
16 | inh = gzip.open(g, "rt") if g.endswith(".gz") else open(g, "r")
17 | genome = []
18 | total_len = 0
19 | for record in SeqIO.parse(inh, "fasta"):
20 | total_len += len(record.seq)
21 | genome.append((record.id, len(record.seq)))
22 | for s in genome:
23 | outh.write("%s\t%f\n" %
24 | (s[0], float(a) * s[1] / total_len))
25 | inh.close()
26 |
27 | args = (
28 | ["iss", "generate", "--cpus", str(threads), "--genomes"]
29 | + genomes
30 | + ["--n_reads", reads_num, "--model", model, "--output", output_prefix]
31 | )
32 |
33 | if len(abundance) != 0:
34 | args += ["--abundance_file", abunf]
35 | print(" ".join(args))
36 | env = os.environ.copy()
37 | proc = subprocess.Popen(
38 | args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, encoding="utf-8",
39 | )
40 | output, error = proc.communicate()
41 |
42 | with open(logf, "w") as logh:
43 | logh.write(error)
44 |
45 | if proc.returncode == 0:
46 | if len(abundance) == 0:
47 | default_abunf = output_prefix + "_abundance.txt"
48 | if os.path.exists(default_abunf):
49 | os.rename(default_abunf, abunf)
50 | subprocess.run(f"pigz -p {threads} {output_prefix}_R1.fastq", shell=True)
51 | subprocess.run(f"pigz -p {threads} {output_prefix}_R2.fastq", shell=True)
52 | os.rename(f"{output_prefix}_R1.fastq.gz", r1)
53 | os.rename(f"{output_prefix}_R2.fastq.gz", r2)
54 | else:
55 | sys.exit(1)
56 |
57 |
58 | simulate_short_reads(
59 | snakemake.input["genomes"],
60 | snakemake.params["output_prefix"],
61 | snakemake.output["r1"],
62 | snakemake.output["r2"],
63 | snakemake.output["abunf"],
64 | snakemake.params["model"],
65 | snakemake.params["reads_num"],
66 | snakemake.params["abundance"],
67 | snakemake.threads,
68 | str(snakemake.log))
69 |
70 |
71 |
--------------------------------------------------------------------------------
/scripts/fastq_contig_size.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import os
4 | import csv
5 |
6 | def gen_size_tsv(fqlist, ctglist, tsvout):
7 | '''gen data size tsv out'''
8 | fq_size = {}
9 | ctg_size = {}
10 | file_size = {}
11 | file_size["header"] = ["fq_1", "fq_2", "fq_s", "contig", "sample_name"]
12 | file_size["body"] = []
13 |
14 | with open(fqlist, 'r') as fq_handle, open(ctglist, 'r') as ctg_handle:
15 | for (fq_line, ctg_line) in zip(fq_handle, ctg_handle):
16 | (reads_a, reads_b, reads_s) = fq_line.strip().split()
17 | fq_name = os.path.basename(reads_a).split('.')[0]
18 |             ctg_name = os.path.basename(ctg_line.strip()).split('.')[0]
19 | if fq_name not in fq_size:
20 | fq_size[fq_name] = {}
21 | fq_size[fq_name]["fq_1"] = os.path.getsize(reads_a)
22 | fq_size[fq_name]["fq_2"] = os.path.getsize(reads_b)
23 | fq_size[fq_name]["fq_s"] = os.path.getsize(reads_s)
24 | if ctg_name not in ctg_size:
25 | ctg_size[ctg_name] = {}
26 | ctg_size[ctg_name] = os.path.getsize(ctg_line.strip())
27 |
28 | assert sorted(fq_size.keys()) == sorted(ctg_size.keys())
29 |
30 | for key in ctg_size:
31 | file_size_ = {}
32 | file_size_["sample_name"] = key
33 | file_size_["fq_1"] = fq_size[key]["fq_1"]
34 | file_size_["fq_2"] = fq_size[key]["fq_2"]
35 | file_size_["fq_s"] = fq_size[key]["fq_s"]
36 | file_size_["contig"] = ctg_size[key]
37 | file_size["body"].append(file_size_)
38 |
39 | with open(tsvout, 'w') as out_handle:
40 | f_tsv = csv.DictWriter(out_handle, file_size["header"], delimiter='\t')
41 | f_tsv.writeheader()
42 | f_tsv.writerows(file_size["body"])
43 |
44 |
45 | def main():
46 | '''main function'''
47 | parser = argparse.ArgumentParser(
48 | description='''research relationships between fastq size and contigs size:
49 | Usage: python fastq_contig_size_relationship.py --fqlist ./212S_rmhost_fqgz.pathlist.paired --ctglist ./212S_assembly_contigs.pathlist --tsvout fq_contigs_size.ts
50 | ''')
51 | parser.add_argument('--fqlist', type=str,
52 | help='rmhost fastq file path list')
53 | parser.add_argument('--ctglist', type=str,
54 | help='contigs file path list')
55 | parser.add_argument('--tsvout', type=str,
56 | help='tsv out put')
57 | args = parser.parse_args()
58 | gen_size_tsv(args.fqlist, args.ctglist, args.tsvout)
59 |
60 |
61 | if __name__ == '__main__':
62 | main()
63 |
--------------------------------------------------------------------------------
/metapi/profiles/generic/scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 |
4 | import sys, os
5 | from subprocess import Popen, PIPE
6 | import yaml
7 |
8 |
9 | def eprint(*args, **kwargs):
10 | print(*args, file=sys.stderr, **kwargs)
11 |
12 |
13 | # let snakemake read job_properties
14 | from snakemake.utils import read_job_properties
15 |
16 |
17 |
18 | jobscript = sys.argv[1]
19 | job_properties = read_job_properties(jobscript)
20 |
21 | #default paramters defined in cluster_spec (accessed via snakemake read_job_properties)
22 | cluster_param= job_properties["cluster"]
23 |
24 | if job_properties["type"]=='single':
25 | cluster_param['name'] = job_properties['rule']
26 | elif job_properties["type"]=='group':
27 | cluster_param['name'] = job_properties['groupid']
28 | else:
29 | raise NotImplementedError(f"Don't know what to do with job_properties['type']=={job_properties['type']}")
30 |
31 |
32 | # don't overwrite default parameters if defined in rule (or config file)
33 | if ('threads' in job_properties) and ('threads' not in cluster_param):
34 | cluster_param["threads"] = job_properties["threads"]
35 | for res in ['time','mem']:
36 | if (res in job_properties["resources"]) and (res not in cluster_param):
37 | cluster_param[res] = job_properties["resources"][res]
38 |
39 | # convert time from hours to minutes
40 | if "time" in cluster_param:
41 |     cluster_param["time"] = int(float(cluster_param["time"]) * 60)
42 |
43 |
44 | # check which system you are on and load command command_options
45 | key_mapping_file=os.path.join(os.path.dirname(__file__),"key_mapping.yaml")
46 | command_options=yaml.load(open(key_mapping_file),
47 | Loader=yaml.BaseLoader)
48 | system= command_options['system']
49 | command= command_options[system]['command']
50 |
51 | key_mapping= command_options[system]['key_mapping']
52 |
53 | # construct command:
54 | for key in key_mapping:
55 | if key in cluster_param:
56 | command+=" "
57 | command+=key_mapping[key].format(cluster_param[key])
58 |
59 | command+=' {}'.format(jobscript)
60 |
61 | eprint("submit command: "+command)
62 |
63 | p = Popen(command.split(' '), stdout=PIPE, stderr=PIPE)
64 | output, error = p.communicate()
65 | if p.returncode != 0:
66 | raise Exception("Job can't be submitted\n"+output.decode("utf-8")+error.decode("utf-8"))
67 | else:
68 | res= output.decode("utf-8")
69 |
70 | if system=='lsf':
71 | import re
72 | match = re.search(r"Job <(\d+)> is submitted", res)
73 | jobid = match.group(1)
74 |
75 | elif system=='pbs':
76 | jobid= res.strip().split('.')[0]
77 |
78 | else:
79 | jobid= int(res.strip().split()[-1])
80 |
81 | print(jobid)
82 |
--------------------------------------------------------------------------------
/scripts/split_fx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # please see http://biopython.org/wiki/Split_large_file
3 | import argparse
4 | import os
5 | import errno
6 |
7 | from Bio import SeqIO
8 |
9 |
10 | def batch_iterator(iterator, batch_size):
11 | """Returns lists of length batch_size.
12 |
13 | This can be used on any iterator, for example to batch up
14 | SeqRecord objects from Bio.SeqIO.parse(...), or to batch
15 | Alignment objects from Bio.AlignIO.parse(...), or simply
16 | lines from a file handle.
17 |
18 | This is a generator function, and it returns lists of the
19 | entries from the supplied iterator. Each list will have
20 | batch_size entries, although the final list may be shorter.
21 | """
22 |
23 | entry = True
24 | while entry:
25 | batch = []
26 | while len(batch) < batch_size:
27 | try:
28 | # entry = iterator.next()
29 | entry = next(iterator)
30 | except StopIteration:
31 | entry = None
32 | if entry is None:
33 | break
34 | batch.append(entry)
35 | if batch:
36 | yield batch
37 |
38 |
39 | # TODO
40 | # def split_fastq()
41 | # def split_alignment()
42 |
43 |
44 | def split_fasta(fa_file, batch_size, outdir, onedir):
45 | record_iter = SeqIO.parse(open(fa_file, 'r'), "fasta")
46 | i = 0
47 | for i, batch in enumerate(batch_iterator(record_iter, batch_size), start = 1):
48 | if onedir:
49 | splitfa = os.path.join(outdir, "split_%i.fa" % (i))
50 | else:
51 | splitdir = os.path.join(outdir, "split_" + str(i))
52 | try:
53 | os.makedirs(splitdir)
54 | except OSError as e:
55 | if e.errno != errno.EEXIST:
56 | raise
57 | splitfa = os.path.join(splitdir, "split_%i.fa" % (i))
58 |
59 | with open(splitfa, 'w') as out_h:
60 | count = SeqIO.write(batch, out_h, "fasta")
61 | print("wrote %i records to %s" % (count, splitfa))
62 | return i
63 |
64 |
65 | def main():
66 | """split large fasta/fastq file by seq size"""
67 | parser = argparse.ArgumentParser(description='split large fasta/fastq file by seq size')
68 | parser.add_argument('-f', type=str, help='input file, a large fasta or fastq file')
69 | parser.add_argument('-n', type=int, help='each splited file base size', default=1000)
70 | parser.add_argument('-outdir', type=str, help='a directory store splited file')
71 |
72 | args = parser.parse_args()
73 | split_fasta(args.f, args.n, args.outdir, False)
74 |
75 | if __name__ == '__main__':
76 | main()
--------------------------------------------------------------------------------
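A minimal sketch of the split_fastq() left as a TODO in scripts/split_fx.py above, reusing its batch_iterator; the split_%i.fq naming is only assumed to mirror split_fasta, and Biopython must be installed:

    import os
    from Bio import SeqIO
    from split_fx import batch_iterator

    def split_fastq(fq_file, batch_size, outdir):
        """write batch_size records per split_<i>.fq file and return the number of chunks"""
        record_iter = SeqIO.parse(open(fq_file, 'r'), "fastq")
        i = 0
        for i, batch in enumerate(batch_iterator(record_iter, batch_size), start=1):
            split_fq = os.path.join(outdir, "split_%i.fq" % i)
            with open(split_fq, 'w') as out_h:
                count = SeqIO.write(batch, out_h, "fastq")
                print("wrote %i records to %s" % (count, split_fq))
        return i
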
/scripts/megahit_hadoop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Author: liuxing
3 | # Email: liuxing2@genomics.cn
4 |
5 | if [[ $# -ne 8 ]];then
6 | echo
7 | echo "usage: $0 -l FastaqFileList -o OutputDirPath -d HdfsOutputPath -n NumberOfTasks
8 | -l fastaq file list, please make a list including all fastq file, path one sample per line
9 | and seperate the read1 read2 and singleRead with space or table e.g: read1.fq read2.fq singleRead.fq
10 | -o output directory path, the directory that you would write the run script and assembly result
11 | -d HDFS output path, e.g: /user/liuxing2/megahitout
12 | -n the number of tasks, equal to the number of the samples "
13 | echo
14 | else
15 | while [[ -n "$1" ]]
16 | do
17 | case "$1" in
18 | -l) fqfilelist="$2"
19 | shift ;;
20 | -o) outpath="$2"
21 | shift ;;
22 | -d) dfsoutpath="$2"
23 | shift ;;
24 | -n) maps="$2"
25 | shift ;;
26 | esac
27 | shift
28 | done
29 | if [[ ! -d $outpath ]];then
30 | mkdir $outpath
31 | fi
32 |
33 | echo "while read LINE
34 | do
35 | if [[ -n \$LINE ]];then
36 | echo \$LINE;
37 | read1=\`echo \$LINE| awk '{print \$2}'\`
38 | read2=\`echo \$LINE| awk '{print \$3}'\`
39 | reads=\`echo \$LINE| awk '{print \$4}'\`
40 | base=\`basename \$read1\`
41 | prefix=\${base%%.*}
42 | outputfilename=\${prefix}.megahit_asm
43 | /hwfssz1/ST_META/CD/zhujie/program/bioenv/bin/megahit -1 \$read1 -2 \$read2 -r \$reads -o ${outpath}/\$outputfilename --out-prefix \$prefix
44 | fi
45 | done" >${outpath}/megahit.sh
46 |
47 | echo "/hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/bin/hadoop fs -rm -r -skipTrash $dfsoutpath
48 | /hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/bin/hadoop jar /hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/jars/hadoop-streaming-2.6.0-cdh5.11.1.jar -D mapreduce.job.name=\"megahit\" -D mapreduce.job.maps=$maps -D mapreduce.job.reduces=0 -D mapreduce.map.memory.mb=25600 -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat -input file:$fqfilelist -output $dfsoutpath -mapper \"sh megahit.sh\" -file ${outpath}/megahit.sh
49 |
50 | /hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/bin/hadoop fs -rm -r -skipTrash $dfsoutpath" >${outpath}/megahit_hadoopsubmit.sh
51 | fi
52 |
53 |
--------------------------------------------------------------------------------
/scripts/get_bin_id_by_ccsh.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import csv
4 | import os
5 | import re
6 |
7 |
8 | def get_bin_id(checkmout_list, out_tsv, completeness, contamination):
9 | headers = [
10 | "sample_id", "bin_id", "marker_lineage", "genomes", "markers",
11 | "marker_sets", "completeness", "contamination", "strain_heterogeneity"
12 | ]
13 | samples_bin_info = []
14 | with open(checkmout_list, "r") as list_handle:
15 | for checkmout in list_handle:
16 | with open(checkmout.strip(), 'r') as checkmout_handle:
17 | print("processing %s" % checkmout.strip())
18 | sample_id = os.path.basename(checkmout.strip()).split('.')[0]
19 | next(checkmout_handle)
20 | next(checkmout_handle)
21 | next(checkmout_handle)
22 | for info in checkmout_handle:
23 | if info.strip().startswith("R0"):
24 | info_l = re.split(r'\s+', info.strip())
25 | if (float(info_l[-2]) < contamination) and (float(
26 | info_l[-3]) > completeness):
27 | bin_info = {}
28 | bin_info['sample_id'] = sample_id
29 | bin_info["bin_id"] = info_l[0]
30 | bin_info[
31 | "marker_lineage"] = info_l[1] + " " + info_l[2]
32 | bin_info["genomes"] = info_l[3]
33 | bin_info["markers"] = info_l[4]
34 | bin_info["marker_sets"] = info_l[5]
35 | bin_info["completeness"] = info_l[-3]
36 | bin_info["contamination"] = info_l[-2]
37 | bin_info["strain_heterogeneity"] = info_l[-1]
38 | samples_bin_info.append(bin_info)
39 | with open(out_tsv, 'w') as out_handle:
40 | f_tsv = csv.DictWriter(out_handle, headers, delimiter="\t")
41 | f_tsv.writeheader()
42 | f_tsv.writerows(samples_bin_info)
43 |
44 |
45 | def main():
46 | parser = argparse.ArgumentParser(
47 | description='''get bin id by completeness cutoff and contamination
48 | cutoff''')
49 |     parser.add_argument('-l', type=str, help='file listing checkm output files of many samples, one path per line')
50 | parser.add_argument(
51 | '-o',
52 | type=str,
53 | help='bin id and completeness, contamination output file')
54 | parser.add_argument(
55 | '-c1', type=float, help='completeness cutoff', default=70.0)
56 | parser.add_argument(
57 | '-c2', type=float, help='contamination cutoff', default=30.0)
58 | args = parser.parse_args()
59 | get_bin_id(args.l, args.o, args.c1, args.c2)
60 |
61 |
62 | if __name__ == '__main__':
63 | main()
64 |
--------------------------------------------------------------------------------
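The filter in scripts/get_bin_id_by_ccsh.py keys on fixed columns of the checkm table output; a small sketch of the same cutoff logic applied to a single, made-up table row (the values are illustrative only):

    import re

    row = "R0123_bin.1  k__Bacteria (UID203)  5449  104  58  92.5  3.1  0.0"
    info_l = re.split(r'\s+', row.strip())
    completeness, contamination = float(info_l[-3]), float(info_l[-2])
    keep = (completeness > 70.0) and (contamination < 30.0)  # the -c1/-c2 defaults
    print(info_l[0], completeness, contamination, keep)  # -> R0123_bin.1 92.5 3.1 True
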
/scripts/insert_size_ploter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import pandas as pd
5 | from glob import glob
6 | import os
7 | import re
8 | from plotnine import *
9 |
10 |
11 | def parse_bam_stats(bam_stats_list):
12 | insert_size_df = pd.DataFrame()
13 | bam_stats_list_ = []
14 | if re.search(r'\*', bam_stats_list[0]):
15 | bam_stats_list_ = glob(bam_stats_list[0])
16 | else:
17 | bam_stats_list_ = bam_stats_list
18 |
19 |     for bam_stats_file in bam_stats_list_:
20 |         rows = []  # collect rows in a list; DataFrame.append was removed in pandas 2.0
21 |         sample_id = os.path.basename(bam_stats_file).split(".")[0]
22 |         with open(bam_stats_file, 'r') as ih:
23 |             for line in ih:
24 |                 if line.startswith("IS"):
25 |                     line_list = re.split(r'\s+', line.strip())
26 |                     rows.append({"sample_id": sample_id,
27 |                                  "insert_size": line_list[1],
28 |                                  "pairs_total": line_list[2],
29 |                                  "inward_oriented_pairs": line_list[3],
30 |                                  "outward_oriented_pairs": line_list[4],
31 |                                  "other_pairs": line_list[5]})
32 |         df = pd.DataFrame(rows, columns=["insert_size", "pairs_total",
33 |                                          "inward_oriented_pairs",
34 |                                          "outward_oriented_pairs",
35 |                                          "other_pairs", "sample_id"])
36 |         insert_size_df = pd.concat([insert_size_df, df])
37 |     return insert_size_df
38 |
39 |
40 | def plot_insert_size(insert_size_df, outpdf):
41 | df_l = insert_size_df.melt(id_vars=["insert_size", "sample_id"],
42 | value_vars=["pairs_total",
43 | "inward_oriented_pairs",
44 | "outward_oriented_pairs",
45 | "other_pairs"],
46 | var_name="type",
47 | value_name="count")
48 | is_plot = (ggplot(df_l, aes(x='insert_size', y='count'))
49 | + geom_point(aes(fill='type', colour='type'), size=0.2)
50 | + facet_wrap('~sample_id', scales='free')
51 | + ggtitle('insert size distribution'))
52 | is_plot.save(outpdf, width=16, height=16)
53 |
54 |
55 | def main():
56 | parser = argparse.ArgumentParser('plot insert size for samtools bamstats')
57 | parser.add_argument('-i', nargs='*', help='bamstats file list, separated by spaces')
58 | parser.add_argument('-o', type=str, help='insert size plot output, pdf format')
59 |
60 | args = parser.parse_args()
61 |
62 | df = parse_bam_stats(args.i)
63 | plot_insert_size(df, args.o)
64 |
65 |
66 | if __name__ == '__main__':
67 | main()
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
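parse_bam_stats() in scripts/insert_size_ploter.py keeps only the "IS" (insert size) rows of `samtools stats` output; a sketch of how one such row maps onto the DataFrame columns (the numbers are made up):

    import re

    line = "IS\t250\t10432\t10398\t20\t14"  # an illustrative samtools stats IS row
    line_list = re.split(r'\s+', line.strip())
    record = {"insert_size": line_list[1],             # 250
              "pairs_total": line_list[2],             # 10432
              "inward_oriented_pairs": line_list[3],   # 10398
              "outward_oriented_pairs": line_list[4],  # 20
              "other_pairs": line_list[5]}             # 14
    print(record)
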
/scripts/megahit_sge.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import shutil
3 | import os
4 | from datetime import datetime
5 | import argparse
6 |
7 | from asub import submit_job
8 |
9 | # TODO
10 | #def assembly(fqlist):
11 |
12 | def coassembly(fqlist, thread, outdir, prefix, queue, project, resource):
13 | r1 = []
14 | r2 = []
15 | with open(fqlist, 'r') as in_handle:
16 | for line in in_handle:
17 | fq_1, fq_2 = line.strip().split("\t")
18 | r1.append(os.path.abspath(fq_1))
19 | r2.append(os.path.abspath(fq_2))
20 | pe1 = ",".join(r1)
21 | pe2 = ",".join(r2)
22 | coasm_shell = "%s -1 %s -2 %s -t %d --out-dir %s --out-prefix %s\n" % (shutil.which("megahit"), pe1, pe2, thread, outdir, prefix)
23 | print(coasm_shell)
24 |
25 | with open("./megahit_coasm.sh", 'w') as sh_h:
26 | sh_h.write(coasm_shell)
27 | with open("./megahit_coasm_submit.sh", 'w') as sge_h:
28 | sge_h.write("qsub -cwd -q %s -P %s -l %s megahit_coasm.sh\n" % (queue, project, resource))
29 |
30 | '''
31 | jobname = "megahit_coasm" + "_" + datetime.now().strftime("%Y%m%d%H%M%S")
32 | logdir = jobname + "_qsub"
33 | if os.path.exists(logdir):
34 | os.remove(logdir)
35 | os.makedirs(logdir)
36 |
37 | jobfile = os.path.join(logdir, jobname + "_1.sh")
38 | with open(jobfile, 'w') as out_handle:
39 | out_handle.write(coasm_shell)
40 |
41 | submit_job(jobname, 1, queue, project, resource, logdir)
42 | '''
43 |
44 |
45 | def main():
46 | parser = argparse.ArgumentParser(description='using megahit to do assembly or coassembly')
47 | parser.add_argument('-asm', action='store_true', help='do assembly', default=False)
48 | parser.add_argument('-coasm', action='store_true', help='do coassembly', default=False)
49 | parser.add_argument('-fqlist', type=str, help='clean pair-ended reads, each line format: reads_1.fq.gz reads_2.fq.gz')
50 | parser.add_argument('-thread', type=int, help="number of CPU threads, at least 2 if GPU enabled. [# of logical processors]", default=8)
51 | parser.add_argument('-outdir', type=str, help='output directory', default="coasm_results")
52 | parser.add_argument('-prefix', type=str, help='coassembly prefix', default="megahit_coasm.out")
53 | parser.add_argument('-queue', type=str, help='submit queue', default='st.q')
54 | parser.add_argument('-project', type=str, help='project id', default='F16ZQSB1SY2779')
55 |     parser.add_argument('-resource', type=str, help='resource requirement', default='vf=30G,p=8')
56 |
57 | args = parser.parse_args()
58 |
59 | assert int(args.resource.split("=")[2]) == args.thread, "please let p number equal thread number"
60 |
61 | #if args.asm:
62 | # assembly(args.fqlist)
63 |
64 | if args.coasm:
65 | coassembly(args.fqlist, args.thread, args.outdir, args.prefix, args.queue, args.project, args.resource)
66 |
67 | if __name__ == '__main__':
68 | main()
--------------------------------------------------------------------------------
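A usage sketch for the coassembly path of scripts/megahit_sge.py; the fastq paths are placeholders, megahit must be on PATH (shutil.which would otherwise return None), and the call only writes the two shell scripts without submitting anything:

    from megahit_sge import coassembly

    with open("coasm.fqlist", "w") as fh:
        fh.write("s1.clean.1.fq.gz\ts1.clean.2.fq.gz\n")
        fh.write("s2.clean.1.fq.gz\ts2.clean.2.fq.gz\n")

    # writes ./megahit_coasm.sh and ./megahit_coasm_submit.sh in the current directory
    coassembly("coasm.fqlist", 8, "coasm_results", "megahit_coasm.out",
               "st.q", "F16ZQSB1SY2779", "vf=30G,p=8")
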
/scripts/split_mummer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import errno
4 | import os
5 | import shutil
6 |
7 | from asub import submit_job
8 | from split_fx import split_fasta
9 |
10 |
11 | def gen_job(qry_fa, min_cluster, split_num, split_dir, job_dir, results_dir):
12 | nucmer = shutil.which("nucmer")
13 | for i in range(1, split_num + 1):
14 | # split/split_1.fa
15 | # job/mummer_1.sh
16 | # split/split_2.fa
17 | # job/mummer_2.sh
18 | # results/nucmer_1.delta
19 | job_sh = os.path.join(job_dir, "mummer_%i.sh" % (i))
20 | ref_fa = os.path.join(split_dir, "split_%i.fa" % (i))
21 | prefix = os.path.join(results_dir, "nucmer_%i" % (i))
22 | with open(job_sh, 'w') as job_h:
23 | job_h.write("%s -maxmatch -c %d %s %s -p %s\n" % (nucmer, min_cluster, ref_fa, qry_fa, prefix))
24 |
25 | # TODO
26 | # def merge():
27 |
28 | def main():
29 | parser = argparse.ArgumentParser(description='''split reference, submit mummer array job to SGE, finally merge mummer results''')
30 | parser.add_argument('-ref', type=str, help='reference fasta file')
31 | parser.add_argument('-qry', type=str, help='query fasta file')
32 | parser.add_argument('-c', type=int, help='Sets the minimum length of a cluster of matches, default: 65', default=65)
33 | parser.add_argument('-size', type=int, help='how many seq records split into a group, default: 10000', default=10000)
34 | parser.add_argument('-outdir', type=str, help='output directory, default: ./', default="./")
35 | parser.add_argument('-queue', type=str, help='submit queue, default: st.q', default='st.q')
36 | parser.add_argument('-project', type=str, help='project id, default: F16ZQSB1SY2779', default='F16ZQSB1SY2779')
37 |     parser.add_argument('-resource', type=str, help='resource requirement, default: vf=1G,p=1', default='vf=1G,p=1')
38 | args = parser.parse_args()
39 |
40 | # make split, job, results dirs
41 | split_dir = os.path.join(os.path.abspath(args.outdir), "split")
42 | try:
43 | os.makedirs(split_dir)
44 | except OSError as e:
45 | if e.errno != errno.EEXIST:
46 | raise
47 | job_dir = os.path.join(os.path.abspath(args.outdir), "job")
48 | try:
49 | os.makedirs(job_dir)
50 | except OSError as e:
51 | if e.errno != errno.EEXIST:
52 | raise
53 | results_dir = os.path.join(os.path.abspath(args.outdir), "results")
54 | try:
55 | os.makedirs(results_dir)
56 | except OSError as e:
57 | if e.errno != errno.EEXIST:
58 | raise
59 | qry_fa = os.path.abspath(args.qry)
60 |
61 | # split reference fasta
62 | split_num = split_fasta(args.ref, args.size, split_dir, True)
63 | gen_job(qry_fa, args.c, split_num, split_dir, job_dir, results_dir)
64 | submit_job("mummer", split_num, args.queue, args.project, args.resource, job_dir)
65 |
66 | if __name__ == '__main__':
67 | main()
68 |
--------------------------------------------------------------------------------
/scripts/assembly_info.r:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | library(ggplot2)
3 | library(dplyr)
4 | library(tidyr)
5 | library(readr)
6 | library(stringr)
7 | library(argparser)
8 | library(here)
9 |
10 | parse_asm <- function(path_f)
11 | {
12 |     # the pipeline below is the last expression of the function body, so its value is returned
13 |     read_delim(path_f, delim = '\t') %>%
14 | arrange(scaf_L50) %>%
15 | select(
16 | filename, n_scaffolds, scaf_bp,
17 | scaf_N50, scaf_L50,
18 | scaf_N90, scaf_L90,
19 | scaf_max, scaf_n_gt50K, scaf_pct_gt50K,
20 | gc_avg, gc_std)
21 | }
22 |
23 | average_asm <- function(asm_df)
24 | {
25 |     # the pipeline below is the last expression of the function body, so its value is returned
26 |     asm_df %>%
27 | select(
28 | n_scaffolds, scaf_bp,
29 | scaf_N50, scaf_L50,
30 | scaf_N90, scaf_L90, scaf_max,
31 | scaf_n_gt50K, scaf_pct_gt50K,
32 | gc_avg, gc_std) %>%
33 | summarise(
34 | n_scaffolds_average = mean(n_scaffolds),
35 | scaf_bp_average = mean(scaf_bp),
36 | scaf_N50_average = mean(scaf_N50),
37 | scaf_L50_average = mean(scaf_L50),
38 | scaf_N90_average = mean(scaf_N90),
39 | scaf_L90_average = mean(scaf_L90),
40 | scaf_max_average = mean(scaf_max),
41 | scaf_n_gt50K_average = mean(scaf_n_gt50K),
42 | scaf_pct_gt50K_average = mean(scaf_pct_gt50K),
43 | gc_avg_average = mean(gc_avg),
44 | gc_std_average = mean(gc_std)) %>%
45 | gather(key, value) %>%
46 | mutate(value_human = value / 1000)
47 | }
48 |
49 | asm_boxplot <- function(df, title)
50 | {
51 | p <-
52 | df %>%
53 | gather(key, value, -filename) %>%
54 | mutate(key = factor(
55 | key,
56 | levels = c(
57 | "n_scaffolds", "scaf_bp",
58 | "scaf_N50", "scaf_L50",
59 | "scaf_N90", "scaf_L90",
60 | "scaf_max", "scaf_n_gt50K", "scaf_pct_gt50K",
61 | "gc_avg", "gc_std"))) %>%
62 | ggplot(., aes(key, value)) +
63 | geom_boxplot(aes(fill = key), outlier.size = 0.5) +
64 | geom_jitter(size = 1, width = 0.25) +
65 | facet_wrap(~ key, scales = "free") +
66 | theme(
67 | panel.grid = element_blank(),
68 | axis.text.x = element_blank(),
69 | axis.ticks.x = element_blank(),
70 | axis.title = element_blank(),
71 | legend.title = element_blank()) +
72 | ggtitle(title)
73 | return(p)
74 | }
75 |
76 | parser <- arg_parser("plot assembly statistics") %>%
77 | add_argument("--assembly_info", help="assembly statistics info table") %>%
78 | add_argument("--pdf", help="assembly statistics plot", default="assembly_statistics.pdf")
79 |
80 | args <- parse_args(parser)
81 | asm_df <- parse_asm(args$assembly_info)
82 | average_asm_df <- average_asm(asm_df)
83 | plot <- asm_boxplot(asm_df, "8 soil and 2 wood samples megahit assembly statistics")
84 | ggsave(args$pdf, plot, width = 10, height = 10)
85 |
--------------------------------------------------------------------------------
/metapi/predictor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import gzip
5 | import re
6 | import sys
7 | import argparse
8 | from Bio import SeqIO
9 |
10 |
11 | def parse_gff(gff_file, min_len):
12 | '''
13 |     extract protein (pep) ids from a prodigal-generated GFF file, keeping only genes on contigs that pass the minimal length cutoff
14 | '''
15 | save = False
16 | min_len = int(min_len)
17 | pep_id_list = []
18 | with gzip.open(gff_file, "rt") as ih:
19 | for line in ih:
20 | seq_len = 0
21 |
22 |             if line.startswith("##") or line.startswith("# Model Data") or line.strip() == "":
23 | continue
24 |
25 | elif line.startswith("# Sequence Data"):
26 | line_split = line.strip().split(";")
27 | for token in line_split:
28 | if "seqlen=" in token:
29 | seq_len = int(token[token.find("=") + 1:])
30 | if seq_len < min_len:
31 | save = False
32 | else:
33 | save = True
34 | elif save:
35 | line_split = re.split("\\s+", line.strip())
36 | seq_id = line_split[0]
37 | trans_id = line_split[-1].split(";")[0].split("_")[-1]
38 | pep_id = f'''{seq_id}_{trans_id}'''
39 | pep_id_list.append(pep_id)
40 | else:
41 | continue
42 | return pep_id_list
43 |
44 |
45 | def extract_faa(faa_file, pep_id_list, out_file, assembly_group=None):
46 | if os.path.dirname(out_file) != "":
47 | os.makedirs(os.path.dirname(out_file), exist_ok=True)
48 |
49 | with gzip.open(out_file, "wt") as oh:
50 | with gzip.open(faa_file, "rt") as ih:
51 | for seq in SeqIO.parse(ih, "fasta"):
52 | if seq.id in pep_id_list:
53 | if assembly_group is not None:
54 | seq.id = f'''{assembly_group}C{seq.id}'''
55 | seq.name = f'''{assembly_group}C{seq.name}'''
56 | seq.description = f'''{assembly_group}C{seq.description}'''
57 | SeqIO.write(seq, oh, "fasta")
58 |
59 |
60 | def main():
61 | parser = argparse.ArgumentParser("PEP extractor")
62 | parser.add_argument("--faa-file", dest="faa_file", type=str, required=True, help="protein file, gzipped")
63 | parser.add_argument("--gff-file", dest="gff_file", type=str, required=True, help="gff file, gzipped")
64 | parser.add_argument("--min-contig", dest="min_contig", default=2000, type=int, help="minimal contig length, default: 2000")
65 | parser.add_argument("--out-file", dest="out_file", type=str, required=True, help="output protein file, gzipped")
66 | args = parser.parse_args()
67 |
68 | pep_id_list = parse_gff(args.gff_file, args.min_contig)
69 | if len(pep_id_list) > 0:
70 | extract_faa(args.faa_file, pep_id_list, args.out_file)
71 | else:
72 | sys.exit("Emplty protein file after contigs length control")
73 |
74 |
75 | if __name__ == '__main__':
76 | main()
--------------------------------------------------------------------------------
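A minimal sketch of driving metapi/predictor.py programmatically instead of through its CLI; the gzipped prodigal output names are placeholders:

    from metapi.predictor import parse_gff, extract_faa

    gff_file = "sample1.prodigal.gff.gz"  # hypothetical gzipped prodigal GFF
    faa_file = "sample1.prodigal.faa.gz"  # hypothetical gzipped prodigal proteins

    pep_ids = parse_gff(gff_file, min_len=2000)
    if pep_ids:
        # prefix ids with an assembly group tag and write the filtered, gzipped protein set
        extract_faa(faa_file, pep_ids, "sample1.min2000.faa.gz", assembly_group="s1")
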
/metapi/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from metapi.configer import metaconfig
4 | from metapi.configer import parse_yaml
5 | from metapi.configer import update_config
6 | from metapi.configer import custom_help_formatter
7 |
8 | from metapi.tooler import parse
9 | from metapi.tooler import merge
10 |
11 | from metapi.simulator import parse_genomes
12 | from metapi.simulator import get_simulate_info
13 | from metapi.simulator import simulate_short_reads
14 |
15 | from metapi.sampler import HEADERS
16 | from metapi.sampler import parse_samples
17 | from metapi.sampler import get_reads
18 | from metapi.sampler import get_sample_id
19 | from metapi.sampler import get_sample_id_
20 | from metapi.sampler import get_samples_id_by_assembly_group
21 | from metapi.sampler import get_samples_id_by_binning_group
22 | from metapi.sampler import get_samples_id_by_assembly_and_binning_group
23 | from metapi.sampler import get_assembly_group_by_binning_group
24 | from metapi.sampler import get_binning_group_by_assembly_group
25 | from metapi.sampler import get_multibinning_group_by_assembly_group
26 |
27 | from metapi.sampler import get_raw_input_list
28 | from metapi.sampler import get_raw_input_dict
29 |
30 | from metapi.sampler import get_samples_for_assembly_list
31 | from metapi.sampler import get_samples_for_assembly_dict
32 | from metapi.sampler import get_samples_for_assembly_megahit
33 | from metapi.sampler import get_samples_for_assembly_idba_ud
34 | from metapi.sampler import get_samples_for_assembly_spades
35 | from metapi.sampler import get_samples_for_assembly_plass
36 | from metapi.sampler import get_samples_for_assembly_opera_ms
37 | from metapi.sampler import get_samples_for_metaquast
38 |
39 | from metapi.sampler import get_samples_bax
40 | from metapi.sampler import get_samples_bax_multi
41 | from metapi.sampler import get_samples_bax_multi_all
42 | from metapi.sampler import get_samples_scaftigs
43 |
44 | from metapi.qcer import change
45 | from metapi.qcer import compute_host_rate
46 | from metapi.qcer import qc_summary_merge
47 | from metapi.qcer import qc_bar_plot
48 | from metapi.qcer import parse_fastp_json
49 |
50 | from metapi.assembler import assembler_init
51 | from metapi.assembler import parse_assembly
52 | from metapi.assembler import parse_assembly_spades_params
53 |
54 | from metapi.aligner import flagstats_summary
55 |
56 | from metapi.predictor import parse_gff
57 | from metapi.predictor import extract_faa
58 |
59 | from metapi.binner import get_binning_info
60 | from metapi.binner import generate_mags
61 | from metapi.binner import extract_mags_report
62 | from metapi.binner import combine_jgi
63 |
64 | from metapi.checkmer import checkm_prepare
65 | from metapi.checkmer import checkm_reporter
66 |
67 | from metapi.classifier import demultiplex
68 | from metapi.classifier import gtdbtk_prepare_from_mags
69 | from metapi.classifier import gtdbtk_prepare_from_genes
70 |
71 | from metapi.taxonomyer import refine_taxonomy
72 |
73 | from metapi.uploader import gen_samples_info
74 | from metapi.uploader import gen_info
75 |
76 | from metapi.__about__ import __version__, __author__
77 |
78 | name = "metapi"
79 |
--------------------------------------------------------------------------------
/scripts/metapi_config_update.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import os
5 | from metapi import configer
6 |
7 |
8 | def update_config(
9 | workdir,
10 | rmhost_host_fasta,
11 | rmhost_bwa_index,
12 | rmhost_bowtie2_index,
13 | kraken2_db,
14 | prof_index_metadata,
15 | prof_taxonomy,
16 | prof_jgi_index,
17 | project_id,
18 | ):
19 |
20 | conf_file = os.path.join(workdir, "config.yaml")
21 | cluster_file = os.path.join(workdir, "cluster.yaml")
22 | conf_file_up = os.path.join(workdir, "config_update.yaml")
23 | cluster_file_up = os.path.join(workdir, "cluster_update.yaml")
24 |
25 | conf = configer.parse_yaml(os.path.join(workdir, "config.yaml"))
26 | cluster = configer.parse_yaml(os.path.join(workdir, "cluster.yaml"))
27 |
28 | conf["params"]["rmhost"]["host_fasta"] = rmhost_host_fasta
29 | conf["params"]["rmhost"]["bwa"]["index_prefix"] = rmhost_bwa_index
30 | conf["params"]["rmhost"]["bowtie2"]["index_prefix"] = rmhost_bowtie2_index
31 | conf["params"]["classify"]["kraken2"]["database"] = kraken2_db
32 | conf["params"]["profiling"]["jgi"]["index_metadata"] = prof_index_metadata
33 | conf["params"]["profiling"]["jgi"]["taxonomy"] = prof_taxonomy
34 | conf["params"]["profiling"]["jgi"]["index_prefix"] = prof_jgi_index
35 |
36 | cluster["__default__"]["project"] = project_id
37 |
38 | configer.update_config(conf_file, conf_file_up, conf, remove=False)
39 | os.rename(conf_file_up, conf_file)
40 |
41 | configer.update_config(cluster_file, cluster_file_up, cluster, remove=False)
42 | os.rename(cluster_file_up, cluster_file)
43 |
44 |
45 | def main():
46 | parser = argparse.ArgumentParser("update metapi config.yaml")
47 | parser.add_argument("-d", "--workdir", type=str, help="work dir", default="./")
48 | parser.add_argument("-a", "--rmhost_host_fasta", type=str, help="rmhost host fasta")
49 | parser.add_argument(
50 | "-i", "--rmhost_bwa_index", type=str, help="rmhost bwa index prefix"
51 | )
52 | parser.add_argument(
53 | "-I", "--rmhost_bowtie2_index", type=str, help="rmhost bowtie2 index prefix"
54 | )
55 | parser.add_argument("-k", "--kraken2_db", type=str, help="kraken2 database")
56 | parser.add_argument(
57 | "-m", "--profiling_index_metadata", type=str, help="profiling index metadata"
58 | )
59 | parser.add_argument(
60 | "-t", "--profiling_taxonomy", type=str, help="profiling taxonomy"
61 | )
62 | parser.add_argument(
63 | "-j", "--profiling_jgi_index", type=str, help="profiling jgi index prefix"
64 | )
65 | parser.add_argument("-p", "--project_id", type=str, help="project id")
66 | args = parser.parse_args()
67 |
68 | update_config(
69 | args.workdir,
70 | args.rmhost_host_fasta,
71 | args.rmhost_bwa_index,
72 | args.rmhost_bowtie2_index,
73 | args.kraken2_db,
74 | args.profiling_index_metadata,
75 | args.profiling_taxonomy,
76 | args.profiling_jgi_index,
77 | args.project_id,
78 | )
79 |
80 |
81 | if __name__ == "__main__":
82 | main()
83 |
--------------------------------------------------------------------------------
/metapi/wrappers/prodigal_wrapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import glob
4 | import os
5 | import stat
6 | import sys
7 | import subprocess
8 | import concurrent.futures
9 |
10 | import pandas as pd
11 | from checkm import prodigal
12 |
13 |
14 | def run_prodigal(input_list):
15 | bin_fa = os.path.abspath(input_list[0])
16 | output_dir = os.path.abspath(input_list[1])
17 |
18 | bin_id = os.path.basename(os.path.splitext(os.path.splitext(bin_fa)[0])[0])
19 |
20 | pep_file = os.path.join(output_dir, bin_id + ".faa")
21 | cds_file = os.path.join(output_dir, bin_id + ".ffn")
22 | gff_file = os.path.join(output_dir, bin_id + ".gff")
23 |
24 | pep_file_gz = pep_file + ".gz"
25 | cds_file_gz = cds_file + ".gz"
26 | gff_file_gz = gff_file + ".gz"
27 |
28 | prodigal_runner = prodigal.ProdigalRunner(output_dir)
29 | prodigal_runner.aaGeneFile = pep_file
30 | prodigal_runner.ntGeneFile = cds_file
31 | prodigal_runner.gffFile = gff_file
32 |
33 | best_translation_table = prodigal_runner.run(bin_fa, True)
34 |
35 | if os.path.exists(pep_file) and (os.path.getsize(pep_file) > 0):
36 | subprocess.run(f'''pigz -f {pep_file}''', shell=True)
37 | if os.path.exists(cds_file) and (os.path.getsize(cds_file) > 0):
38 | subprocess.run(f'''pigz -f {cds_file}''', shell=True)
39 | if os.path.exists(gff_file) and (os.path.getsize(gff_file) > 0):
40 | subprocess.run(f'''pigz -f {gff_file}''', shell=True)
41 | else:
42 | subprocess.run(f'''rm -rf {pep_file}''', shell=True)
43 | subprocess.run(f'''rm -rf {cds_file}''', shell=True)
44 | subprocess.run(f'''rm -rf {gff_file}''', shell=True)
45 |
46 | if best_translation_table in [4, 11]:
47 | if (os.path.exists(pep_file_gz)) and (os.path.exists(cds_file_gz)) and (os.path.exists(gff_file_gz)) and (os.stat(pep_file_gz)[stat.ST_SIZE]) > 0:
48 | return (bin_id, bin_fa, pep_file_gz, best_translation_table)
49 | else:
50 | return None
51 | else:
52 | if (os.path.exists(pep_file_gz)) and (os.path.exists(cds_file_gz)) and (os.path.exists(gff_file_gz)) and (os.stat(pep_file_gz)[stat.ST_SIZE]) > 0:
53 | return (bin_id, bin_fa, pep_file_gz, f"unknown: {best_translation_table}")
54 | else:
55 | return None
56 |
57 |
58 | workers = int(sys.argv[1])
59 | input_mags_dir = os.path.dirname(sys.argv[2])
60 | output_done = sys.argv[3]
61 | output_dir = os.path.dirname(output_done)
62 |
63 | bin_list = glob.glob(input_mags_dir + "/*.fa.gz")
64 |
65 | input_list = []
66 | for bin_fa in bin_list:
67 | input_list.append((bin_fa, output_dir))
68 |
69 | table_list = []
70 |
71 |
72 | subprocess.run(f'''rm -rf {output_dir}''', shell=True)
73 | subprocess.run(f'''mkdir -p {output_dir}''', shell=True)
74 |
75 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
76 | for table_df in executor.map(run_prodigal, input_list):
77 | if table_df is not None:
78 | table_list.append(table_df)
79 |
80 | table_df = pd.DataFrame(table_list, columns=["bin_id", "bin_file", "pep_file", "best_translation_table"])
81 | table_df.to_csv(output_done, sep="\t", index=False)
--------------------------------------------------------------------------------
/scripts/merge_checkm_out.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import pandas as pd
5 | import re
6 | from glob import glob
7 | import sys
8 | from pprint import pprint
9 |
10 |
11 | def merge(checkm_list, sort_by):
12 | df = pd.DataFrame()
13 | if re.search(r'\*', checkm_list[0]):
14 | checkm_list_ = glob(checkm_list[0])
15 | else:
16 | checkm_list_ = checkm_list
17 |     for checkm_file in checkm_list_:
18 |         # collect rows in a list (DataFrame.append was removed in pandas 2.0); numeric columns are cast to float so sorting is numeric
19 |         rows = []
20 |         with open(checkm_file, 'r') as ih:
21 |             print("parsing %s" % checkm_file)
22 |             next(ih), next(ih), next(ih)
23 |             for line in ih:
24 |                 if not line.startswith("--"):
25 |                     line_list = re.split(r'\s+', line.strip())
26 |                     rows.append({"bin_id": line_list[0],
27 |                                  "marker_lineage": "-".join(line_list[1:3]),
28 |                                  "genomes": line_list[3],
29 |                                  "markers": line_list[4],
30 |                                  "marker_sets": line_list[5],
31 |                                  "0": line_list[6],
32 |                                  "1": line_list[7],
33 |                                  "2": line_list[8],
34 |                                  "3": line_list[9],
35 |                                  "4": line_list[10],
36 |                                  "5+": line_list[11],
37 |                                  "completeness": float(line_list[12]),
38 |                                  "contamination": float(line_list[13]),
39 |                                  "strain_heterogeneity": float(line_list[14])})
40 |         checkm_df = pd.DataFrame(rows, columns=["bin_id", "marker_lineage", "genomes", "markers", "marker_sets",
41 |                                                 "0", "1", "2", "3", "4", "5+", "completeness", "contamination", "strain_heterogeneity"])
42 |         df = pd.concat([df, checkm_df])
43 | if sort_by == "completeness":
44 | df = df.sort_values(by=["completeness", "contamination", "strain_heterogeneity"],
45 | ascending=[False, True, True])
46 | else:
47 | df = df.sort_values(by="bin_id")
48 | return df
49 |
50 |
51 | def main():
52 | parser = argparse.ArgumentParser("merge many checkm out txt to one")
53 | parser.add_argument('-l', '--list', nargs='*', help='checkm out txt list, separated by spaces')
54 | parser.add_argument('-o', '--output', default=sys.stdout,
55 | help='merge results, if not specific it, will print stdout')
56 | parser.add_argument('-s', '--sort', choices=['bin_id', 'completeness'], default="completeness",
57 | help='sort merged checkm output')
58 | args = parser.parse_args()
59 |
60 | df = merge(args.list, args.sort)
61 | df.to_csv(args.output, sep='\t', index=False)
62 |
63 |
64 | if __name__ == '__main__':
65 | main()
66 |
--------------------------------------------------------------------------------
/metapi/simulator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import gzip
5 | import sys
6 | import subprocess
7 | import pandas as pd
8 | from Bio import SeqIO
9 |
10 |
11 | def parse_genomes(samples_tsv, output_dir, check_samples=False):
12 | header = ["id", "genome", "abundance", "reads_num", "model"]
13 |
14 | genomes_df = pd.read_csv(samples_tsv, sep="\t").set_index("id", drop=False)
15 |
16 | cancel = False
17 | for i in header:
18 | if i not in genomes_df.columns:
19 | cancel = True
20 | print(f'Error: {i} not in {genomes_df.columns} header')
21 |
22 | for i in genomes_df.index.unique():
23 | if "." in i:
24 | cancel = True
25 | print('Error: sample id %s contains ".", please remove all "."' % i)
26 |
27 | if cancel:
28 | sys.exit(1)
29 |
30 | genomes_df["fq1"] = genomes_df.apply(
31 | lambda x: os.path.join(
32 | output_dir, "short_reads/%s.simulate.1.fq.gz" % x["id"],
33 | ),
34 | axis=1,
35 | )
36 | genomes_df["fq2"] = genomes_df.apply(
37 | lambda x: os.path.join(
38 | output_dir, "short_reads/%s.simulate.2.fq.gz" % x["id"],
39 | ),
40 | axis=1,
41 | )
42 | return genomes_df
43 |
44 |
45 | def simulate_short_reads(
46 | genomes, output_prefix, r1, r2, abunf, model, reads_num, abundance, threads, logf,
47 | ):
48 | if len(abundance) != 0:
49 | with open(abunf, "w") as outh:
50 | for (g, a) in zip(genomes, abundance):
51 | inh = gzip.open(g, "rt") if g.endswith(".gz") else open(g, "r")
52 | genome = []
53 | total_len = 0
54 | for record in SeqIO.parse(inh, "fasta"):
55 | total_len += len(record.seq)
56 | genome.append((record.id, len(record.seq)))
57 | for s in genome:
58 | outh.write("%s\t%f\n" %
59 | (s[0], float(a) * s[1] / total_len))
60 | inh.close()
61 |
62 | args = (
63 | ["iss", "generate", "--cpus", str(threads), "--genomes"]
64 | + genomes
65 | + ["--n_reads", reads_num, "--model", model, "--output", output_prefix]
66 | )
67 |
68 | if len(abundance) != 0:
69 | args += ["--abundance_file", abunf]
70 | print(" ".join(args))
71 | env = os.environ.copy()
72 | proc = subprocess.Popen(
73 | args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, encoding="utf-8",
74 | )
75 | output, error = proc.communicate()
76 |
77 | with open(logf, "w") as logh:
78 | logh.write(error)
79 |
80 | if proc.returncode == 0:
81 | if len(abundance) == 0:
82 | default_abunf = output_prefix + "_abundance.txt"
83 | if os.path.exists(default_abunf):
84 | os.rename(default_abunf, abunf)
85 | subprocess.run(f"pigz -p {threads} {output_prefix}_R1.fastq", shell=True)
86 | subprocess.run(f"pigz -p {threads} {output_prefix}_R2.fastq", shell=True)
87 | os.rename(f"{output_prefix}_R1.fastq.gz", r1)
88 | os.rename(f"{output_prefix}_R2.fastq.gz", r2)
89 | else:
90 | sys.exit(1)
91 |
92 |
93 | def get_simulate_info(genomes_df, wildcards, col):
94 | return genomes_df.loc[[wildcards.sample], col].dropna().tolist()
95 |
--------------------------------------------------------------------------------
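A sketch of wiring parse_genomes() and simulate_short_reads() from metapi/simulator.py together for the first sample of a samples.tsv (headers: id, genome, abundance, reads_num, model); all paths are placeholders and iss (InSilicoSeq) plus pigz must be on PATH:

    import os
    from metapi.simulator import parse_genomes, simulate_short_reads

    genomes_df = parse_genomes("samples.tsv", "simulate_out")
    os.makedirs("simulate_out/short_reads", exist_ok=True)

    row = genomes_df.iloc[0]
    prefix = "simulate_out/short_reads/%s.simulate" % row["id"]
    simulate_short_reads(
        genomes=[row["genome"]],
        output_prefix=prefix,
        r1=row["fq1"], r2=row["fq2"],
        abunf=prefix + ".abundance.txt",
        model=row["model"], reads_num=str(row["reads_num"]),
        abundance=[row["abundance"]], threads=8,
        logf=prefix + ".iss.log")
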
/scripts/clstr_szie_tab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #from Bio.SeqIO.FastaIO import SimpleFastaParser
3 | import argparse
4 | import re
5 |
6 | pattern = re.compile(r'\d+\t(\d+)[a-z]{2}, >(.+)\.\.\. \*')
7 | #pattern = re.compile(r'\d+\t(\d+)[a-z]{2},\s>(.+)\.\.\.\s\*')
8 | #pattern = re.compile(r'\d+\t(\d+)nt, >(.+)\.\.\. \*')
9 | #pattern = re.compile(r'\d+\t(\d+)nt,\s>(.+)\.\.\.\s\*')
10 |
11 | # this parser base code comes from Bio.SeqIO.FastaIO.SimpleFastaParser :)
12 | def cdhit_clstr_parser(handle):
13 | """Generator function to iterate over cdhit clstr records (as string tuple)
14 |
15 | >Cluster 0
16 | 0 1131322nt, >k119_12676... *
17 | 1 84315nt, >k119_210239... at -/99.66%
18 | 2 73592nt, >k119_187067... at +/99.86%
19 | 3 70665nt, >k119_160147... at -/99.32%
20 | 4 66352nt, >k119_217379... at +/99.89%
21 | 5 63337nt, >k119_125106... at +/99.28%
22 | 6 63232nt, >k119_150147... at -/99.80%
23 | 7 59840nt, >k119_197728... at +/99.04%
24 | 8 59306nt, >k119_59391... at -/99.00%
25 | >Cluster 5343379
26 | 0 2000nt, >k119_192744... *
27 | >Cluster 5343380
28 | 0 2000nt, >k119_222307... *
29 | >Cluster 5343381
30 | 0 2000nt, >k119_232332... *
31 | >Cluster 5343382
32 | 0 2000nt, >k119_241124... *
33 | >Cluster 5343383
34 | 0 2000nt, >k119_253638... *
35 |
36 | """
37 | #Skip any text before the first record (e.g. blank lines, comments)
38 | seq_id = ""
39 | seq_len = 0
40 | clstr_size = 0
41 | while True:
42 | line = handle.readline()
43 | if line == "":
44 | return # Premature end of file, or just empty?
45 | if line[0] == ">":
46 | break
47 |
48 | while True:
49 | clstr_size = 0
50 | if line[0] != ">":
51 | raise ValueError(
52 | "Records in cdhit cluster file(fasta format) should start with '>' character")
53 | clstr_name = line[1:].rstrip()
54 | line = handle.readline()
55 | while True:
56 | if not line:
57 | break
58 | if line[0] == ">":
59 | break
60 | # lines contain many cluster records
61 | #lines.append(line.rstrip())
62 | clstr_size += 1
63 | matches = re.search(pattern, line)
64 | if matches:
65 | seq_len = matches.group(1)
66 | seq_id = matches.group(2)
67 |
68 | line = handle.readline()
69 | yield clstr_name, seq_id, seq_len, clstr_size
70 |
71 | if not line:
72 | return # StopIteration
73 |
74 | assert False, "Should not reach this line"
75 |
76 | def clstr_size_tab(clstr_file, clstr_size_out):
77 | with open(clstr_size_out, 'w') as out_handle:
78 | out_handle.write("cluster_name\tcluster_size\tsequence_id\tsequence_length\n")
79 | with open(clstr_file, 'r') as clstr_handle:
80 | for clstr_name, seq_id, seq_len, clstr_size in cdhit_clstr_parser(clstr_handle):
81 | clstr_name = "cluster_" + clstr_name.split(' ')[1]
82 | out_handle.write(clstr_name + "\t" + str(clstr_size) + "\t" + seq_id + "\t" + str(seq_len) + "\n")
83 |
84 | def main():
85 |     parser = argparse.ArgumentParser(description='parse a cd-hit cluster file and get the cluster size distribution')
86 | parser.add_argument('--clstr', type=str, help='cluster file')
87 | parser.add_argument('--out', type=str, help='cluster size distribution')
88 | args = parser.parse_args()
89 |
90 | clstr_size_tab(args.clstr, args.out)
91 |
92 | if __name__ == '__main__':
93 | main()
94 |
95 |
96 |
--------------------------------------------------------------------------------
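A small sketch of cdhit_clstr_parser() from scripts/clstr_szie_tab.py running on an in-memory .clstr snippet (the cluster records follow the layout shown in its docstring, with tab-separated fields):

    import io
    from clstr_szie_tab import cdhit_clstr_parser

    clstr_text = (">Cluster 0\n"
                  "0\t1131322nt, >k119_12676... *\n"
                  "1\t84315nt, >k119_210239... at -/99.66%\n"
                  ">Cluster 1\n"
                  "0\t2000nt, >k119_192744... *\n")

    for clstr_name, seq_id, seq_len, clstr_size in cdhit_clstr_parser(io.StringIO(clstr_text)):
        print(clstr_name, seq_id, seq_len, clstr_size)
    # -> Cluster 0 k119_12676 1131322 2
    #    Cluster 1 k119_192744 2000 1
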
/metapi/wrappers/gtdbtk_postprocess.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import subprocess
4 | import concurrent.futures
5 | from pprint import pprint
6 |
7 |
8 | def parse(stats_file):
9 | if os.path.exists(stats_file):
10 | try:
11 | df = pd.read_csv(stats_file, sep="\t")
12 | except pd.errors.EmptyDataError:
13 | print("%s is empty, please check" % stats_file)
14 | return None
15 |
16 | if not df.empty:
17 | return df
18 | else:
19 | return None
20 | else:
21 | print("%s is not exists" % stats_file)
22 | return None
23 |
24 |
25 | def merge(input_list, func, workers, **kwargs):
26 | df_list = []
27 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
28 | for df in executor.map(func, input_list):
29 | if df is not None:
30 | df_list.append(df)
31 |
32 | df_ = pd.concat(df_list)
33 |
34 | if "output" in kwargs:
35 | df_.to_csv(kwargs["output"], sep="\t", index=False)
36 | return df_
37 |
38 |
39 | threads = int(snakemake.threads)
40 |
41 | gtdb_done_list = snakemake.input["gtdb_done"]
42 |
43 | gtdb_to_ncbi_script = snakemake.params["gtdb_to_ncbi_script"]
44 | metadata_archaea = snakemake.params["metadata_archaea"]
45 | metadata_bacteria = snakemake.params["metadata_bacteria"]
46 |
47 | table_gtdb = snakemake.output["table_gtdb"]
48 | table_ncbi = snakemake.output["table_ncbi"]
49 | table_all = snakemake.output["table_all"]
50 |
51 | os.makedirs(os.path.dirname(table_all), exist_ok=True)
52 |
53 | gtdb_list = []
54 | ncbi_list = []
55 |
56 | for i in gtdb_done_list:
57 | out_dir = os.path.dirname(i)
58 | archaea_tsv = os.path.join(out_dir, "gtdbtk.archaea.summary.tsv")
59 | bacteria_tsv = os.path.join(out_dir, "gtdbtk.bacteria.summary.tsv")
60 |
61 | if os.path.exists(archaea_tsv):
62 | gtdb_list.append(archaea_tsv)
63 | if os.path.exists(bacteria_tsv):
64 | gtdb_list.append(bacteria_tsv)
65 |
66 | gtdb_to_ncbi_summary = os.path.join(out_dir, "gtdbtk.ncbi.summary.tsv")
67 | gtdb_to_ncbi_log = os.path.join(out_dir, "gtdbtk.to.ncbi.log")
68 |
69 | archaea_cmd = "--ar53_metadata_file"
70 | if "ar122" in os.path.realpath(archaea_tsv):
71 | archaea_cmd = "--ar122_metadata_file"
72 |
73 | bacteria_cmd = "--bac120_metadata_file"
74 |
75 | gtdb_to_ncbi_cmd = \
76 | f'''
77 | python {gtdb_to_ncbi_script} \
78 | --gtdbtk_output_dir {out_dir} \
79 | --output_file {gtdb_to_ncbi_summary} \
80 | {archaea_cmd} {metadata_archaea} \
81 | {bacteria_cmd} {metadata_bacteria} \
82 | > {gtdb_to_ncbi_log}
83 | '''
84 | subprocess.run(gtdb_to_ncbi_cmd, shell=True)
85 |
86 | if os.path.exists(gtdb_to_ncbi_summary):
87 | ncbi_list.append(gtdb_to_ncbi_summary)
88 |
89 |
90 | if len(gtdb_list) > 0:
91 | table_gtdb_df = merge(gtdb_list, parse, threads, output=table_gtdb)
92 | else:
93 | print(f"No {table_gtdb} generate")
94 |
95 | if len(ncbi_list) > 0:
96 | table_ncbi_df = merge(ncbi_list, parse, threads, output=table_ncbi)
97 | else:
98 | print(f"No {table_ncbi} generate")
99 |
100 |
101 | table_gtdb_df = table_gtdb_df.rename(columns={"classification": "GTDB classification"})
102 | pprint(table_gtdb_df)
103 |
104 | table_ncbi_df = table_ncbi_df.rename(columns={"Genome ID": "user_genome"})
105 | pprint(table_ncbi_df)
106 |
107 | table_all_df = pd.merge(
108 | table_gtdb_df, table_ncbi_df, how="inner",
109 | on=["user_genome", "GTDB classification"])#\
110 |
111 | table_all_df.to_csv(table_all, sep="\t", index=False)
112 |
--------------------------------------------------------------------------------
/scripts/rename_fasta_id.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from Bio import SeqIO, bgzf
3 | from Bio.SeqIO.FastaIO import FastaIterator, FastaWriter
4 | import gzip
5 | import sys
6 | import os
7 | import argparse
8 |
9 | #with open(sys.argv[2], 'w') as fa_out:
10 | # with open(sys.argv[1], 'r') as fa_in:
11 | # for rec in SeqIO.parse(fa_in, 'fasta'):
12 | # (description, sample_name) = rec.description.split("\t")
13 | # rec.description = sample_name + "_" + description
14 | # rec.id = rec.description.split(' ')[0]
15 | # SeqIO.write(rec, fa_out, 'fasta')
16 |
17 | def change_header_sample(title):
18 | # title(total header) -> (id, name, description)
19 | # R0170300050_tooth_RA.contigs.fa
20 | #from > k119_1 flag=1 multi=7.0000 len=3284 R0170300050_tooth_RA
21 | #to > R0170300050_tooth_RA_k119_1 flag=1 multi=7.0000 len=3284
22 | (one_line, sample_name) = title.split("\t")
23 | id = sample_name + "_" + title.split(' ')[0]
24 | desc = id + ' ' + ' '.join(one_line.split(' ')[1:])
25 | return id, "", desc
26 |
27 | def change_header_no_sample(title):
28 | # title(total header) -> (id, name, description)
29 | # R0170300050_tooth_RA.contigs.fa
30 | #from > k119_1 flag=1 multi=7.0000 len=3284
31 | #to > R0170300050_tooth_RA_k119_1 flag=1 multi=7.0000 len=3284
32 | id = sample_tag + "_" + title.split(' ')[0]
33 | desc = id + " " + " ".join(title.split(' ')[1:])
34 | return id, "", desc
35 |
36 |
37 | ## rename header framework
38 | ## just change header_function
39 | def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz):
40 | if in_gz:
41 | in_h = gzip.open(fa_in, 'rt')
42 | else:
43 | in_h = open(fa_in, 'r')
44 | if gz:
45 | out_h = bgzf.BgzfWriter(fa_out, 'wb')
46 | else:
47 | out_h = open(fa_out, 'w')
48 | writer = FastaWriter(out_h)
49 | writer.write_header()
50 | for rec in FastaIterator(in_h, title2ids = header_function):
51 | writer.write_record(rec)
52 | writer.write_footer()
53 | out_h.close()
54 | in_h.close()
55 |
56 | def main():
57 | '''
58 |     Why write this script?
59 |     Because megahit always generates contig ids in the k{kmer}_{num} format, which are not unique across samples
60 | '''
61 | parser = argparse.ArgumentParser(description='change fasta file header')
62 | parser.add_argument('-fa', type=str, help='fasta file path')
63 | parser.add_argument('-out', type=str, help='output')
64 | parser.add_argument('-rm', action='store_true', help='delete original fasta file', default=False)
65 | parser.add_argument('-gz', action='store_true', help='compress output fasta file', default=False)
66 | parser.add_argument('-mv', action='store_true', help="rename change id fasta file to original file", default=False)
67 | args = parser.parse_args()
68 |
69 | #assert not args.fa == args.out, "input file name can't equal to output file name"
70 | if (args.out == args.fa) or (not args.out):
71 | args.out = args.fa + ".changeid"
72 | if args.gz:
73 | if not args.out.endswith(".gz"):
74 | args.out = args.out + ".gz"
75 |
76 | in_gz = args.fa.endswith(".gz")
77 | #if args.fa.endswith(".gz"):
78 | # args.gz = True
79 |
80 | global sample_tag
81 | sample_tag = os.path.basename(args.fa).split(".")[0]
82 |
83 | abs_in = os.path.abspath(args.fa)
84 | abs_out = os.path.abspath(args.out)
85 | reheader_fasta(abs_in, abs_out, change_header_no_sample, in_gz, args.gz)
86 |
87 | if args.rm:
88 | os.remove(abs_in)
89 | if args.mv:
90 | if (not in_gz) and args.gz:
91 | abs_in = abs_in + ".gz"
92 | if in_gz and (not args.gz):
93 |             abs_in = abs_in[:-len(".gz")]  # str.rstrip strips a character set, not a suffix
94 | os.rename(abs_out, abs_in)
95 |
96 | if __name__ == '__main__':
97 | main()
--------------------------------------------------------------------------------
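What change_header_no_sample() in scripts/rename_fasta_id.py does to a single megahit-style header, shown on a made-up record; sample_tag is a module-level global that main() normally derives from the input file name, and Biopython must be installed since the module imports Bio at load time:

    import rename_fasta_id as rfi

    rfi.sample_tag = "R0170300050_tooth_RA"  # normally taken from R0170300050_tooth_RA.contigs.fa
    title = "k119_1 flag=1 multi=7.0000 len=3284"
    print(rfi.change_header_no_sample(title))
    # -> ('R0170300050_tooth_RA_k119_1', '', 'R0170300050_tooth_RA_k119_1 flag=1 multi=7.0000 len=3284')
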
/scripts/asub.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # please see https://github.com/lh3/asub
3 | import argparse
4 | import fileinput
5 | import os
6 | import re
7 | import shutil
8 | import stat
9 | import subprocess
10 | import sys
11 | from datetime import datetime
12 |
13 | __author__ = 'Jie Zhu, Jiahui Zhu'
14 | __email__ = 'zhujie@genomics.cn, zhujiahui@genomics.cn'
15 | __version__ = '0.3.1'
16 | __date__ = 'Jun 19, 2018'
17 |
18 |
19 | def parse_job(job_name, job_file, a_job_line, logdir):
20 | with fileinput.input(files=job_file if not job_file is None else ('-', )) as in_h:
21 | job_num = 0
22 | for one_line in in_h:
23 | job_num += 1
24 | job_f = os.path.join(logdir, job_name.rstrip(".sh") + "_" + str(job_num) + ".sh")
25 | with open(job_f, 'w') as job_h:
26 | job_h.write(one_line)
27 | while fileinput.lineno() % a_job_line != 0:
28 | job_h.write(next(in_h))
29 | #for i in range(1, a_job_line):
30 | # job_h.write(next(in_h))
31 | return job_num
32 |
33 |
34 | def submit_job(job_name, total_job_num, queue, prj_id, resource, logdir):
35 | submit_f = os.path.join(os.path.curdir, job_name.rstrip(".sh") + "_submit.sh")
36 | array_range = "1-" + str(total_job_num) + ":1"
37 | job_script = os.path.join(logdir, job_name.rstrip(".sh") + "_$SGE_TASK_ID.sh")
38 | num_proc = resource.split('=')[-1]
39 | with open(submit_f, 'w') as submit_h:
40 | submit_h.write('''#!/bin/bash\n\
41 | #$ -clear
42 | #$ -S /bin/bash
43 | #$ -N %s
44 | #$ -cwd
45 | #$ -l %s
46 | #$ -binding linear:%s
47 | #$ -q %s
48 | #$ -P %s
49 | #$ -t %s
50 | jobscript=%s
51 | bash $jobscript\n''' % (job_name, resource, num_proc, queue, prj_id, array_range, job_script))
52 |
53 | os.chmod(submit_f, stat.S_IRWXU)
54 | submit_cmd = shutil.which("qsub") + \
55 | " -e " + os.path.join(logdir, job_name + "_\\$TASK_ID.e") + \
56 | " -o " + os.path.join(logdir, job_name + "_\\$TASK_ID.o") + " " + submit_f
57 | print(submit_cmd)
58 | subprocess.call(submit_cmd, shell=True)
59 |
60 | def main():
61 | '''it is a very simple script to submit array job, but you need supply real run command'''
62 | parser = argparse.ArgumentParser(description='make submit array job easy')
63 | parser.add_argument('-jobfile', nargs='*', help='job file to read, if empty, stdin is used')
64 | parser.add_argument('-jobname', type=str, help='job name', default='job')
65 | parser.add_argument('-jobline', type=int, help='set the number of lines to form a job', default=1)
66 | parser.add_argument('-queue', type=str, help='submit queue', default='st.q')
67 | parser.add_argument('-project', type=str, help='project id', default='F16ZQSB1SY2779')
68 |     parser.add_argument('-resource', type=str, help='resource requirement', default='vf=1G,p=1')
69 | parser.add_argument('-logdir', type=str, help='array job log directory')
70 | args = parser.parse_args()
71 |
72 |     assert re.match(r'vf=[\d\.]+\w,p=\d+', args.resource), "please specify the memory usage and processor number, e.g. vf=1G,p=1"
73 |     assert not re.match(r'^\d+', args.jobname), "array job name cannot start with a digit"
74 |     assert args.jobline >= 1, "jobline must be at least 1"
75 |
76 | args.jobname += "_" + datetime.now().strftime("%Y%m%d%H%M%S")
77 |
78 | if not args.logdir:
79 | args.logdir = args.jobname + "_qsub"
80 | args.logdir = args.logdir.rstrip("/") + "/"
81 |
82 | if os.path.exists(args.logdir):
83 |         shutil.rmtree(args.logdir)  # logdir is a directory, so os.remove() would fail here
84 | os.makedirs(args.logdir)
85 |
86 | total_job_num = parse_job(args.jobname, args.jobfile, args.jobline, args.logdir)
87 | submit_job(args.jobname, total_job_num, args.queue, args.project, args.resource, args.logdir)
88 |
89 |
90 | if __name__ == '__main__':
91 | main()
92 |
--------------------------------------------------------------------------------
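A sketch of how parse_job() in scripts/asub.py chunks a command list into per-task scripts before submit_job() builds the SGE array; note the number of command lines should be a multiple of -jobline, since parse_job pulls the rest of each chunk with next() on the input stream:

    import os
    from asub import parse_job

    os.makedirs("demo_qsub", exist_ok=True)
    with open("demo.jobs", "w") as fh:
        fh.write("echo a\necho b\necho c\necho d\n")  # four commands, two per array task

    n_tasks = parse_job("demo", ["demo.jobs"], 2, "demo_qsub")
    print(n_tasks, sorted(os.listdir("demo_qsub")))
    # -> 2 ['demo_1.sh', 'demo_2.sh']; submit_job("demo", n_tasks, ...) would then qsub a 1-2:1 array
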
/metapi/rules/upload.smk:
--------------------------------------------------------------------------------
1 | if config["upload"]["do"]:
2 | rule upload_generate_samples_info:
3 | input:
4 | config["params"]["samples"]
5 | output:
6 | os.path.join(config["output"]["upload"], "table/MIxS_Samples.xlsx")
7 | run:
8 | metapi.gen_samples_info(SAMPLES, output[0], config)
9 |
10 |
11 | rule upload_md5_short_reads:
12 | input:
13 | alignment_input_with_short_reads
14 | output:
15 | os.path.join(config["output"]["upload"], "md5/short_reads/{sample}.md5")
16 | shell:
17 | '''
18 | md5sum {input} > {output}
19 | '''
20 |
21 |
22 | rule upload_generate_run_info:
23 | input:
24 | expand(os.path.join(
25 | config["output"]["upload"], "md5/short_reads/{sample}.md5"),
26 | sample=SAMPLES_ID_LIST)
27 | output:
28 | os.path.join(config["output"]["upload"], "table/Experiment_Run.xlsx")
29 | threads:
30 | config["upload"]["threads"]
31 | run:
32 | metapi.gen_info(input, output[0], config, threads, "sequencing_run")
33 |
34 |
35 | rule upload_sequencing_all:
36 | input:
37 | os.path.join(config["output"]["upload"], "table/Experiment_Run.xlsx"),
38 | os.path.join(config["output"]["upload"], "table/MIxS_Samples.xlsx")
39 |
40 |
41 | localrules:
42 | upload_generate_samples_info,
43 | upload_generate_run_info,
44 |
45 |
46 | if len(ASSEMBLERS) != 0:
47 | rule upload_md5_scaftigs:
48 | input:
49 | os.path.join(
50 | config["output"]["assembly"],
51 | "scaftigs/{binning_group}.{assembly_group}.{assembler}/{binning_group}.{assembly_group}.{assembler}.scaftigs.fa.gz")
52 | output:
53 | os.path.join(
54 | config["output"]["upload"],
55 | "md5/scaftigs/{assembler}/{binning_group}.{assembly_group}.{assembler}.scaftigs.md5")
56 | shell:
57 | '''
58 | md5sum {input} > {output}
59 | '''
60 |
61 |
62 | rule upload_generate_assembly_info:
63 | input:
64 | expand(os.path.join(
65 | config["output"]["upload"],
66 | "md5/scaftigs/{{assembler}}/{binning_group}.{assembly_group}.{{assembler}}.scaftigs.md5"),
67 | zip,
68 | binning_group=ASSEMBLY_GROUP["binning_group"],
69 | assembly_group=ASSEMBLY_GROUP["assembly_group"])
70 | output:
71 | os.path.join(
72 | config["output"]["upload"],
73 | "table/Genome_Assembly_{assembler}.xlsx")
74 | threads:
75 | config["upload"]["threads"]
76 | run:
77 | metapi.gen_info(input, output[0], config, threads, "assembly")
78 |
79 |
80 | rule upload_assembly_all:
81 | input:
82 | expand(os.path.join(
83 | config["output"]["upload"],
84 | "table/Genome_Assembly_{assembler}.xlsx"),
85 | assembler=ASSEMBLERS)
86 |
87 |
88 | localrules:
89 | upload_generate_assembly_info
90 |
91 |
92 | else:
93 | rule upload_assembly_all:
94 | input:
95 |
96 | else:
97 | rule upload_sequencing_all:
98 | input:
99 |
100 |
101 | rule upload_assembly_all:
102 | input:
103 |
104 |
105 | rule upload_all:
106 | input:
107 | rules.upload_sequencing_all.input,
108 |         rules.upload_assembly_all.input
109 |
110 |
111 | localrules:
112 | upload_sequencing_all,
113 | upload_assembly_all,
114 | upload_all
--------------------------------------------------------------------------------
/scripts/kraken2_demultiplex_summary.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import sys
5 | import pandas as pd
6 | import pickle
7 | from pprint import pprint
8 | from taxadb.taxid import TaxID
9 | from taxadb.names import SciName
10 |
11 |
12 | def main(args_):
13 | parser = argparse.ArgumentParser("a summary of kraken2 demultiplex pickle")
14 | parser.add_argument(
15 | '--rank',
16 | choices=["superkingdom", "phylum", "class", "order", "family", "genus", "species"],
17 | default="genus",
18 | help='mini rank for merge'
19 | )
20 | parser.add_argument(
21 | '--taxadb',
22 | type=str,
23 | help='taxonomy database'
24 | )
25 | parser.add_argument(
26 | '-p',
27 | '--pickle_list',
28 | help='kraken2 demultiplex pickle list'
29 | )
30 | parser.add_argument(
31 | '-o',
32 | '--summary_output'
33 | )
34 | args = parser.parse_args(args_)
35 |
36 | LINEAGES = ["no_rank", "subspecies", "species", "genus", "family",
37 | "order", "class", "phylum", "superkingdom"]
38 |
39 | RANK = args.rank
40 | if not args.rank in LINEAGES[1:]:
41 | print("wrong rank %s" % args.rank)
42 | sys.exit(1)
43 |
44 |     SUB_LINEAGES = LINEAGES[LINEAGES.index(RANK):]
45 |
46 | TAXID_DB = TaxID(dbtype='sqlite', dbname=args.taxadb)
47 | NAMES_DB = SciName(dbtype='sqlite', dbname=args.taxadb)
48 |
49 |
50 | def get_parent_taxid(tax_id, tax_name, level):
51 | if tax_id == 0:
52 | return "no_rank", 0, "unclassified"
53 |
54 | lineage_dict = TAXID_DB.lineage_id(tax_id, ranks=True)
55 |
56 | if lineage_dict is None:
57 | taxid = NAMES_DB.taxid(tax_name)
58 | if taxid is None:
59 | taxid = NAMES_DB.taxid(tax_name.split()[0])
60 | if not taxid is None:
61 | lineage_dict = TAXID_DB.lineage_id(taxid, ranks=True)
62 | else:
63 | return "no_rank", tax_id, tax_name
64 |
65 |         for rank in SUB_LINEAGES:
66 | if rank in lineage_dict:
67 | return rank, lineage_dict[rank], TAXID_DB.lineage_name(lineage_dict[rank], ranks=True)[rank]
68 | return "no_rank", tax_id, "unclassified"
69 |
70 |
71 | summary_dict = {"taxid": [],
72 | "taxa_name": [],
73 | "reads_count": [],
74 | "rank": [],
75 | "parent_taxid": [],
76 | "parent_taxa_name": []}
77 |
78 | with open(args.pickle_list, 'r') as ih:
79 | for line in ih:
80 | with open(line.strip(), 'rb') as ph:
81 | kk2_ranks_counter = pickle.load(ph)
82 | # pprint(kk2_ranks_counter)
83 |
84 | for taxid in kk2_ranks_counter:
85 | if taxid in summary_dict["taxid"]:
86 | summary_dict["reads_count"][summary_dict["taxid"].index(taxid)] += 2 * kk2_ranks_counter[taxid][1]
87 | else:
88 | summary_dict["taxid"].append(taxid)
89 | summary_dict["taxa_name"].append(kk2_ranks_counter[taxid][0])
90 | summary_dict["reads_count"].append(2 * kk2_ranks_counter[taxid][1])
91 |
92 | rank_, taxid_, taxaname_ = get_parent_taxid(taxid, kk2_ranks_counter[taxid][0], RANK)
93 | summary_dict["rank"].append(rank_)
94 | summary_dict["parent_taxid"].append(taxid_)
95 | summary_dict["parent_taxa_name"].append(taxaname_)
96 |
97 | summary_df = pd.DataFrame.from_dict(summary_dict)
98 |
99 | summary_df.to_csv(args.summary_output, index=False, sep='\t')
100 |
101 |
102 | if __name__ == '__main__':
103 | main(sys.argv[1:])
104 |
--------------------------------------------------------------------------------
/scripts/qc_report.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import pandas as pd
4 | import concurrent.futures
5 | import subprocess
6 | import argparse
7 | import os
8 |
9 | def get_reads(df, id_, col_):
10 | return df.loc[[id_], col_].dropna().tolist()
11 |
12 |
13 | def run_(tuple_):
14 | try:
15 | output = subprocess.check_output(
16 | tuple_[0], shell=True, stderr=subprocess.STDOUT, universal_newlines=True
17 | )
18 | except subprocess.CalledProcessError as e:
19 | print(e.output)
20 | return None
21 |
22 | out_list = output.strip().split("\n")
23 | header = out_list[0].split("\t")
24 | data = []
25 |
26 | for line in out_list[1:]:
27 | content = tuple(line.split("\t"))
28 | data.append(content)
29 |
30 | df = pd.DataFrame(data, columns=header)
31 | df["id"] = tuple_[1]
32 | df["step"] = tuple_[2]
33 | df["fq_type"] = tuple_[3]
34 | df["reads"] = tuple_[4]
35 | return df
36 |
37 |
38 | def run(cmd_list, workers):
39 | df_list = []
40 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
41 | for df in executor.map(run_, cmd_list):
42 | if df is not None:
43 | df_list.append(df)
44 | df_ = pd.concat(df_list)
45 | return df_
46 |
47 |
48 | def gen(fastq_list, step, fq_encoding, is_pe=True):
49 | fq_df = pd.read_csv(fastq_list, sep="\t").set_index("id")
50 | cmd_list = []
51 |
52 | for i in fq_df.index.unique():
53 | fq1_list = get_reads(fq_df, i, "fq1")
54 | if is_pe:
55 | fq2_list = get_reads(fq_df, i, "fq2")
56 |
57 | if is_pe:
58 | if len(fq1_list) == 1:
59 | cmd = "seqkit stats -a -T -b -j 1 -E %s %s %s" % (
60 | fq_encoding,
61 | fq1_list[0],
62 | fq2_list[0],
63 | )
64 | cmd_list.append((cmd, i, step, "pe", ["fq1", "fq2"]))
65 | else:
66 | cmd_1 = "cat %s | seqkit stats -a -T -b -j 1 -E %s" % (
67 | " ".join(fq1_list),
68 | fq_encoding,
69 | )
70 | cmd_list.append((cmd_1, i, step, "pe", ["fq1"]))
71 | cmd_2 = "cat %s | seqkit stats -a -T -b -j 1 -E %s" % (
72 | " ".join(fq2_list),
73 | fq_encoding,
74 | )
75 | cmd_list.append((cmd_2, i, step, "pe", ["fq2"]))
76 | else:
77 | cmd = "cat %s | seqkit stats -a -T -b -j 1 -E %s" % (
78 | " ".join(fq1_list),
79 | fq_encoding,
80 | )
81 | cmd_list.append((cmd, i, step, "se", ["fq1"]))
82 | return cmd_list
83 |
84 |
85 | def main():
86 | parser = argparse.ArgumentParser(description="generate quality control report from raw, trimming, rmhost data")
87 | parser.add_argument("--raw_list", help="raw data list, headers: id fq1 fq2")
88 | parser.add_argument("--trimming_list", help="trimming data list, headers: id fq1 fq2")
89 | parser.add_argument("--rmhost_list", help="rmhost data list, headers: id fq1 fq2")
90 | parser.add_argument("--is_se", action='store_true', default=False, help='default: is_pe')
91 | parser.add_argument("--fq_encoding", help="fastq quality encoding, default: sanger", default="sanger")
92 | parser.add_argument("--threads", help="threads, default: 8", default=8)
93 | parser.add_argument("--output", help="qc report output")
94 |
95 | args = parser.parse_args()
96 |
97 | cmd_raw = gen(args.raw_list, "raw", args.fq_encoding, not args.is_se)
98 | cmd_trimming = gen(args.trimming_list, "trimming", args.fq_encoding, not args.is_se)
99 | cmd_rmhost = gen(args.rmhost_list, "rmhost", args.fq_encoding, not args.is_se)
100 |
101 | cmd = cmd_raw + cmd_trimming + cmd_rmhost
102 |
103 | df = run(cmd, args.threads)
104 | df.to_csv(args.output, sep='\t', index=False)
105 |
106 | if __name__ == "__main__":
107 | main()
108 |
--------------------------------------------------------------------------------
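
A minimal sketch of the fastq list read by qc_report.py and of how gen() chooses between one paired "seqkit stats" call and per-mate "cat | seqkit stats" pipelines; every id and path below is hypothetical:

import pandas as pd
from io import StringIO

# same layout qc_report.py loads with pd.read_csv(..., sep="\t").set_index("id")
fastq_list = StringIO(
    "id\tfq1\tfq2\n"
    "s1\t/data/s1.1.fq.gz\t/data/s1.2.fq.gz\n"
    "s2\t/data/s2_lane1.1.fq.gz\t/data/s2_lane1.2.fq.gz\n"
    "s2\t/data/s2_lane2.1.fq.gz\t/data/s2_lane2.2.fq.gz\n"
)
fq_df = pd.read_csv(fastq_list, sep="\t").set_index("id")

for sample_id in fq_df.index.unique():
    fq1 = fq_df.loc[[sample_id], "fq1"].dropna().tolist()
    # one pair -> a single "seqkit stats fq1 fq2"; several lanes -> cat per mate first
    print(sample_id, "single-pair" if len(fq1) == 1 else "multi-lane", fq1)
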
/scripts/clean_statout_to_matrix.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | ## Metagenomics Institute of BGI Research
3 | ## zhujie@genomics.cn
4 | ## 2017-11-29
5 | ## GPL-V3
6 |
7 | import os
8 | import argparse
9 |
10 | ##TODO
11 | ## clean and SE
12 |
13 | def parse_pe_clean_statout(handle, min_l, max_l):
14 | header_list = [str(i) for i in range(min_l, max_l + 1)]
15 | value_dict = {}
16 | for key in header_list:
17 | value_dict[key] = 0
18 |
19 | ## total info
20 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()):
21 | header_list.append(key)
22 | value_dict[key] = value
23 |
24 | ## reads_1 info
25 | tag = True
26 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()):
27 | if tag:
28 | tag = False
29 | else:
30 | key = key + "_1"
31 | header_list.append(key)
32 | value_dict[key] = value
33 |
34 | ## reads_2 info
35 | tag = True
36 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()):
37 | if tag:
38 | tag = False
39 | else:
40 | key = key + "_2"
41 | header_list.append(key)
42 | value_dict[key] = value
43 |
44 | ## reads_single info
45 | tag = True
46 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()):
47 | if tag:
48 | tag = False
49 | else:
50 | key = key + "_single"
51 | header_list.append(key)
52 | value_dict[key] = value
53 |
54 | ## length info
55 | next(handle)
56 | total_filter_base = 0
57 | total_filter_reads = 0
58 | total_filter_reads_len_gt80 = 0
59 | L80 = 0
60 | for line in handle:
61 | line_list = line.strip().split()
62 | reads_len = line_list[0]
63 | reads_num = line_list[1]
64 | total_filter_reads += int(reads_num)
65 | total_filter_base += int(reads_len) * int(reads_num)
66 | if (int(reads_len) >= 80):
67 | total_filter_reads_len_gt80 += int(reads_num)
68 | value_dict[str(reads_len)] = str(reads_num)
69 |
70 | L80 = total_filter_reads_len_gt80 / total_filter_reads
71 | header_list.append("total_filter_base")
72 | value_dict["total_filter_base"] = str(total_filter_base)
73 | header_list.append("total_filter_reads")
74 | value_dict["total_filter_reads"] = str(total_filter_reads)
75 | header_list.append("L80")
76 | value_dict["L80"] = str(L80)
77 |
78 | return (value_dict, header_list)
79 |
80 | def gen_len_matrix(dirname, min_l, max_l):
81 | no_header = True
82 | for fl in os.listdir(dirname):
83 | if fl.endswith("stat_out"):
84 | sample_name = fl.split("/")[-1].split(".")[0]
85 | statout = os.path.join(dirname, fl)
86 | with open(statout, 'r') as h:
87 | tuple_ = parse_pe_clean_statout(h, min_l, max_l)
88 | if no_header:
89 | header = "sample_name\t" + "\t".join(tuple_[1])
90 | print(header)
91 | body = sample_name + "\t" + "\t".join([tuple_[0][key] for key in tuple_[1]])
92 | print(body)
93 | no_header = False
94 | else:
95 | body = sample_name + "\t" + "\t".join([tuple_[0][key] for key in tuple_[1]])
96 | print(body)
97 |
98 | def main():
99 | parser = argparse.ArgumentParser("convert many clean statout to a matrix\n \
100 | e.g: python clean_statout_to_matrix.py ../data/clean_statout -m 100 -n 30 > ../data/length_clean_statout.tsv\n")
101 | parser.add_argument("-d", "--statout_dir", help="a directory contain many samples clean statout file")
102 | parser.add_argument("-m", "--max_len", type=int, help="max reads length")
103 | parser.add_argument("-n", "--min_len", type=int, help="min reads length")
104 | args = parser.parse_args()
105 | gen_len_matrix(args.statout_dir, args.min_len, args.max_len)
106 |
107 | if __name__ == "__main__":
108 | main()
109 |
--------------------------------------------------------------------------------
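
parse_pe_clean_statout() derives three summary columns from the per-length histogram at the end of each statout file: total_filter_base, total_filter_reads and L80, the fraction of filtered reads at least 80 bp long. A toy computation with made-up counts:

# read_length -> read_count, mirroring the per-length rows of a statout file
length_hist = {75: 1000, 80: 3000, 90: 6000}

total_filter_reads = sum(length_hist.values())
total_filter_base = sum(length * count for length, count in length_hist.items())
reads_ge80 = sum(count for length, count in length_hist.items() if length >= 80)

L80 = reads_ge80 / total_filter_reads
print(total_filter_reads, total_filter_base, round(L80, 3))   # 10000 855000 0.9
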
/scripts/extract_bins_from_mgs_profile.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Bio.Alphabet was removed in Biopython 1.78; SeqIO.index_db works without an alphabet
4 | from Bio import SeqIO
5 | import argparse
6 | import re
7 | import os
8 | import time
9 |
10 |
11 | def take_second(elem):
12 | return elem[1]
13 |
14 |
15 | def parse_mgs(mgs_profile):
16 | bins = {}
17 | bins_num = []
18 | with open(mgs_profile, 'r') as ih:
19 | for line in ih:
20 | line_list = re.split(r"\s+|,", line.strip(",|\n"))
21 | bin_id = line_list[0]
22 | contigs_count = int(line_list[1])
23 | bins[bin_id] = []
24 | bins_num.append((bin_id, contigs_count))
25 | for contig_id in line_list[2:]:
26 | bins[bin_id].append(contig_id)
27 | bins_num.sort(key=take_second, reverse=True)
28 | return bins, bins_num
29 |
30 |
31 | def extract(contigs_list, bins, bins_num, head, tail, outdir):
32 | files = []
33 | all_count = 0
34 | with open(contigs_list, 'r') as ih:
35 | for line in ih:
36 | files.append(line.strip())
37 |
38 | begin = time.time()
39 | records = SeqIO.index_db(":memory:", files, "fasta")
40 | end = time.time()
41 | print("index db: %.2f s" % (end - begin))
42 |
43 | begin = time.time()
44 |
45 | if head is not None:
46 | if head > len(bins_num):
47 | count = len(bins_num)
48 | else:
49 | count = head
50 | all_count += count
51 | for i in range(count):
52 | bin_id = bins_num[i][0]
53 | with open(os.path.join(outdir, bin_id + ".fa"), 'w') as oh:
54 | for contig_id in bins[bin_id]:
55 | if contig_id in records:
56 | SeqIO.write(records[contig_id], oh, 'fasta')
57 | else:
58 | print("%s has not find %s" % (bin_id, contig_id))
59 |
60 | if tail is not None:
61 | if tail > len(bins_num):
62 | count = len(bins_num)
63 | else:
64 | count = tail
65 | all_count += count
66 | for i in range(count):
67 | bin_id = bins_num[-(i+1)][0]
68 | with open(os.path.join(outdir, bin_id + ".fa"), 'w') as oh:
69 | for contig_id in bins[bin_id]:
70 | if contig_id in records:
71 | SeqIO.write(records[contig_id], oh, 'fasta')
72 | else:
73 | print("%s has not find %s" % (bin_id, contig_id))
74 |
75 | if (head is None) and (tail is None):
76 | for bin_id in bins:
77 | all_count += 1
78 | with open(os.path.join(outdir, bin_id + ".fa"), 'w') as oh:
79 | for contig_id in bins[bin_id]:
80 | if contig_id in records:
81 | SeqIO.write(records[contig_id], oh, 'fasta')
82 | else:
83 | print("%s has not find %s" % (bin_id, contig_id))
84 |
85 | records.close()
86 | end = time.time()
87 |
88 | print("extract %d bins: %.2f s" % (all_count, end - begin))
89 |
90 |
91 | def main():
92 | parser = argparse.ArgumentParser("get bins fasta from mgs contigs/scafoolds profile")
93 | parser.add_argument('-p', '--profile', type=str, help='mgs contigs/scaffolds profile')
94 | parser.add_argument('-l', '--contigs_list', type=str, help='assembly contigs/scaffolds fasta path list')
95 | parser.add_argument('-o', '--outdir', type=str, help='bins output dir')
96 | parser.add_argument('--head', type=int, default=None, help='head number bins')
97 | parser.add_argument('--tail', type=int, default=None, help='tail number bins')
98 |
99 | args = parser.parse_args()
100 | if not os.path.exists(args.outdir):
101 | os.makedirs(args.outdir, exist_ok=True)
102 |
103 | (bins, bins_num) = parse_mgs(args.profile)
104 |
105 | if (args.head is not None) and (args.tail is not None):
106 | assert args.head + args.tail <= len(bins_num), "too many head or too many tail"
107 |
108 | extract(args.contigs_list, bins, bins_num, args.head, args.tail, args.outdir)
109 |
110 |
111 | if __name__ == '__main__':
112 | main()
113 |
--------------------------------------------------------------------------------
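
parse_mgs() expects one MGS per line: a bin id, the contig count, then the contig ids, separated by whitespace and/or commas. A small sketch of that tokenization on a hypothetical line:

import re

line = "MGS0001 3 contig_12,contig_34 contig_56,\n"
line_list = re.split(r"\s+|,", line.strip(",|\n"))   # same split used by parse_mgs
bin_id, contigs_count = line_list[0], int(line_list[1])
contig_ids = line_list[2:]
print(bin_id, contigs_count, contig_ids)   # MGS0001 3 ['contig_12', 'contig_34', 'contig_56']
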
/scripts/filter_pe_fastq_by_size.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import glob
5 | import os
6 |
7 | # see: http://goo.gl/kTQMs
8 | SYMBOLS = {
9 | 'customary' : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
10 | 'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'iotta'),
11 | 'iec' : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
12 | 'iec_ext' : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi'),
13 | }
14 |
15 | def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'):
16 | """
17 | Convert n bytes into a human readable string based on format.
18 | symbols can be either "customary", "customary_ext", "iec" or "iec_ext",
19 | see: http://goo.gl/kTQMs
20 |
21 | >>> bytes2human(0)
22 | '0.0 B'
23 | >>> bytes2human(0.9)
24 | '0.0 B'
25 | >>> bytes2human(1)
26 | '1.0 B'
27 | >>> bytes2human(1.9)
28 | '1.0 B'
29 | >>> bytes2human(1024)
30 | '1.0 K'
31 | >>> bytes2human(1048576)
32 | '1.0 M'
33 | >>> bytes2human(1099511627776127398123789121)
34 | '909.5 Y'
35 |
36 | >>> bytes2human(9856, symbols="customary")
37 | '9.6 K'
38 | >>> bytes2human(9856, symbols="customary_ext")
39 | '9.6 kilo'
40 | >>> bytes2human(9856, symbols="iec")
41 | '9.6 Ki'
42 | >>> bytes2human(9856, symbols="iec_ext")
43 | '9.6 kibi'
44 |
45 | >>> bytes2human(10000, "%(value).1f %(symbol)s/sec")
46 | '9.8 K/sec'
47 |
48 | >>> # precision can be adjusted by playing with %f operator
49 | >>> bytes2human(10000, format="%(value).5f %(symbol)s")
50 | '9.76562 K'
51 | """
52 | n = int(n)
53 | if n < 0:
54 | raise ValueError("n < 0")
55 | symbols = SYMBOLS[symbols]
56 | prefix = {}
57 | for i, s in enumerate(symbols[1:]):
58 | prefix[s] = 1 << (i+1)*10
59 | for symbol in reversed(symbols[1:]):
60 | if n >= prefix[symbol]:
61 | value = float(n) / prefix[symbol]
62 | return format % locals()
63 | return format % dict(symbol=symbols[0], value=n)
64 |
65 | def human2bytes(s):
66 | """
67 | Attempts to guess the string format based on default symbols
68 | set and return the corresponding bytes as an integer.
69 | When unable to recognize the format ValueError is raised.
70 |
71 | >>> human2bytes('0 B')
72 | 0
73 | >>> human2bytes('1 K')
74 | 1024
75 | >>> human2bytes('1 M')
76 | 1048576
77 | >>> human2bytes('1 Gi')
78 | 1073741824
79 | >>> human2bytes('1 tera')
80 | 1099511627776
81 |
82 | >>> human2bytes('0.5kilo')
83 | 512
84 | >>> human2bytes('0.1 byte')
85 | 0
86 | >>> human2bytes('1 k') # k is an alias for K
87 | 1024
88 | >>> human2bytes('12 foo')
89 | Traceback (most recent call last):
90 | ...
91 | ValueError: can't interpret '12 foo'
92 | """
93 | init = s
94 | num = ""
95 | while s and s[0:1].isdigit() or s[0:1] == '.':
96 | num += s[0]
97 | s = s[1:]
98 | num = float(num)
99 | letter = s.strip()
100 | for name, sset in SYMBOLS.items():
101 | if letter in sset:
102 | break
103 | else:
104 | if letter == 'k':
105 | # treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs
106 | sset = SYMBOLS['customary']
107 | letter = letter.upper()
108 | else:
109 | raise ValueError("can't interpret %r" % init)
110 | prefix = {sset[0]:1}
111 | for i, s in enumerate(sset[1:]):
112 | prefix[s] = 1 << (i+1)*10
113 | return int(num * prefix[letter])
114 |
115 |
116 | def main():
117 | FASTQ_SIZE = human2bytes(sys.argv[2])
118 | samples = {}
119 | for i in glob.glob(sys.argv[1].rstrip("/") + "/*.1.fq.gz"):
120 | if os.path.getsize(i) >= FASTQ_SIZE:
121 | sample_id = os.path.basename(i).split(".")[0]
122 | samples[sample_id] = [os.path.abspath(i), os.path.abspath(i.replace("1.fq.gz", "2.fq.gz"))]
123 |
124 | print("id\tfq1\tfq2")
125 | for i in samples:
126 | print("%s\t%s\t%s" % (i, samples[i][0], samples[i][1]))
127 |
128 |
129 | if __name__ == '__main__':
130 | main()
131 |
--------------------------------------------------------------------------------
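
The size threshold given on the command line is parsed by human2bytes() using binary multiples (1 K = 2**10 bytes), and only the *.1.fq.gz file of each pair is compared against it. A small sketch with a hypothetical path:

import os

threshold = 2 * (1 << 30)               # what human2bytes("2G") returns: 2 GiB in bytes
print(threshold)                         # 2147483648

fq1 = "/data/sample.1.fq.gz"             # hypothetical *.1.fq.gz path
if os.path.exists(fq1) and os.path.getsize(fq1) >= threshold:
    print("keep", fq1, "and its mate", fq1.replace("1.fq.gz", "2.fq.gz"))
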
/metapi/profiles/lsf/memory_units.py:
--------------------------------------------------------------------------------
1 | import re
2 | from enum import Enum
3 | from typing import Union
4 | from collections import namedtuple
5 |
6 |
7 | class InvalidSuffix(Exception):
8 | pass
9 |
10 |
11 | class InvalidPower(Exception):
12 | pass
13 |
14 |
15 | class InvalidMemoryString(Exception):
16 | pass
17 |
18 |
19 | Scale = namedtuple("Scale", ["power", "metric_suffix"])
20 |
21 |
22 | SCALE_MAP = {
23 | "B": Scale(0, "B"),
24 | "K": Scale(1, "KB"),
25 | "M": Scale(2, "MB"),
26 | "G": Scale(3, "GB"),
27 | "T": Scale(4, "TB"),
28 | "P": Scale(5, "PB"),
29 | "E": Scale(6, "EB"),
30 | "Z": Scale(7, "ZB"),
31 | }
32 |
33 |
34 | class Unit(Enum):
35 | BYTES = SCALE_MAP["B"]
36 | KILO = SCALE_MAP["K"]
37 | MEGA = SCALE_MAP["M"]
38 | GIGA = SCALE_MAP["G"]
39 | TERA = SCALE_MAP["T"]
40 | PETA = SCALE_MAP["P"]
41 | EXA = SCALE_MAP["E"]
42 | ZETTA = SCALE_MAP["Z"]
43 |
44 | @staticmethod
45 | def from_suffix(suffix: str) -> "Unit":
46 | first_letter = suffix[0].upper()
47 | if first_letter not in SCALE_MAP:
48 | valid_suffixes = ",".join(
49 | scale.metric_suffix for scale in SCALE_MAP.values()
50 | )
51 | raise InvalidSuffix(
52 | "{suffix}. Valid suffixes are: {valid_suffixes}".format(
53 | suffix=suffix, valid_suffixes=valid_suffixes
54 | )
55 | )
56 | return Unit(SCALE_MAP[first_letter])
57 |
58 | @staticmethod
59 | def from_power(power: int) -> "Unit":
60 | valid_powers = []
61 | for scale in SCALE_MAP.values():
62 | if scale.power == power:
63 | return Unit(scale)
64 | valid_powers.append(scale.power)
65 |
66 | raise InvalidPower(
67 | "{power}. Valid powers are: {valid}".format(
68 | power=power, valid=",".join(str(p) for p in valid_powers)
69 | )
70 | )
71 |
72 | @property
73 | def power(self) -> int:
74 | return self.value.power
75 |
76 | @property
77 | def suffix(self) -> str:
78 | return self.value.metric_suffix
79 |
80 |
81 | Number = Union[int, float]
82 |
83 |
84 | class Memory:
85 | def __init__(self, value: Number = 1, unit: Unit = Unit.BYTES):
86 | self.value = value
87 | self.unit = unit
88 | self._decimal_scaling_factor = 1000
89 | self._binary_scaling_factor = 1024
90 |
91 | def __eq__(self, other: "Memory") -> bool:
92 | return self.bytes() == other.bytes()
93 |
94 | def __repr__(self) -> str:
95 | val = (
96 | int(self.value)
97 | if isinstance(self.value, int) or self.value.is_integer()
98 | else self.value
99 | )
100 | return "{val}{sfx}".format(val=val, sfx=self.suffix)
101 |
102 | @property
103 | def power(self) -> int:
104 | return self.unit.power
105 |
106 | @property
107 | def suffix(self) -> str:
108 | return self.unit.suffix
109 |
110 | def _scaling_factor(self, decimal: bool = True) -> int:
111 | return self._decimal_scaling_factor if decimal else self._binary_scaling_factor
112 |
113 | def bytes(self, decimal_multiples: bool = True) -> float:
114 | scaling_factor = self._scaling_factor(decimal_multiples)
115 | return float(self.value * (scaling_factor ** self.power))
116 |
117 | def to(self, unit: Unit, decimal_multiples: bool = True) -> "Memory":
118 | scaling_factor = self._scaling_factor(decimal_multiples) ** unit.power
119 | size = self.bytes(decimal_multiples)
120 | size /= scaling_factor
121 |
122 | return Memory(size, unit)
123 |
124 | @staticmethod
125 | def from_str(s: str) -> "Memory":
126 | valid_suffixes = "".join(scale.metric_suffix for scale in SCALE_MAP.values())
127 | regex = re.compile(
128 | r"^(?P[0-9]*\.?[0-9]+)\s*(?P[{}]B?)?$".format(valid_suffixes),
129 | re.IGNORECASE,
130 | )
131 | match = regex.search(s)
132 |
133 | if not match:
134 | raise InvalidMemoryString("{s} is an invalid memory string.".format(s=s))
135 |
136 | size = float(match.group("size"))
137 | suffix = match.group("sfx") or "B"
138 | unit = Unit.from_suffix(suffix)
139 |
140 | return Memory(size, unit)
141 |
--------------------------------------------------------------------------------
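
A usage sketch for the Memory and Unit classes above, assuming memory_units.py is importable (for example, with metapi/profiles/lsf on sys.path):

from memory_units import Memory, Unit

mem = Memory.from_str("2.5GB")                       # parsed via the (?P<size>...)(?P<sfx>...) regex
print(mem)                                           # 2.5GB
print(mem.bytes())                                   # 2500000000.0 (decimal multiples by default)
print(mem.to(Unit.MEGA))                             # 2500MB
print(mem.to(Unit.KILO, decimal_multiples=False))    # 2621440KB (binary multiples, 1 K = 1024 B)
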
/metapi/rules/binning_report.smk:
--------------------------------------------------------------------------------
1 | if len(BINNERS_CHECKM) != 0:
2 | rule binning_report:
3 | input:
4 | lambda wildcards: get_binning_done(wildcards, [wildcards.binner_checkm])
5 | output:
6 | directory(
7 | os.path.join(
8 | config["output"]["binning"],
9 | "report/{assembler}/{binner_checkm}/{binning_group}.{assembly_group}"))
10 | params:
11 | binning_group = "{binning_group}",
12 | assembly_group = "{assembly_group}",
13 | assembler = "{assembler}",
14 | binner = "{binner_checkm}"
15 | priority:
16 | 35
17 | run:
18 | import glob
19 |
20 | shell('''rm -rf {output}''')
21 | shell('''mkdir -p {output}''')
22 |
23 | bin_list = glob.glob(os.path.dirname(input[0]) + "/*.fa.gz")
24 | header_list = [
25 | "binning_group", "assembly_group", "bin_id", "bin_file", "assembler", "binner",
26 | "chr", "length", "#A", "#C", "#G", "#T",
27 | "#2", "#3", "#4", "#CpG", "#tv", "#ts", "#CpG-ts"]
28 | header_name = "\\t".join(header_list)
29 |
30 | for bin_fa in bin_list:
31 | bin_id = os.path.basename(os.path.splitext(os.path.splitext(bin_fa)[0])[0])
32 | bin_file = os.path.abspath(bin_fa)
33 | header_content = "\\t".join([params.binning_group, params.assembly_group, bin_id, bin_file, params.assembler, params.binner])
34 | stats_file = os.path.join(output[0], bin_id + ".seqtk.comp.tsv.gz")
35 |
36 | shell(
37 | '''
38 | seqtk comp %s | \
39 | awk \
40 | 'BEGIN \
41 | {{print "%s"}}; \
42 | {{print "%s" "\t" $0}}' | \
43 | gzip -c > %s
44 | ''' % (bin_fa, header_name, header_content, stats_file))
45 |
46 |
47 | rule binning_report_merge:
48 | input:
49 | expand(os.path.join(
50 | config["output"]["binning"],
51 | "report/{{assembler}}/{{binner_checkm}}/{binning_group}.{assembly_group}"),
52 | zip,
53 | binning_group=ASSEMBLY_GROUP["binning_group"],
54 | assembly_group=ASSEMBLY_GROUP["assembly_group"])
55 | output:
56 | summary = os.path.join(
57 | config["output"]["binning"],
58 | "report/assembly_stats_{assembler}.{binner_checkm}.tsv.gz")
59 | params:
60 | min_length = config["params"]["assembly"]["report"]["min_length"],
61 | len_ranges = config["params"]["assembly"]["report"]["len_ranges"]
62 | threads:
63 | config["params"]["binning"]["threads"]
64 | run:
65 | import glob
66 | comp_list = []
67 | for i in input:
68 | comp_list += glob.glob(i + "/*.seqtk.comp.tsv.gz")
69 |
70 | if len(comp_list) != 0:
71 | metapi.assembler_init(
72 | params.len_ranges,
73 | ["binning_group", "assembly_group", "bin_id", "bin_file", "assembler", "binner"])
74 | comp_list_ = [(j, params.min_length) for j in comp_list]
75 | metapi.merge(
76 | comp_list_, metapi.parse_assembly,
77 | threads, output=output.summary)
78 | else:
79 | shell('''touch {output.summary}''')
80 |
81 |
82 | rule binning_report_all:
83 | input:
84 | expand(os.path.join(
85 | config["output"]["binning"],
86 | "report/assembly_stats_{assembler}.{binner_checkm}.tsv.gz"),
87 | assembler=ASSEMBLERS,
88 | binner_checkm=BINNERS_CHECKM)
89 |
90 | else:
91 | rule binning_report_all:
92 | input:
93 |
94 |
95 | rule binning_all:
96 | input:
97 | rules.binning_metabat2_all.input,
98 | rules.binning_maxbin2_all.input,
99 | rules.binning_concoct_all.input,
100 | rules.binning_graphbin2_all.input,
101 | rules.binning_vamb_all.input,
102 | rules.binning_semibin_all.input,
103 | rules.binning_dastools_all.input,
104 | rules.binning_report_all.input
105 |
106 |
107 | localrules:
108 | binning_report_all,
109 | binning_all
--------------------------------------------------------------------------------
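
The binning_report rule prepends six metadata columns to every seqtk comp row before compressing it, so the merged table carries the binning metadata followed by the standard seqtk comp columns. A sketch of one decorated row, with made-up values:

meta_cols = ["binning_group", "assembly_group", "bin_id", "bin_file", "assembler", "binner"]
seqtk_cols = ["chr", "length", "#A", "#C", "#G", "#T",
              "#2", "#3", "#4", "#CpG", "#tv", "#ts", "#CpG-ts"]

meta = ["grp1", "asm1", "bin.1", "/path/bin.1.fa.gz", "metaspades", "metabat2"]       # hypothetical
seqtk_row = ["contig_1", "52310", "13000", "13100", "13110", "13100",
             "0", "0", "0", "900", "400", "800", "60"]                                # made up

print("\t".join(meta_cols + seqtk_cols))
print("\t".join(meta + seqtk_row))
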
/scripts/post_assembly_binning.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import os
4 |
5 | import pandas as pd
6 |
7 |
8 | def codegen(samples_tsv, output_dir):
9 | samples = pd.read_csv(samples_tsv, sep='\t').set_index("bin_id", drop=False)
10 |
11 | os.makedirs(output_dir, exist_ok=True)
12 |
13 | index_dir = os.path.join(output_dir, "00.index")
14 | os.makedirs(index_dir, exist_ok=True)
15 |
16 | mapping_dir = os.path.join(output_dir, "01.mapping")
17 | os.makedirs(mapping_dir, exist_ok=True)
18 |
19 | asm_dir = os.path.join(output_dir, "02.assembly")
20 | os.makedirs(asm_dir, exist_ok=True)
21 |
22 | checkm_asm_dir = os.path.join(output_dir, "03.checkm_asm")
23 | os.makedirs(checkm_asm_dir, exist_ok=True)
24 | checkm_asm_input_dir = os.path.join(checkm_asm_dir, "input")
25 | os.makedirs(checkm_asm_input_dir, exist_ok=True)
26 | checkm_asm_output_dir = os.path.join(checkm_asm_dir, "output")
27 | os.makedirs(checkm_asm_output_dir, exist_ok=True)
28 |
29 | with open(os.path.join(output_dir, "step1.index.sh"), 'w') as oh1, \
30 | open(os.path.join(output_dir, "step2.mapping.sh"), 'w') as oh2, \
31 | open(os.path.join(output_dir, "step3.assembly_spades.sh"), 'w') as oh3, \
32 | open(os.path.join(output_dir, "step3.assembly_shovill_spades.sh"), 'w') as oh4, \
33 | open(os.path.join(output_dir, "step3.assembly_shovill_megahit.sh"), 'w') as oh5, \
34 | open(os.path.join(output_dir, "step3.assembly_shovill_velvet.sh"), 'w') as oh6, \
35 | open(os.path.join(output_dir, "step3.assembly_shovill_skesa.sh"), 'w') as oh7, \
36 | open(os.path.join(output_dir, "step4.links_asm_fa.sh"), 'w') as oh8, \
37 | open(os.path.join(output_dir, "step5.checkm_lineage_wf.sh"), 'w') as oh9:
38 |
39 | for bin_id in samples.index:
40 | # index
41 | prefix = os.path.join(index_dir, bin_id)
42 | cmd = "bwa index %s -p %s\n" % (samples.loc[bin_id, "bins_fna_path"], prefix)
43 | oh1.write(cmd)
44 |
45 | # mapping and extract reads
46 | r1 = os.path.join(mapping_dir, "%s.r1.fq.gz" % bin_id)
47 | r2 = os.path.join(mapping_dir, "%s.r2.fq.gz" % bin_id)
48 | stat = os.path.join(mapping_dir, "%s-flagstat.txt" % bin_id)
49 | cmd = "bwa mem -t 8 %s %s %s | tee >(samtools flagstat -@8 - > %s) | samtools fastq -@8 -F 12 -n -1 %s -2 %s -\n" % (
50 | prefix, samples.loc[bin_id, "fq1"], samples.loc[bin_id, "fq2"], stat, r1, r2)
51 | oh2.write(cmd)
52 |
53 | # assembly
54 | ## spades
55 | bins_asm_dir = os.path.join(asm_dir, bin_id + ".spades_out")
56 | cmd = "spades.py -1 %s -2 %s -k 21,29,39,59,79,99 --threads 8 -o %s\n" % (r1, r2, bins_asm_dir)
57 | oh3.write(cmd)
58 | asm_fa = os.path.join(bins_asm_dir, "scaffolds.fasta")
59 | asm_fa_ = os.path.join(checkm_asm_input_dir, bin_id + ".spades.fa")
60 | oh8.write("ln -s %s %s\n" % (asm_fa, asm_fa_))
61 |
62 | ## shovill
63 | ### spades or megahit or velvet or skesa
64 | for assembler, file_handle in zip(["spades", "megahit", "velvet", "skesa"], [oh4, oh5, oh6, oh7]):
65 | bins_asm_dir = os.path.join(asm_dir, bin_id + ".shovill_%s_out" % assembler)
66 | cmd = "shovill --cpus 8 --ram 20 --keepfiles --assembler %s --outdir %s --R1 %s --R2 %s\n" % (assembler, bins_asm_dir, r1, r2)
67 | file_handle.write(cmd)
68 | asm_fa = os.path.join(bins_asm_dir, "contigs.fa")
69 | asm_fa_ = os.path.join(checkm_asm_input_dir, bin_id + "." + assembler + ".fa")
70 | oh8.write("ln -s %s %s\n" % (asm_fa, asm_fa_))
71 |
72 | checkm_asm_out = os.path.join(checkm_asm_dir, "checkm_asm.txt")
73 | checkm_asm_log = os.path.join(checkm_asm_dir, "checkm_asm.log")
74 | checkm_asm_cmd = "checkm lineage_wf -f %s -t 8 --pplacer_threads 8 -x fa %s/ %s/ 2>%s\n" % \
75 | (checkm_asm_out, checkm_asm_input_dir, checkm_asm_output_dir, checkm_asm_log)
76 | oh9.write(checkm_asm_cmd)
77 |
78 |
79 | def main():
80 | parser = argparse.ArgumentParser(description='generate shell scripts to re-assemble metagenomic bins from their mapped reads')
81 | parser.add_argument('-s', '--samples', type=str, help='metagenomics bins and paired reads list')
82 | parser.add_argument('-o', '--outdir', type=str, help='output directory')
83 | args = parser.parse_args()
84 | codegen(args.samples, args.outdir)
85 |
86 |
87 | if __name__ == "__main__":
88 | main()
89 |
--------------------------------------------------------------------------------
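
codegen() only needs a tab-separated samples table with bin_id, bins_fna_path, fq1 and fq2 columns; it then writes step1..step5 shell scripts to be run in order. A sketch that builds such a table (all paths hypothetical):

import pandas as pd

samples = pd.DataFrame({
    "bin_id": ["bin.1", "bin.2"],
    "bins_fna_path": ["/bins/bin.1.fa", "/bins/bin.2.fa"],
    "fq1": ["/reads/s1.1.fq.gz", "/reads/s1.1.fq.gz"],
    "fq2": ["/reads/s1.2.fq.gz", "/reads/s1.2.fq.gz"],
})
samples.to_csv("reassembly_samples.tsv", sep="\t", index=False)
# then: python post_assembly_binning.py -s reassembly_samples.tsv -o reassembly_out
# and run the generated step1.index.sh ... step5.checkm_lineage_wf.sh in order
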
/scripts/estimate_T2T_data_size.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import pandas as pd
4 | import requests
5 | import xmltodict
6 | import argparse
7 | from rich import print
8 | from rich.console import Console
9 |
10 | # https://github.com/Textualize/rich/issues/67
11 | _console = Console()
12 |
13 | class RichArgumentParser(argparse.ArgumentParser):
14 | def _print_message(self, message, file=None):
15 | _console.print(message)
16 |
17 | def add_argument_group(self, *args, **kwargs):
18 | group = super().add_argument_group(*args, **kwargs)
19 | group.title = f"[cyan]{group.title.title()}[/cyan]"
20 | return group
21 |
22 |
23 | class RichRawTextHelpFormatter(argparse.RawTextHelpFormatter):
24 | def _split_lines(self, text, width):
25 | return [f"[yellow]{line}[/yellow]" for line in text.splitlines()]
26 |
27 |
28 | # see: http://goo.gl/kTQMs
29 | SYMBOLS = {
30 | 'customary' : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
31 | 'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'iotta'),
32 | 'iec' : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
33 | 'iec_ext' : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi'),
34 | }
35 |
36 |
37 | def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'):
38 | n = int(n)
39 | if n < 0:
40 | raise ValueError("n < 0")
41 | symbols = SYMBOLS[symbols]
42 | prefix = {}
43 | for i, s in enumerate(symbols[1:]):
44 | prefix[s] = 1 << (i+1)*10
45 | for symbol in reversed(symbols[1:]):
46 | if n >= prefix[symbol]:
47 | value = float(n) / prefix[symbol]
48 | return format % locals()
49 | return format % dict(symbol=symbols[0], value=n)
50 |
51 |
52 | def human2bytes(s):
53 | init = s
54 | num = ""
55 | while s and s[0:1].isdigit() or s[0:1] == '.':
56 | num += s[0]
57 | s = s[1:]
58 | if num != "":
59 | num = float(num)
60 | else:
61 | raise ValueError(f"can't covert {s} to float")
62 | letter = s.strip()
63 | #print(letter)
64 | for name, sset in SYMBOLS.items():
65 | if letter in sset:
66 | break
67 | else:
68 | if (letter == 'k') or (letter == "m") or (letter == "g"):
69 | # treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs
70 | sset = SYMBOLS['customary']
71 | letter = letter.upper()
72 | else:
73 | raise ValueError("can't interpret %r" % init)
74 | prefix = {sset[0]:1}
75 | for i, s in enumerate(sset[1:]):
76 | prefix[s] = 1 << (i+1)*10
77 | return int(num * prefix[letter])
78 |
79 |
80 | def generate_xml(http_link):
81 | print(f'''Parsing: {http_link}\n''')
82 | r = requests.get(http_link)
83 | if "xml" in r.headers['content-type']:
84 | print(f'''Success: get XML document from the link: {http_link}\n''')
85 | return r.text
86 | else:
87 | print(f'''Error: can't get XML document from the link: {http_link}\nExiting\n''')
88 | return None
89 |
90 |
91 | def estimate_size(xml_str, output=None):
92 | xml_dict = xmltodict.parse(xml_str)
93 | if "ListBucketResult" in xml_dict:
94 | file_info_df = pd.DataFrame(xml_dict["ListBucketResult"]["Contents"])\
95 | .astype({"Size": int})\
96 | .sort_values(["Size", "Key"])
97 | print(file_info_df)
98 |
99 | if output is not None:
100 | file_info_df.to_csv(output, sep="\t", index=False)
101 |
102 | total_size = sum(file_info_df["Size"])
103 | total_size_human = bytes2human(total_size)
104 | print(f'''\nTotal file size is: {total_size}''')
105 | print(f'''\nTotal file size is: {total_size_human}''')
106 | else:
107 | print("\nError: parse XML document error\nExiting")
108 |
109 |
110 | def main():
111 | parser = RichArgumentParser("Estimate T2T data size")
112 | parser.add_argument("--http-link", dest="http_link",
113 | default="https://s3-us-west-2.amazonaws.com/human-pangenomics?/delimiter=/&prefix=T2T",
114 | help="T2T file/directory S3 link, default:\nhttps://s3-us-west-2.amazonaws.com/human-pangenomics?/delimiter=/&prefix=T2T")
115 | parser.add_argument("--output", dest="output", default=None,
116 | help="a tsv file contains file information, default=None")
117 | args = parser.parse_args()
118 |
119 | xml_str = generate_xml(args.http_link)
120 | estimate_size(xml_str, args.output)
121 |
122 |
123 | if __name__ == "__main__":
124 | main()
125 |
--------------------------------------------------------------------------------
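
estimate_size() walks the Contents entries of an S3 ListBucketResult document. A self-contained sketch with a tiny, made-up XML payload:

import xmltodict
import pandas as pd

xml_str = """<ListBucketResult>
  <Contents><Key>T2T/a.fastq.gz</Key><Size>1048576</Size></Contents>
  <Contents><Key>T2T/b.fastq.gz</Key><Size>2097152</Size></Contents>
</ListBucketResult>"""

contents = xmltodict.parse(xml_str)["ListBucketResult"]["Contents"]
file_info_df = pd.DataFrame(contents).astype({"Size": int}).sort_values(["Size", "Key"])
print(file_info_df)
print("total bytes:", file_info_df["Size"].sum())    # 3145728, i.e. 3.0 M via bytes2human
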
/scripts/mapping_statistics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import csv
4 | import fileinput
5 | import os
6 | import re
7 | from decimal import *
8 |
9 |
10 | """
11 | How to assess the quality of metagenomcis assembly
12 | https://www.biostars.org/p/128629/#128639
13 |
14 | Brian Bushnell said:
15 | calculate the percentage of reads that map back to the assembly
16 | if only 50% of your reads map to the assembly, it is not very complete
17 | but if 95% of your reads map to the assembly, then even if it is
18 | somewhat fragmented, that's probably very good
19 |
20 | It might also help to look at the percentage of properly paired reads to
21 | detect any chimeras, something that seems especially relevant in a
22 | metagenome assembly
23 |
24 | one of the most useful tools is QUAST, a quality assessment tool for
25 | genome assemblies; this script computes the mapping rate from samtools flagstat output
26 |
27 | http://genomebio.org/alignment-stats-bwa/
28 | getting alignment stats out of bwa
29 |
30 | bwa mem -t 6 ref read.1.fq read.2.fq \
31 | | samtools view -@6 -Sbh - \
32 | | tee >(samtools flagstat - > stats.out) > aln.bam
33 |
34 | http://www.pnas.org/content/pnas/113/42/11901.full.pdf
35 | Deep sequencing of 10,000 human genomes(Amalio Telenti and J.Craig Venter)
36 | sequencing depth = read length × number of mapped reads / reference sequence length
37 |
38 | metabat_coverage
39 | concoct_coverage
40 | checkm_coverage
41 | """
42 |
43 |
44 | def mapping_rate(flagstats, out_file, method):
45 | """
46 | get alignment rate from sorted bam file
47 | samtools flagstat --threads 8 sample.sort.bam
48 | """
49 | headers = [
50 | 'sample_id', 'total_num', 'read_1_num', 'read_2_num', 'mapping_type',
51 | 'mapped_num', 'mapped_rate', 'paired_num', 'paired_rate',
52 | 'singletons_num', 'singletons_rate', 'mate_mapped_num',
53 | 'mate_mapped_num_mapQge5'
54 | ]
55 | mapping_info = []
56 | getcontext().prec = 8
57 |
58 | # with open(flagstat_list, 'r') as list_handle:
59 | if method == 1:
60 | list_handle = open(flagstats, 'r')
61 | if method == 2:
62 | list_handle = flagstats
63 |
64 | for flagstat_file in list_handle:
65 | info = {}
66 | info['sample_id'] = os.path.basename(
67 | flagstat_file.strip()).split('.')[0]
68 | stat_list = open(flagstat_file.strip(), 'r').readlines()
69 | info['total_num'] = stat_list[0].split(' ')[0]
70 | info['read_1_num'] = stat_list[6].split(' ')[0]
71 | info['read_2_num'] = stat_list[7].split(' ')[0]
72 |
73 | mapped = re.split(r'\(|\s+', stat_list[4])
74 | info['mapped_num'] = mapped[0]
75 | info['mapped_rate'] = Decimal(mapped[5].rstrip('%')) / Decimal(100)
76 |
77 | paired = re.split(r'\(|\s+', stat_list[8])
78 | info['paired_num'] = paired[0]
79 | paired_rate = paired[6].rstrip('%')
80 | if paired_rate != "N/A":
81 | info['paired_rate'] = Decimal(paired_rate) / Decimal(100)
82 | info['mapping_type'] = "paired-end"
83 | else:
84 | info['paired_rate'] = paired_rate
85 | info["mapping_type"] = "single-end"
86 |
87 | singletons = re.split(r'\(|\s+', stat_list[-3])
88 | info['singletons_num'] = singletons[0]
89 | singletons_rate = singletons[5].rstrip('%')
90 | if singletons_rate != "N/A":
91 | info['singletons_rate'] = Decimal(singletons_rate) / Decimal(100)
92 | else:
93 | info['singletons_rate'] = singletons_rate
94 |
95 | info['mate_mapped_num'] = re.split(r'\(|\s+', stat_list[-2])[0]
96 | info['mate_mapped_num_mapQge5'] = re.split(r'\(|\s+', stat_list[-1])[0]
97 | mapping_info.append(info)
98 |
99 | with open(out_file, 'w') as out_handle:
100 | f_tsv = csv.DictWriter(out_handle, headers, delimiter='\t')
101 | f_tsv.writeheader()
102 | f_tsv.writerows(mapping_info)
103 |
104 |
105 | def main():
106 | """main funciton"""
107 | parser = argparse.ArgumentParser(
108 | description='compute alignment rate based bam file')
109 | parser.add_argument(
110 | '-statlist', default=None, type=str, help='file listing samtools flagstat outputs')
111 | parser.add_argument(
112 | '-statfiles', default=None, nargs='*', help='samtools flagstat output files')
113 | parser.add_argument(
114 | '-outfile', type=str, help='output alignment rate file')
115 | args = parser.parse_args()
116 | if args.statlist:
117 | method = 1
118 | mapping_rate(args.statlist, args.outfile, method)
119 | if args.statfiles:
120 | method = 2
121 | mapping_rate(args.statfiles, args.outfile, method)
122 |
123 |
124 | if __name__ == '__main__':
125 | main()
126 |
--------------------------------------------------------------------------------
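
mapping_rate() splits each samtools flagstat line on whitespace and "(" characters; the empty token produced just before the parenthesis is why the percentage lands at index 5. A small sketch with a made-up "mapped" line:

import re
from decimal import Decimal

line = "9986730 + 0 mapped (99.18% : N/A)"
fields = re.split(r"\(|\s+", line)
print(fields)                    # ['9986730', '+', '0', 'mapped', '', '99.18%', ':', 'N/A)']

mapped_num = fields[0]
mapped_rate = Decimal(fields[5].rstrip("%")) / Decimal(100)
print(mapped_num, mapped_rate)   # 9986730 0.9918
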
/metapi/checkmer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import argparse
5 | import concurrent.futures
6 | import subprocess
7 | import pandas as pd
8 | import numpy as np
9 | from natsort import index_natsorted
10 |
11 |
12 | def checkm_prepare(gene_table, batch_num, mags_dir):
13 | os.makedirs(mags_dir, exist_ok=True)
14 |
15 | table_df = pd.read_csv(gene_table, sep="\t")
16 | table_df = table_df.sort_values(
17 | by="bin_id",
18 | key=lambda x: np.argsort(
19 | index_natsorted(table_df["bin_id"]))).reset_index(drop=True)
20 |
21 | batchid = -1
22 | if len(table_df) > 0:
23 | for batch in range(0, len(table_df), batch_num):
24 | batchid += 1
25 | table_split = table_df.iloc[batch:batch+batch_num, ]
26 | table_split.to_csv(
27 | os.path.join(mags_dir, f"mags_input.{batchid}.tsv"),
28 | sep="\t", index=False, header=None)
29 | else:
30 | subprocess.run(f'''touch {os.path.join(mags_dir, "mags_input.0.tsv")}''', shell=True)
31 |
32 |
33 | def MIMAG_quality_level(row):
34 | """
35 | https://doi.org/10.1038/nbt.3893
36 | """
37 | if (row["completeness"] > 90.0) and (row["contamination"] < 5.0):
38 | return "high_quality"
39 | elif (row["completeness"] > 50.0) and (row["contamination"] < 10.0):
40 | return "medium_quality"
41 | else:
42 | return "low_quality"
43 |
44 |
45 | def SGB_quality_level(row):
46 | """
47 | https://doi.org/10.1016/j.cell.2019.01.001
48 | """
49 | if (
50 | (row["strain_heterogeneity"] < 0.5)
51 | and (row["completeness"] > 90.0)
52 | and (row["contamination"] < 5.0)
53 | ):
54 | return "high_quality"
55 | elif (row["completeness"] > 50.0) and (row["contamination"] < 5.0):
56 | return "medium_quality"
57 | else:
58 | return "low_quality"
59 |
60 |
61 | def quality_score(row):
62 | """
63 | https://doi.org/10.1038/s41586-019-0965-1
64 | """
65 | return row["completeness"] - 5 * row["contamination"]
66 |
67 |
68 | def parse_checkm_table(checkm_table):
69 | if os.path.getsize(checkm_table) > 0:
70 | checkm_df = pd.read_csv(checkm_table, sep="\t")
71 | return checkm_df
72 | else:
73 | return None
74 |
75 |
76 | def checkm_reporter(checkm_list, output, threads):
77 | df_list = []
78 | with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor:
79 | for df in executor.map(parse_checkm_table, checkm_list):
80 | if df is not None:
81 | df_list.append(df)
82 |
83 | df_ = pd.DataFrame(
84 | columns=[
85 | "bin_id",
86 | "marker_lineage",
87 | "genomes",
88 | "markers",
89 | "marker_sets",
90 | "completeness",
91 | "contamination",
92 | "strain_heterogeneity",
93 | "MIMAG_quality_level",
94 | "SGB_quality_level",
95 | "quality_score"])
96 |
97 | if len(df_list) >= 1:
98 | df_ = pd.concat(df_list).rename(
99 | columns={
100 | "Bin Id": "bin_id",
101 | "Marker lineage": "marker_lineage",
102 | "# genomes": "genomes",
103 | "# markers": "markers",
104 | "# marker sets": "marker_sets",
105 | "Completeness": "completeness",
106 | "Contamination": "contamination",
107 | "Strain heterogeneity": "strain_heterogeneity",
108 | }
109 | )
110 |
111 | if not df_.empty:
112 | df_["MIMAG_quality_level"] = df_.apply(
113 | lambda x: MIMAG_quality_level(x), axis=1)
114 | df_["SGB_quality_level"] = df_.apply(
115 | lambda x: SGB_quality_level(x), axis=1)
116 | df_["quality_score"] = df_.apply(lambda x: quality_score(x), axis=1)
117 |
118 | if output is not None:
119 | df_.to_csv(output, sep="\t", index=False)
120 |
121 | return df_
122 |
123 |
124 | def main():
125 | parser = argparse.ArgumentParser("CheckM reporter")
126 | parser.add_argument("--checkm_list", type=str, help="checkm out list")
127 | parser.add_argument("--output", type=str, required=True,
128 | help="checkm output file")
129 | parser.add_argument(
130 | "--threads", type=int, default=8, help="threads used on combine CheckM output"
131 | )
132 | args = parser.parse_args()
133 |
134 | checkm_list = [l.strip() for l in open(args.checkm_list, "r")]
135 | checkm_reporter(checkm_list, args.output, args.threads)
136 |
137 |
138 | if __name__ == "__main__":
139 | main()
140 |
--------------------------------------------------------------------------------
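
A toy illustration of the MIMAG thresholds and quality score applied by checkm_reporter(), using made-up completeness and contamination values:

import pandas as pd

df = pd.DataFrame({
    "bin_id": ["bin.1", "bin.2"],
    "completeness": [96.5, 72.0],
    "contamination": [1.2, 8.0],
})

def mimag(row):
    # same thresholds as MIMAG_quality_level above
    if row["completeness"] > 90.0 and row["contamination"] < 5.0:
        return "high_quality"
    elif row["completeness"] > 50.0 and row["contamination"] < 10.0:
        return "medium_quality"
    return "low_quality"

df["MIMAG_quality_level"] = df.apply(mimag, axis=1)
df["quality_score"] = df["completeness"] - 5 * df["contamination"]
print(df)   # bin.1 -> high_quality, 90.5; bin.2 -> medium_quality, 32.0
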
/scripts/t2d_abundance_merger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import pandas as pd
4 | import concurrent.futures
5 | import os
6 | import sys
7 | import argparse
8 |
9 | def global_init(index_metadata):
10 | global INDEX_METADATA__
11 | INDEX_METADATA__ = pd.read_csv(index_metadata, sep='\t')
12 |
13 |
14 | def get_mgs_id(row):
15 | return "_".join(row["ID"].split("_")[0:-1])
16 |
17 |
18 | def get_abun_df_hsx(abun_file):
19 | sample_id = os.path.basename(abun_file).split(".")[0]
20 |
21 | try:
22 | if os.path.exists(abun_file):
23 | abun = pd.read_csv(abun_file, sep='\t')
24 | else:
25 | print("%s is not exists" % abun_file)
26 | return None, None
27 | except pd.errors.EmptyDataError:
28 | print("%s is empty" % abun_file)
29 | return None, None
30 |
31 | abun["mgs_id"] = abun.apply(get_mgs_id, axis=1)
32 |
33 | count_df = abun.loc[:, ["mgs_id", "reads_pairs"]]\
34 | .groupby("mgs_id")\
35 | .agg({"reads_pairs": 'sum'})\
36 | .rename(columns={"reads_pairs": sample_id})
37 | abun_df = abun.loc[:, ["mgs_id", "gene_abundance"]]\
38 | .groupby("mgs_id")\
39 | .agg({"gene_abundance": 'sum'})\
40 | .rename(columns={"gene_abundance": sample_id})
41 | return count_df, abun_df
42 |
43 |
44 | def get_abun_df_jgi(depth_file):
45 | sample_id = os.path.basename(depth_file).split(".")[0]
46 |
47 | try:
48 | if os.path.exists(depth_file):
49 | depth = pd.read_csv(depth_file, sep='\t')
50 | else:
51 | print("%s is not exists" % depth_file)
52 | return None, None
53 | except pd.errors.EmptyDataError:
54 | print("%s is empty" % depth_file)
55 | return None, None
56 |
57 | depth = depth.rename(columns={"contigName": "contig_name"})\
58 | .merge(INDEX_METADATA__)\
59 | .groupby("mgs_id")\
60 | .agg({"totalAvgDepth": "mean"})
61 | depth[sample_id] = depth["totalAvgDepth"] / sum(depth["totalAvgDepth"])
62 | depth_df = depth.loc[:, ["totalAvgDepth"]].rename(columns={"totalAvgDepth": sample_id})
63 | abun_df = depth.loc[:, [sample_id]]
64 | return depth_df, abun_df
65 |
66 |
67 | def get_all_abun_df(abun_files, workers, func):
68 | count_list = []
69 | abun_list = []
70 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
71 | for count_df, abun_df in executor.map(func, abun_files):
72 | if (not count_df is None) and (not abun_df is None):
73 | count_list.append(count_df)
74 | abun_list.append(abun_df)
75 |
76 | count_df_ = pd.concat(count_list, axis=1).reset_index()
77 | abun_df_ = pd.concat(abun_list, axis=1).reset_index()
78 |
79 | return count_df_, abun_df_
80 |
81 |
82 | def main():
83 | parser = argparse.ArgumentParser('merge many samples abundance file to one profile')
84 | parser.add_argument(
85 | '-l',
86 | '--abundance_list',
87 | type=str,
88 | help='abundance list')
89 | parser.add_argument(
90 | '--method',
91 | default="hsx",
92 | choices=["hsx", "jgi"],
93 | help='compute method'
94 | )
95 | parser.add_argument(
96 | '--database',
97 | default=None,
98 | help='contig and genome relationships'
99 | )
100 | parser.add_argument(
101 | '--threads',
102 | default=8,
103 | type=int,
104 | help='threads'
105 | )
106 | parser.add_argument(
107 | '--out_count_profile',
108 | type=str,
109 | help='output count profile')
110 | parser.add_argument(
111 | '--out_abundance_profile',
112 | type=str,
113 | help='output abundance profile')
114 | args = parser.parse_args()
115 |
116 | abun_files = pd.read_csv(args.abundance_list, names=["path"])\
117 | .loc[:, "path"].values
118 |
119 | if args.method == "jgi" and args.database is None:
120 | print("pleas supply database when parse jgi depth file")
121 | sys.exit(1)
122 |
123 | if args.method == "hsx":
124 | count_df, abun_df = get_all_abun_df(abun_files, args.threads, get_abun_df_hsx)
125 | elif args.method == "jgi":
126 | global_init(args.database)
127 | count_df, abun_df = get_all_abun_df(abun_files, args.threads, get_abun_df_jgi)
128 | else:
129 | print("unsupport method: %s" % args.method)
130 |
131 | count_df.to_csv(args.out_count_profile, sep='\t', index=False)
132 | abun_df.to_csv(args.out_abundance_profile, sep='\t', index=False)
133 |
134 |
135 | if __name__ == '__main__':
136 | main()
137 |
--------------------------------------------------------------------------------
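
In "hsx" mode the gene IDs are collapsed to an MGS id by dropping the last underscore-separated field, then read pairs and gene abundances are summed per MGS. A toy sketch with made-up values:

import pandas as pd

abun = pd.DataFrame({
    "ID": ["MGS_0001_1", "MGS_0001_2", "MGS_0002_1"],
    "reads_pairs": [10, 15, 4],
    "gene_abundance": [0.25, 0.25, 0.1],
})
abun["mgs_id"] = abun["ID"].map(lambda x: "_".join(x.split("_")[:-1]))

count_df = abun.groupby("mgs_id").agg({"reads_pairs": "sum"})     # MGS_0001 -> 25, MGS_0002 -> 4
abun_df = abun.groupby("mgs_id").agg({"gene_abundance": "sum"})   # MGS_0001 -> 0.5, MGS_0002 -> 0.1
print(count_df)
print(abun_df)
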