├── setup.cfg ├── tests ├── test_bash │ ├── test.sh │ └── Snakefile ├── test_executor │ ├── test.sh │ ├── test.py │ └── Snakefile └── test_spades │ ├── test.sh │ ├── metapi.yaml │ └── Snakefile ├── docs ├── mag_workflow.png └── metapi.dio ├── metapi ├── profiles │ ├── lsf │ │ ├── lsf_jobscript.sh │ │ ├── config.yaml │ │ ├── CookieCutter.py │ │ ├── lsf_config.py │ │ ├── OSLayer.py │ │ └── memory_units.py │ ├── slurm │ │ ├── slurm-jobscript.sh │ │ ├── settings.json │ │ ├── config.yaml │ │ ├── CookieCutter.py │ │ ├── slurm-status.py │ │ └── slurm-submit.py │ ├── pbs-torque │ │ ├── pbs-jobscript.sh │ │ ├── config.yaml │ │ └── pbs-status.py │ ├── sge │ │ ├── sge-jobscript.sh │ │ ├── config.yaml │ │ └── sge-status.py │ └── generic │ │ ├── config.yaml │ │ ├── lsf_status.py │ │ ├── pbs_status.py │ │ ├── key_mapping.yaml │ │ ├── cluster_config.yaml │ │ ├── slurm_status.py │ │ └── scheduler.py ├── __about__.py ├── envs │ ├── drep.yaml │ ├── kmcp.yaml │ ├── blast.yaml │ ├── cdhit.yaml │ ├── checkv.yaml │ ├── fastqc.yaml │ ├── multiqc.yaml │ ├── kneaddata.yaml │ ├── plass.yaml │ ├── quast.yaml │ ├── virsorter2.yaml │ ├── metabat2.yaml │ ├── taxonkit.yaml │ ├── gtdbtk.yaml │ ├── idba.yaml │ ├── simulate.yaml │ ├── spades.yaml │ ├── dastools.yaml │ ├── galah.yaml │ ├── predict.yaml │ ├── maxbin2.yaml │ ├── megahit.yaml │ ├── report.yaml │ ├── checkm.yaml │ ├── trimming.yaml │ ├── krakenuniq.yaml │ ├── raw.yaml │ ├── kraken2.yaml │ ├── phamb.yaml │ ├── align.yaml │ ├── concoct.yaml │ ├── deepvirfinder.yaml │ ├── vamb.yaml │ └── semibin.yaml ├── wrappers │ ├── concoct_postprocess.py │ ├── maxbin2_postprocess.py │ ├── dastools_postprocess.py │ ├── hmmsearch_wrapper.py │ ├── vamb │ │ ├── write_abundances.py │ │ ├── abundances_mask.py │ │ ├── create_abundances.py │ │ └── concatenate.py │ ├── misc.py │ ├── prokka_wrapper.py │ ├── simulate_reads.py │ ├── prodigal_wrapper.py │ └── gtdbtk_postprocess.py ├── snakefiles │ ├── simulate_wf.smk │ └── gene_wf.smk ├── visualization │ └── dada2_stats_barplot.R ├── tooler.py ├── rules │ ├── qcreport.smk │ ├── simulate.smk │ ├── upload.smk │ └── binning_report.smk ├── predictor.py ├── __init__.py ├── simulator.py └── checkmer.py ├── requirements.txt ├── run_metapi.py ├── environment.yml ├── scripts ├── perl_test.pl ├── cout_seq_by_line.py ├── samples_validator.py ├── job.py ├── find_ATG.pl ├── contigs_filter_by_len.py ├── parse_mgs_profile.py ├── merge_sig_csv.py ├── get_bins_id.py ├── print_reads_length.py ├── batch_prokka.py ├── checkm_link.py ├── merge_fasta_by_len.py ├── filter_pe_fastq_by_len.py ├── fasta_length_tab.py ├── kraken2_reads_merger.py ├── animf_cluster.py ├── find_path.py ├── taxonomy_info_covert.py ├── get_prodigal_gbk_result.py ├── asm_status_wrapper.py ├── cut_up_fasta_concoct.py ├── aggregate_genomecov.py ├── contigs_from_sample.py ├── fastq_contig_size.py ├── split_fx.py ├── megahit_hadoop.sh ├── get_bin_id_by_ccsh.py ├── insert_size_ploter.py ├── megahit_sge.py ├── split_mummer.py ├── assembly_info.r ├── metapi_config_update.py ├── merge_checkm_out.py ├── clstr_szie_tab.py ├── rename_fasta_id.py ├── asub.py ├── kraken2_demultiplex_summary.py ├── qc_report.py ├── clean_statout_to_matrix.py ├── extract_bins_from_mgs_profile.py ├── filter_pe_fastq_by_size.py ├── post_assembly_binning.py ├── estimate_T2T_data_size.py ├── mapping_statistics.py └── t2d_abundance_merger.py ├── MANIFEST.in ├── .gitignore ├── .circleci └── config.yml └── setup.py /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | 
description-file = README.md 3 | -------------------------------------------------------------------------------- /tests/test_bash/test.sh: -------------------------------------------------------------------------------- 1 | snakemake --snakefile Snakefile -c 1 --until all 2 | -------------------------------------------------------------------------------- /tests/test_executor/test.sh: -------------------------------------------------------------------------------- 1 | 2 | snakemake --snakefile Snakefile -c 1 --until all -------------------------------------------------------------------------------- /docs/mag_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ohmeta/metapi/HEAD/docs/mag_workflow.png -------------------------------------------------------------------------------- /metapi/profiles/lsf/lsf_jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # properties = {properties} 3 | {exec_job} -------------------------------------------------------------------------------- /tests/test_spades/test.sh: -------------------------------------------------------------------------------- 1 | snakemake --snakefile Snakefile -c 1 --until all --use-conda 2 | -------------------------------------------------------------------------------- /metapi/profiles/slurm/slurm-jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # properties = {properties} 3 | {exec_job} 4 | -------------------------------------------------------------------------------- /metapi/profiles/pbs-torque/pbs-jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # properties = {properties} 3 | {exec_job} 4 | -------------------------------------------------------------------------------- /metapi/__about__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __version__ = '3.0.0' 4 | __author__ = "Jie Zhu, Fangming Yang, Ye Peng" 5 | -------------------------------------------------------------------------------- /metapi/envs/drep.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - drep=3.5.0 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/kmcp.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - kmcp=0.9.4 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /tests/test_spades/metapi.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - metapi=2.3.0 7 | -------------------------------------------------------------------------------- /metapi/envs/blast.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - blast=2.15.0 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/cdhit.yaml: -------------------------------------------------------------------------------- 1 
| channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - cd-hit=4.8.1 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/checkv.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - checkv=1.0.1 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/fastqc.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - fastqc=0.12.1 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/multiqc.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - multiqc=1.21 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/kneaddata.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - kneaddata=0.12.0 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/plass.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - plass=4.687d7 7 | - pigz 8 | - jq 9 | -------------------------------------------------------------------------------- /metapi/envs/quast.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - quast=5.2.0 7 | - pigz 8 | - jq 9 | -------------------------------------------------------------------------------- /metapi/envs/virsorter2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - virsorter=2.2.4 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/profiles/sge/sge-jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # properties = {properties} 3 | 4 | # exit on first error 5 | set -o errexit 6 | 7 | {exec_job} 8 | -------------------------------------------------------------------------------- /metapi/envs/metabat2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - metabat2=2.15 7 | - pigz 8 | - jq 9 | -------------------------------------------------------------------------------- /metapi/envs/taxonkit.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - taxonkit 7 | - csvtk 8 | - pigz 9 | - jq -------------------------------------------------------------------------------- /metapi/envs/gtdbtk.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - gtdbtk=2.3.2 7 | - 
pandas 8 | - pigz 9 | - jq -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | ruamel.yaml 4 | snakemake >=7.0 5 | openpyxl 6 | natsort 7 | biopython >=1.73 8 | seaborn 9 | matplotlib 10 | executor -------------------------------------------------------------------------------- /tests/test_executor/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from executor import execute 5 | 6 | output = str(snakemake.output) 7 | execute(f'''touch {output}''') -------------------------------------------------------------------------------- /metapi/envs/idba.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - idba=1.1.3 7 | - seqtk 8 | - pigz 9 | - jq 10 | -------------------------------------------------------------------------------- /metapi/envs/simulate.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - insilicoseq=2.0.1 7 | - pandas 8 | - biopython 9 | - pigz -------------------------------------------------------------------------------- /metapi/envs/spades.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - spades=3.15.5 7 | - pigz 8 | - fd-find 9 | - jq 10 | -------------------------------------------------------------------------------- /metapi/envs/dastools.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - python 7 | - das_tool=1.1.7 8 | - pigz 9 | - jq 10 | -------------------------------------------------------------------------------- /metapi/profiles/slurm/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "SBATCH_DEFAULTS": "", 3 | "CLUSTER_NAME": "", 4 | "CLUSTER_CONFIG": "cluster.yaml", 5 | "ADVANCED_ARGUMENT_CONVERSION": "no" 6 | } -------------------------------------------------------------------------------- /metapi/envs/galah.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - galah=0.4.0 7 | - dashing 8 | - fastani 9 | - pigz 10 | - jq -------------------------------------------------------------------------------- /metapi/envs/predict.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - prodigal=2.6.3 7 | - prokka=1.14.6 8 | - pigz 9 | - jq 10 | -------------------------------------------------------------------------------- /run_metapi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | 6 | sys.path.insert(0, os.path.dirname(__file__)) 7 | 8 | from metapi.corer import main 9 | main() 10 | -------------------------------------------------------------------------------- /metapi/envs/maxbin2.yaml: -------------------------------------------------------------------------------- 1 
| channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - maxbin2=2.2.7 7 | - fraggenescan=1.31 8 | - pigz 9 | - jq 10 | -------------------------------------------------------------------------------- /metapi/envs/megahit.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - megahit=1.2.9 7 | - gfa1 8 | - pigz 9 | - fd-find 10 | - jq 11 | -------------------------------------------------------------------------------- /metapi/envs/report.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - seqtk=1.4 7 | - seqkit=2.8.0 8 | - bioawk=1.0 9 | - pigz 10 | - jq -------------------------------------------------------------------------------- /metapi/envs/checkm.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - checkm-genome=1.2.2 7 | - prodigal=2.6.3 8 | - pandas=1.5.1 9 | - pigz 10 | - jq -------------------------------------------------------------------------------- /metapi/envs/trimming.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - sickle-trim=1.33 7 | - fastp=0.23.4 8 | - trimmomatic=0.39 9 | - pigz 10 | - jq -------------------------------------------------------------------------------- /metapi/envs/krakenuniq.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - krakenuniq=1.0.4 7 | - bracken=2.9 8 | - jellyfish=1.0.3 9 | - pigz 10 | - jq 11 | -------------------------------------------------------------------------------- /metapi/envs/raw.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - python 7 | - coreutils 8 | - seqkit 9 | - pigz 10 | - jq 11 | - executor 12 | - sra-tools=3.0.3 -------------------------------------------------------------------------------- /metapi/envs/kraken2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - kraken2=2.1.3 7 | - krakentools=1.2 8 | - bracken=2.9 9 | - krona=2.8.1 10 | - pigz 11 | - jq 12 | -------------------------------------------------------------------------------- /metapi/profiles/pbs-torque/config.yaml: -------------------------------------------------------------------------------- 1 | cluster: "pbs-submit.py --depend \"{dependencies}\"" 2 | cluster-status: "pbs-status.py" 3 | jobscript: "pbs-jobscript.sh" 4 | jobs: 5000 5 | immediate-submit: False 6 | verbose: true 7 | notemp: true 8 | -------------------------------------------------------------------------------- /metapi/profiles/sge/config.yaml: -------------------------------------------------------------------------------- 1 | restart-times: 3 2 | jobscript: sge-jobscript.sh 3 | cluster: "sge-submit.py" 4 | cluster-status: "sge-status.py" 5 | max-jobs-per-second: 1 6 | max-status-checks-per-second: 1 7 | latency-wait: 60 8 | local-cores: 1 9 | 
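# Added note (not part of the original profile; invocation is illustrative): a profile directory like this
# is normally selected with Snakemake's --profile option, e.g. `snakemake --profile metapi/profiles/sge ...`,
# which makes Snakemake use the jobscript, submit and status scripts referenced above.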
-------------------------------------------------------------------------------- /metapi/envs/phamb.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - phamb=1.0.1 7 | - hmmer=3.4 8 | - pyhmmer=0.10.9 9 | - joblib 10 | - pandas 11 | - numpy 12 | - biopython 13 | - pigz 14 | - jq -------------------------------------------------------------------------------- /metapi/profiles/slurm/config.yaml: -------------------------------------------------------------------------------- 1 | restart-times: 3 2 | jobscript: "slurm-jobscript.sh" 3 | cluster-config: "cluster.yaml" 4 | cluster: "slurm-submit.py" 5 | cluster-status: "slurm-status.py" 6 | max-jobs-per-second: 10 7 | max-status-checks-per-second: 10 8 | latency-wait: 80 -------------------------------------------------------------------------------- /metapi/envs/align.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - minimap2=2.27 7 | - samtools=1.19.2 8 | - bwa=0.7.17 9 | - bwa-mem2=2.2.1 10 | - bowtie2=2.5.3 11 | - seqtk 12 | - seqkit 13 | - pigz 14 | - jq -------------------------------------------------------------------------------- /metapi/envs/concoct.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - concoct=1.1.0 7 | - libopenblas=*=openmp* 8 | - mkl 9 | - python>=3 10 | - samtools>=1.9 11 | - scikit-learn=1.1.* 12 | - pigz 13 | - jq 14 | variables: 15 | USE_OPENMP: 1 -------------------------------------------------------------------------------- /metapi/envs/deepvirfinder.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - python=3.6 7 | - numpy 8 | - theano=1.0.3 9 | - keras=2.2.4 10 | - scikit-learn 11 | - Biopython 12 | - h5py=2.10.0 13 | - mkl-service=2.3.0 14 | - pigz 15 | - jq -------------------------------------------------------------------------------- /metapi/envs/vamb.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - pytorch 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - pigz 8 | - pytorch=*=*cuda11.3* 9 | - pysam 10 | - numpy 11 | - pandas 12 | - jq 13 | - pip 14 | - pip: 15 | - git+https://github.com/RasmussenLab/vamb@v4.1.3 16 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/config.yaml: -------------------------------------------------------------------------------- 1 | latency-wait: "5" 2 | jobscript: "lsf_jobscript.sh" 3 | use-conda: "False" 4 | use-singularity: "False" 5 | printshellcmds: "False" 6 | restart-times: "0" 7 | jobs: "500" 8 | cluster: "lsf_submit.py" 9 | cluster-status: "lsf_status.py" 10 | max-jobs-per-second: "10" 11 | max-status-checks-per-second: "10" -------------------------------------------------------------------------------- /metapi/envs/semibin.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - pytorch 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - pytorch=*=*cuda11.3* 8 | - mkl=2023.2.0 9 | - pigz 10 | - jq 11 | - pandas 12 | - numexpr=2.9.0 13 | - mmseqs2 14 | - hmmer 15 | - prodigal 16 
| - bedtools 17 | - samtools 18 | - fraggenescan 19 | - semibin=2.1.0 -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: metapi 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - setuptools 8 | - pandas 9 | - numpy 10 | - ruamel.yaml 11 | - snakemake >=7.0 12 | - openpyxl 13 | - natsort 14 | - biopython >=1.73 15 | - seaborn 16 | - matplotlib 17 | - seqtk 18 | - seqkit 19 | - pigz 20 | - fd-find 21 | - executor -------------------------------------------------------------------------------- /tests/test_executor/Snakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env snakemake 2 | 3 | 4 | rule touch_1: 5 | output: 6 | "done1" 7 | script: 8 | "test.py" 9 | 10 | 11 | rule touch_2: 12 | output: 13 | "done2" 14 | run: 15 | from executor import execute 16 | execute(f'''touch {output}''') 17 | 18 | 19 | rule all: 20 | input: 21 | "done1", 22 | "done2" -------------------------------------------------------------------------------- /scripts/perl_test.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use warnings; 4 | my $a = "hello/world"; 5 | my $b = join('', $a, "/summary"); 6 | print "$a\n"; 7 | print "$b\n"; 8 | print "$ARGV[0]\n"; 9 | print "$ARGV[1]\n"; 10 | print "$ARGV[2]\n"; 11 | print "$ARGV[3]\n"; 12 | 13 | # e.g.: perl perl_test.pl a b c d 14 | # output: 15 | # hello/world 16 | # hello/world/summary 17 | # a 18 | # b 19 | # c 20 | # d -------------------------------------------------------------------------------- /metapi/profiles/generic/config.yaml: -------------------------------------------------------------------------------- 1 | restart-times: 0 2 | cluster-config: "cluster_config.yaml" #abs path 3 | cluster: "scheduler.py" # 4 | cluster-status: "slurm_status.py" # 5 | max-jobs-per-second: 10 6 | max-status-checks-per-second: 10 7 | cores: 99 # how many jobs you want to submit to your cluster queue 8 | local-cores: 1 9 | rerun-incomplete: true # recommended for cluster submissions 10 | keep-going: false 11 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md *.txt 2 | include LICENSE 3 | 4 | recursive-include metapi/ * 5 | recursive-include metapi/wrappers * 6 | recursive-include metapi/rules * 7 | recursive-include metapi/snakefiles * 8 | recursive-include metapi/envs * 9 | recursive-include metapi/config * 10 | 11 | global-exclude metapi/ *.pyc 12 | global-exclude metapi/__pycache__ *.pyc 13 | global-exclude metapi/wrappers/ *.pyc 14 | global-exclude metapi/wrappers/__pycache__ *.pyc 15 | -------------------------------------------------------------------------------- /scripts/cout_seq_by_line.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | with open(sys.argv[1], 'r') as handle: 6 | num = 0 7 | for line in handle: 8 | num += 1 9 | if num == int(sys.argv[2]): 10 | print(str(num) + ":\t" + line) 11 | break 12 | 13 | # awk 'NR==YOUR_LINE{print}' file_name 14 | # sed -n -e YOUR_LINEp file_name 15 | # perl -wnl -e '$. == YOUR_LINE and print and exit;' 16 | # less -SN +YOUR_LINEg file_name (nice!)
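# usage (inferred from the argv handling above; the file name and line number are illustrative):
#   python cout_seq_by_line.py input.txt 5    # prints "5:<TAB>..." for line 5 of input.txt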
-------------------------------------------------------------------------------- /scripts/samples_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import pandas as pd 3 | import sys 4 | import os 5 | 6 | 7 | def main(): 8 | samples = pd.read_csv(sys.argv[1], sep='\t').set_index("id", drop=False) 9 | for i in samples.index: 10 | fq1, fq2 = samples.loc[i, ["fq1", "fq2"]] 11 | if (not os.path.exists(fq1)) or (not os.path.exists(fq2)): 12 | print("error: %s\t%s\t%s" % (i, fq1, fq2)) 13 | 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /scripts/job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | 5 | from snakemake.utils import read_job_properties 6 | 7 | jobscript = sys.argv[1] 8 | job_properties = read_job_properties(jobscript) 9 | 10 | # do something useful with the threads 11 | threads = job_properties["threads"] 12 | 13 | # access property defined in the cluster configuration file (snakemake >=3.6.0) 14 | job_properties["cluster"]["time"] 15 | 16 | os.system("qsub -t {threads} {script}".format(threads=threads, script=jobscript)) 17 | -------------------------------------------------------------------------------- /metapi/wrappers/concoct_postprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import subprocess 6 | 7 | 8 | with os.scandir(sys.argv[1]) as itr: 9 | i = 0 10 | for entry in itr: 11 | bin_id, suffix = os.path.splitext(entry.name) 12 | if suffix == ".fa": 13 | i += 1 14 | subprocess.run('''mv %s %s''' \ 15 | % (os.path.join(sys.argv[1], entry.name), 16 | os.path.join(sys.argv[2] + "." 
+ str(i) + ".fa")), shell=True) -------------------------------------------------------------------------------- /scripts/find_ATG.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use warnings; 4 | # QQ group: perlchina 5 | # question: find longest ATG+ sequences 6 | 7 | my $seq = "ATGATGASFSAGATGATGATGSFAATGATGATGATGDSFS"; 8 | 9 | my @atg = $seq =~ /((ATG)+)/g; 10 | my @atg_len = map { length($_) } @atg; 11 | print "@atg\n"; 12 | print "@atg_len\n\n"; 13 | 14 | print((sort {$b cmp $a} ($seq =~ /(?:ATG)+/g))[0]); 15 | print "\n\n"; 16 | 17 | my @atg_2 = $seq =~ /(?:ATG)+/g; 18 | my @atg_len_2 = map { length($_) } @atg_2; 19 | print "@atg_2\n"; 20 | print "@atg_len_2\n"; -------------------------------------------------------------------------------- /metapi/profiles/generic/lsf_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import os 5 | import sys 6 | import warnings 7 | import subprocess 8 | 9 | 10 | jobid = sys.argv[1] 11 | 12 | out= subprocess.run(['bjobs','-noheader',jobid],stdout=subprocess.PIPE).stdout.decode('utf-8') 13 | 14 | state = out.strip().split()[2] 15 | 16 | 17 | map_state={"PEND":'running', 18 | "RUN":'running', 19 | "PROV":"running", 20 | "WAIT":'running', 21 | "DONE":'success', 22 | "":'success'} 23 | 24 | print(map_state.get(state,'failed')) 25 | -------------------------------------------------------------------------------- /metapi/snakefiles/simulate_wf.smk: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env snakemake 3 | 4 | import sys 5 | from pprint import pprint 6 | import pandas as pd 7 | 8 | from snakemake.utils import min_version 9 | min_version("7.0") 10 | shell.executable("bash") 11 | 12 | import metapi 13 | 14 | METAPI_DIR = metapi.__path__[0] 15 | WRAPPER_DIR = os.path.join(METAPI_DIR, "wrappers") 16 | DATA_DIR = os.path.join(METAPI_DIR, "data") 17 | 18 | 19 | SAMPLES, DATA_TYPE = metapi.parse_samples(config["params"]["samples"]) 20 | 21 | 22 | include: "../rules/simulate.smk" 23 | 24 | 25 | rule all: 26 | input: 27 | rules.simulate_all.input -------------------------------------------------------------------------------- /metapi/wrappers/maxbin2_postprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | with os.scandir(sys.argv[1]) as itr: 8 | for entry in itr: 9 | bin_id, bin_suffix = os.path.splitext(entry.name) 10 | bin_name, cluster_num = bin_id.rsplit(".", maxsplit=1) 11 | bin_id = bin_name + "." 
+ cluster_num.lstrip("0") 12 | if bin_suffix == ".fasta": 13 | subprocess.run('''mv %s %s''' \ 14 | % (os.path.join(sys.argv[1], entry.name), 15 | os.path.join(sys.argv[1], bin_id + ".fa")), shell=True) 16 | 17 | 18 | -------------------------------------------------------------------------------- /metapi/wrappers/dastools_postprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import glob 6 | import subprocess 7 | 8 | 9 | bins_prefix = sys.argv[1] #.replace("dastools.bin", "") 10 | 11 | mags_list = glob.glob(os.path.join(sys.argv[1] + "_DASTool_bins", "*.fa")) 12 | 13 | if len(mags_list) > 0: 14 | for bin_fa in mags_list: 15 | if (os.path.getsize(bin_fa) > 0) and (not "*" in bin_fa): 16 | binner = os.path.basename(bin_fa).split(".")[0] 17 | if (binner != "unbinned") and (binner != "*"): 18 | bin_fa_ = bins_prefix + "." + os.path.basename(bin_fa).replace(binner, f'''{binner}_dastools.bin''') 19 | subprocess.run(f'''mv {bin_fa} {bin_fa_}''', shell=True) -------------------------------------------------------------------------------- /scripts/contigs_filter_by_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from contigs_to_gene import cut_fasta_by_len 3 | import argparse 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser(description="cut fasta by len") 7 | parser.add_argument('-fa', type=str, help='scaffolds or contigs file path') 8 | parser.add_argument('-sclen', type=int, help='scaffold or contigs length cutoff, default: 500', default=500) 9 | parser.add_argument('-outdir', type=str, help='output dir store gene prediction results') 10 | parser.add_argument('-prefix', type=str, help='prefix for file name') 11 | args = parser.parse_args() 12 | cut_fasta_by_len(args.fa, args.sclen, args.outdir, args.prefix, ".fa") 13 | 14 | if __name__ == '__main__': 15 | main() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | .vscode 3 | 4 | # pycharm 5 | .idea 6 | 7 | # kdevelop 8 | .kdev4 9 | metapi.kdev4 10 | 11 | # snakemake 12 | .snakemake 13 | 14 | metapi/__pycache__/ 15 | metapi/*.pyc 16 | 17 | # pipenv 18 | build/ 19 | dist/ 20 | metapi.egg-info/ 21 | release 22 | 23 | conda/*.gz 24 | 25 | notebooks/ 26 | notebook 27 | 28 | # test 29 | test/ 30 | test/simulation_test/metaconfig.yaml 31 | 32 | # examples 33 | example/basic_test/data/01.trimmed 34 | example/basic_test/data/02.assembly 35 | example/basic_test/data/03.alignment 36 | example/basic_test/data/04.binning 37 | example/basic_test/data/05.checkm 38 | example/basic_test/data/logs 39 | examples/simulation_test 40 | examples 41 | 42 | # docs 43 | docs/hello.py 44 | 45 | # others 46 | index.list 47 | data.tar.gz -------------------------------------------------------------------------------- /scripts/parse_mgs_profile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | from pprint import pprint 6 | 7 | 8 | def parse(mgs_profile): 9 | count = 0 10 | with open(mgs_profile, 'r') as ih: 11 | for line in ih: 12 | line_list = re.split(r"\s+|,", line) 13 | cag_id = line_list[0] 14 | seq_count = line_list[1] 15 | seq_id_list = line_list[2:] 16 | count += 1 17 | a = 0 18 | if count == 1: 19 | for i in seq_id_list: 20 | print(i) 21 | a += 1 22 | 
print(a) 23 | print(seq_count) 24 | break 25 | 26 | 27 | def main(): 28 | parse(sys.argv[1]) 29 | 30 | 31 | if __name__ == '__main__': 32 | main() 33 | -------------------------------------------------------------------------------- /metapi/profiles/generic/pbs_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import subprocess 5 | import xml.etree.cElementTree as ET 6 | 7 | jobid = sys.argv[1] 8 | 9 | try: 10 | res = subprocess.run("qstat -f -x {}".format(jobid), check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 11 | 12 | xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot() 13 | job_state = xmldoc.findall('.//job_state')[0].text 14 | 15 | if job_state == "C": 16 | exit_status = xmldoc.findall('.//exit_status')[0].text 17 | if exit_status == '0': 18 | print("success") 19 | else: 20 | print("failed") 21 | else: 22 | print("running") 23 | 24 | except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: 25 | print("failed") 26 | -------------------------------------------------------------------------------- /metapi/profiles/pbs-torque/pbs-status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import subprocess 5 | import xml.etree.cElementTree as ET 6 | 7 | jobid = sys.argv[1] 8 | 9 | try: 10 | res = subprocess.run("qstat -f -x {}".format(jobid), check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 11 | 12 | xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot() 13 | job_state = xmldoc.findall('.//job_state')[0].text 14 | 15 | if job_state == "C": 16 | exit_status = xmldoc.findall('.//exit_status')[0].text 17 | if exit_status == '0': 18 | print("success") 19 | else: 20 | print("failed") 21 | else: 22 | print("running") 23 | 24 | except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: 25 | print("failed") 26 | -------------------------------------------------------------------------------- /scripts/merge_sig_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | import argparse 5 | import pandas as pd 6 | 7 | def merge_csv(csvlist, output): 8 | frame = pd.DataFrame() 9 | list = [] 10 | with open(csvlist, 'r') as csv_l: 11 | for csv_f in csv_l: 12 | df = pd.read_csv(csv_f.strip(), index_col=None, header=0) 13 | list.append(df) 14 | frame = pd.concat(list) 15 | frame.to_csv(output) 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser(description="merge sourmash sigs to a csv file") 19 | parser.add_argument('-csvlist', type=str, help='a file contain sig file path list') 20 | parser.add_argument('-output', type=str, help='output csv file') 21 | args = parser.parse_args() 22 | merge_csv(args.csvlist, args.output) 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /metapi/wrappers/hmmsearch_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import pyhmmer 3 | import sys 4 | 5 | hmm_threads = int(sys.argv[1]) 6 | hmm_evalue = float(sys.argv[2]) 7 | hmm_tbl = sys.argv[3] 8 | hmm_db = sys.argv[4] 9 | hmm_seq = sys.argv[5] 10 | 11 | # reference 12 | # https://github.com/althonos/pyhmmer/issues/22 13 | 14 | alphabet = pyhmmer.easel.Alphabet.amino() 15 | 16 | with 
pyhmmer.easel.SequenceFile(hmm_seq, digital=True, alphabet=alphabet) as seq_file: 17 | sequences = list(seq_file) 18 | 19 | with open(hmm_tbl, "wb") as dst: 20 | with pyhmmer.plan7.HMMFile(hmm_db) as hmm_file: 21 | for i, hits in enumerate(pyhmmer.hmmsearch(hmm_file, sequences, cpus=hmm_threads, E=hmm_evalue)): 22 | hits.write(dst, format="targets", header=i==0) 23 | 24 | # example 25 | # python hmmsearch_wrapper.py 8 0.01 output.tbl virus.hmm test.faa 26 | -------------------------------------------------------------------------------- /scripts/get_bins_id.py: -------------------------------------------------------------------------------- 1 | ##!/usr/bin/env python 2 | import glob 3 | import os 4 | import pprint 5 | import sys 6 | 7 | import pandas 8 | 9 | 10 | def parse_mags(mags_dir): 11 | bin_list = [] 12 | pattern = mags_dir + "/*/*.fa" 13 | for bin in glob.glob(pattern): 14 | bin_dict = {} 15 | bin_fa = os.path.basename(bin) 16 | bin_id = bin_fa.rstrip(".fa") 17 | id = ".".join(bin_fa.split(".")[:-3]) 18 | bin_dict["bin_path"] = bin.strip() 19 | bin_dict["bin_id"] = bin_id 20 | bin_dict["id"] = id 21 | bin_list.append(bin_dict) 22 | pprint.pprint(bin_list) 23 | #bin_df = pandas.DataFrame(bin_list).set_index("bin_id", drop=False) 24 | #pprint.pprint(bin_df) 25 | # a = bin_df.loc["s1.bin.2", ["bin_path"]].dropna()[0] 26 | # print(a) 27 | 28 | 29 | parse_mags(sys.argv[1]) 30 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/CookieCutter.py: -------------------------------------------------------------------------------- 1 | class CookieCutter: 2 | """ 3 | Cookie Cutter wrapper 4 | """ 5 | 6 | @staticmethod 7 | def get_default_threads() -> int: 8 | return int("1") 9 | 10 | @staticmethod 11 | def get_default_mem_mb() -> int: 12 | return int("1024") 13 | 14 | @staticmethod 15 | def get_log_dir() -> str: 16 | return "logs/cluster" 17 | 18 | @staticmethod 19 | def get_default_queue() -> str: 20 | return "" 21 | 22 | @staticmethod 23 | def get_lsf_unit_for_limits() -> str: 24 | return "KB" 25 | 26 | @staticmethod 27 | def get_unknwn_behaviour() -> str: 28 | return "wait" 29 | 30 | @staticmethod 31 | def get_zombi_behaviour() -> str: 32 | return "ignore" 33 | 34 | @staticmethod 35 | def get_latency_wait() -> float: 36 | return float("5") 37 | -------------------------------------------------------------------------------- /metapi/profiles/slurm/CookieCutter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Based on lsf CookieCutter.py 3 | # 4 | import os 5 | import json 6 | 7 | d = os.path.dirname(__file__) 8 | with open(os.path.join(d, "settings.json")) as fh: 9 | settings = json.load(fh) 10 | 11 | 12 | class CookieCutter: 13 | 14 | SBATCH_DEFAULTS = settings['SBATCH_DEFAULTS'] 15 | CLUSTER_NAME = settings['CLUSTER_NAME'] 16 | CLUSTER_CONFIG = settings['CLUSTER_CONFIG'] 17 | ADVANCED_ARGUMENT_CONVERSION = settings['ADVANCED_ARGUMENT_CONVERSION'] 18 | 19 | @staticmethod 20 | def get_cluster_option() -> str: 21 | cluster = CookieCutter.CLUSTER_NAME 22 | if cluster != "": 23 | return f"--cluster={cluster}" 24 | return "" 25 | 26 | @staticmethod 27 | def get_advanced_argument_conversion() -> bool: 28 | val = {"yes": True, "no": False}[ 29 | CookieCutter.ADVANCED_ARGUMENT_CONVERSION 30 | ] 31 | return val 32 | -------------------------------------------------------------------------------- /scripts/print_reads_length.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | "get each reads length form fasta/fastq file" 3 | import argparse 4 | import gzip 5 | from Bio import SeqIO 6 | 7 | def print_len(infile, seqtype): 8 | '''print_len function''' 9 | if infile.endswith(".gz"): 10 | handle = gzip.open(infile, 'rt') 11 | else: 12 | handle = open(infile, 'rt') 13 | for reads in SeqIO.parse(handle, seqtype): 14 | print(reads.id, "\t", len(reads)) 15 | handle.close() 16 | 17 | 18 | def main(): 19 | '''main function''' 20 | parser = argparse.ArgumentParser(description='print each reads id and length info form fasta/fastq file') 21 | parser.add_argument('--infile', action='store', dest='infile', help='input fasta/fastq file') 22 | parser.add_argument('--seqtype', action='store', dest='seqtype', help='input file seq type, fasta or fastq') 23 | args = parser.parse_args() 24 | print_len(args.infile, args.seqtype) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /tests/test_bash/Snakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env snakemake 2 | 3 | rule download: 4 | output: 5 | r1 = "ecoli_1K.1.fq.gz", 6 | r2 = "ecoli_1K.2.fq.gz" 7 | threads: 8 | 1 9 | shell: 10 | ''' 11 | curl -o ecoli_1K.1.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_1.fq.gz 12 | curl -o ecoli_1K.2.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_2.fq.gz 13 | ''' 14 | 15 | 16 | rule decompress: 17 | input: 18 | r1 = "ecoli_1K.1.fq.gz", 19 | r2 = "ecoli_1K.2.fq.gz" 20 | output: 21 | r1 = "ecoli_1K.1.fq", 22 | r2 = "ecoli_1K.2.fq" 23 | shell: 24 | ''' 25 | R1={input.r1} 26 | R2={input.r2} 27 | pigz -dc {input.r1} > ${{R1%.gz}} 28 | pigz -dc {input.r2} > ${{R2%.gz}} 29 | ''' 30 | 31 | 32 | rule all: 33 | input: 34 | "ecoli_1K.1.fq", 35 | "ecoli_1K.2.fq" 36 | -------------------------------------------------------------------------------- /scripts/batch_prokka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | import pprint 6 | import sys 7 | 8 | 9 | def run(dir_list, outdir, logdir): 10 | cmd_list = [] 11 | count = 1 12 | with open(dir_list) as f: 13 | for dir in f: 14 | count += 1 15 | bin_list = glob.glob(dir.strip() + "/*.fa") 16 | for bin in bin_list: 17 | bin_id = os.path.basename(bin).rstrip(".fa") 18 | prokka_dir = os.path.join(outdir, 19 | os.path.basename(dir.strip())) 20 | log = os.path.join(logdir, bin_id + ".prokka.log") 21 | cmd = "prokka %s --outdir %s --prefix %s --kingdom Bacteria --cpus 8 2> %s" % ( 22 | bin.strip(), prokka_dir, bin_id, log) 23 | cmd_list.append(cmd) 24 | if count == 2: 25 | break 26 | return cmd_list 27 | 28 | 29 | cmd_list = run(sys.argv[1], sys.argv[2], sys.argv[3]) 30 | pprint.pprint(cmd_list) 31 | -------------------------------------------------------------------------------- /metapi/profiles/generic/key_mapping.yaml: -------------------------------------------------------------------------------- 1 | # only parameters defined in key_mapping (see below) are passed to the command in the order specified. 
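# Illustrative example (added): with the slurm entry below, a job with name=assembly, threads=8 and mem=16
# would be submitted roughly as: sbatch --parsable --job-name=assembly -n 8 --mem=16g <jobscript>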
2 | system: "slurm" #check if system is defined below 3 | 4 | slurm: 5 | command: "sbatch --parsable" 6 | key_mapping: 7 | name: "--job-name={}" 8 | threads: "-n {}" 9 | mem: "--mem={}g" 10 | account: "--account={}" 11 | queue: "--partition={}" 12 | time: "--time={}" 13 | nodes: "-N {}" 14 | pbs: 15 | command: "qsub" 16 | key_mapping: 17 | name: "-N {}" 18 | account: "-A {}" 19 | queue: "-q {}" 20 | threads: "-l nodes=1:ppn={}" # always use 1 node 21 | mem: "-l mem={}gb" 22 | time: "-l walltime={}00" #min= seconds x 100 23 | lsf: 24 | command: "bsub -e lsf_%J.log -o lsf_%J.log" 25 | key_mapping: 26 | queue: "-q {}" 27 | name: "-J {}" 28 | threads: "-n {}" 29 | mem: '-R "rusage[mem={}000]"' 30 | account: "-P {}" 31 | nodes: "-C {}" 32 | 33 | 34 | 35 | # for other cluster systems see: https://slurm.schedmd.com/rosetta.pdf 36 | -------------------------------------------------------------------------------- /metapi/profiles/generic/cluster_config.yaml: -------------------------------------------------------------------------------- 1 | ## This is a yaml file, defining options for specific rules or by default. 2 | ## The '#' defines a comment. 3 | ## the two spaces at the beginning of rows below rulenames are important. 4 | ## For more information see https://snakemake.readthedocs.io/en/stable/executing/cluster-cloud.html#cluster-execution 5 | 6 | # default parameter for all rules 7 | __default__: 8 | #queue: normal 9 | nodes: 1 10 | 11 | 12 | # The following rules in atlas need need more time/memory. 13 | # If you need to submit them to different queues you can configure this as outlined. 14 | 15 | # run_megahit: 16 | # queue: bigmem 17 | # run_spades: 18 | # queue: bigmem 19 | 20 | #gtdb-tk classify uses 'large_mem' and log time 21 | # classify: 22 | # queue: bigmem-long 23 | 24 | # run_checkm_lineage_wf: 25 | # queue: long 26 | 27 | # run_all_checkm_lineage_wf: 28 | # queue: long 29 | 30 | # You can overwrite values for specific rules 31 | # rulename: 32 | # queue: long 33 | # account: "" 34 | # time: # h 35 | # threads: 36 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Check https://circleci.com/docs/2.0/language-python/ for more details 2 | 3 | version: 2.1 4 | 5 | orbs: 6 | python: circleci/python@0.2.1 7 | 8 | jobs: 9 | build-and-test: 10 | executor: python/default 11 | steps: 12 | - run: 13 | name: conda create 14 | command: | 15 | ls $HOME 16 | if [ ! -d "/home/circleci/conda" ]; then 17 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 18 | /bin/bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/conda 19 | else 20 | echo "Miniconda is already installed, continuing to build." 
21 | fi 22 | - save_cache: 23 | paths: 24 | - /home/circleci/conda 25 | key: v2-dependencies 26 | 27 | - run: 28 | name: conda build 29 | command: | 30 | cd ~/metapi 31 | /bin/bash ~/metapi/conda 32 | conda build ./ 33 | - store_artifacts: 34 | path: ~/repo/build 35 | destination: singularity-containers 36 | 37 | workflows: 38 | main: 39 | jobs: 40 | - build-and-test 41 | -------------------------------------------------------------------------------- /metapi/wrappers/vamb/write_abundances.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import vamb 4 | from pathlib import Path 5 | 6 | 7 | def write_abundances( 8 | mask_refhash: Path, bampath: Path, min_identity: float, outfile: Path 9 | ): 10 | """For every sample, compute the abundances given the mask and refhashes""" 11 | loadnpz = np.load(mask_refhash) 12 | refhash = loadnpz["refhash"] 13 | mask = loadnpz["mask"] 14 | refhash = refhash.reshape(1)[0] 15 | (abundance, _) = vamb.parsebam.Abundance.run_pycoverm( 16 | paths=[bampath], 17 | minid=min_identity, 18 | target_refhash=refhash, 19 | target_identifiers=None, 20 | mask=mask, 21 | ) 22 | vamb.vambtools.write_npz(outfile, abundance.ravel()) 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("--msk", type=Path, help="mask refhash") 28 | parser.add_argument("--b", type=Path, help=" bam path") 29 | parser.add_argument("--min_id", type=float, help="min identity for alignment") 30 | parser.add_argument("--out", type=Path, help="abundances outfile") 31 | 32 | opt = parser.parse_args() 33 | 34 | write_abundances(opt.msk, opt.b, opt.min_id, opt.out) 35 | -------------------------------------------------------------------------------- /docs/metapi.dio: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metapi/wrappers/vamb/abundances_mask.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from vamb.vambtools import RefHasher 4 | from pathlib import Path 5 | 6 | 7 | def abundances_mask(headers: Path, mask_refhash: Path, min_contig_size: int): 8 | """# Using the headers above, compute the mask and the refhash""" 9 | 10 | mask = [] 11 | identifiers = [] 12 | 13 | with open(headers) as file: 14 | for line in file: 15 | # SN:S27C112075 LN:2239 16 | (sn, ln) = line.split("\t") 17 | if sn[:3] != "SN:" or ln[:3] != "LN:": 18 | raise ValueError("Unknown format") 19 | passed = int(ln[3:]) >= min_contig_size 20 | mask.append(passed) 21 | if passed: 22 | identifiers.append(sn[3:]) 23 | 24 | np.savez_compressed( 25 | mask_refhash, 26 | mask=np.array(mask, dtype=bool), 27 | refhash=RefHasher.hash_refnames(identifiers), 28 | ) 29 | 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("--h", type=Path, help=" Headers file") 34 | parser.add_argument("--msk", type=Path, help="mask refhash") 35 | 36 | parser.add_argument("--minsize", type=int, help="min contig size") 37 | 38 | opt = parser.parse_args() 39 | 40 | abundances_mask(opt.h, opt.msk, opt.minsize) 41 | -------------------------------------------------------------------------------- /scripts/checkm_link.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | 6 | 7 | def 
link(link_dir, batch_num, bin_list): 8 | bins = [] 9 | with open(bin_list, "r") as ih: 10 | for line in ih: 11 | bins.append(os.path.abspath(line.strip())) 12 | 13 | os.makedirs(link_dir, exist_ok=True) 14 | 15 | if len(bins) > 0: 16 | for batch_id in range(0, len(bins), batch_num): 17 | batch_dir = os.path.join(link_dir, "bins_%d" % batch_id) 18 | os.makedirs(batch_dir, exist_ok=True) 19 | 20 | for bin_fa in bins[batch_id : batch_id + batch_num]: 21 | os.symlink(bin_fa, os.path.join(batch_dir, os.path.basename(bin_fa))) 22 | else: 23 | os.makedirs(os.path.join(link_dir, "bins_0"), exist_ok=True) 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser("checkm link") 28 | parser.add_argument("--link_dir", help="a dir contains checkm input link") 29 | parser.add_argument( 30 | "--batch_num", 31 | type=int, 32 | default=500, 33 | help="how many bins each cehckm run, default: 500", 34 | ) 35 | parser.add_argument("--bin_list", help="a file contains all bin path") 36 | args = parser.parse_args() 37 | 38 | link(args.link_dir, args.batch_num, args.bin_list) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /metapi/wrappers/vamb/create_abundances.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import vamb 4 | from pathlib import Path 5 | 6 | 7 | def create_abundances( 8 | abundances: list[Path], mask_refhash: Path, min_id: float, outfile: Path 9 | ): 10 | """Merge the abundances to a single Abundance object and save it""" 11 | refhash = np.load(mask_refhash)["refhash"] 12 | 13 | n_samples = len(abundances) 14 | first = vamb.vambtools.read_npz(abundances[0]) 15 | print(len(first), n_samples) 16 | print(first.shape) 17 | matrix = np.empty((len(first), n_samples), dtype=np.float32) 18 | matrix[:, 0] = first 19 | for i, path in enumerate(abundances[1:]): 20 | matrix[:, i + 1] = vamb.vambtools.read_npz(path) 21 | abundance = vamb.parsebam.Abundance( 22 | matrix, [str(i) for i in abundances], min_id, refhash 23 | ) 24 | abundance.save(outfile) 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--msk", type=Path, help="mask refhash") 30 | parser.add_argument("--ab", type=Path, nargs="+", help=" abundancaes list of files") 31 | parser.add_argument("--min_id", type=float, help="min identity for alignment") 32 | parser.add_argument("--out", type=Path, help="abundances outfile") 33 | 34 | opt = parser.parse_args() 35 | 36 | create_abundances(opt.ab, opt.msk, opt.min_id, opt.out) 37 | -------------------------------------------------------------------------------- /scripts/merge_fasta_by_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import gzip 4 | import os 5 | 6 | from Bio import SeqIO 7 | 8 | 9 | def merge_fa_by_len(falist, minlen, maxlen, outfa): 10 | with open(falist, 'r') as falist_h, open(outfa, 'w') as out_h: 11 | for fa_file in falist_h: 12 | fa_file = fa_file.rstrip() 13 | if fa_file.endswith(".gz"): 14 | fa_h = gzip.open(fa_file, 'rt') 15 | else: 16 | fa_h = open(fa_file, 'r') 17 | for record in SeqIO.parse(fa_h, 'fasta'): 18 | if (len(record.seq) >= minlen) and (len(record.seq) <= maxlen): 19 | SeqIO.write(record, out_h, 'fasta') 20 | fa_h.close() 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser(description="merge many fasta file to a fasta file by length 
cutoff") 25 | parser.add_argument('--falist', type=str, help='input file contain fasta path list') 26 | parser.add_argument('--minlen', type=int, help='sequences min length cutoff', default=1) 27 | parser.add_argument('--maxlen', type=int, help='sequences max length cutoff', default=10000000000) 28 | parser.add_argument('--outfa', type=str, help='output fasta contain sequences which length between [minlen, maxlen]') 29 | args = parser.parse_args() 30 | 31 | merge_fa_by_len(args.falist, args.minlen, args.maxlen, args.outfa) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /metapi/wrappers/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | import subprocess 6 | import sys 7 | 8 | 9 | def link_or_cat(args): 10 | fq_gz = os.path.join(args.output_dir, args.basename + ".fq.gz") 11 | 12 | if (not os.path.exists(fq_gz)) or (os.path.getsize(fq_gz) == 0): 13 | subprocess.call( 14 | f'''rm -rf {fq_gz}''', 15 | shell=True, stdout=sys.stdout, stderr=sys.stderr) 16 | 17 | if len(args.input_file) == 1: 18 | reads = os.path.realpath(args.input_file[0]) 19 | subprocess.call( 20 | f''' 21 | pushd {args.output_dir} && \ 22 | ln -s {reads} {args.basename}.fq.gz && \ 23 | popd 24 | ''', shell=True, stdout=sys.stdout, stderr=sys.stderr) 25 | else: 26 | reads = " ".join(args.input_file) 27 | subprocess.call( 28 | f''' 29 | cat {reads} > {args.output_dir}/{args.basename}.fq.gz 30 | ''', shell=True, stdout=sys.stdout, stderr=sys.stderr) 31 | 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser("metapi misc") 35 | parser.add_argument("--basename", dest="basename") 36 | parser.add_argument("--input-file", dest="input_file", nargs="+") 37 | parser.add_argument("--output-dir", dest="output_dir") 38 | 39 | args = parser.parse_args() 40 | 41 | link_or_cat(args) 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /scripts/filter_pe_fastq_by_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import gzip 4 | from Bio import SeqIO, bgzf 5 | 6 | def filter_pe_fasq_by_len(fq_1, fq_2, minlen, prefix): 7 | '''filter pe reads by min length''' 8 | fq_1_ = prefix + ".gt" + str(minlen) + ".1.fq.gz" 9 | fq_2_ = prefix + ".gt" + str(minlen) + ".2.fq.gz" 10 | with bgzf.BgzfWriter(fq_1_, 'wb') as out_1, bgzf.BgzfWriter(fq_2_, 'wb') as out_2: 11 | with gzip.open(fq_1, 'rt') as in_1, gzip.open(fq_2, 'rt') as in_2: 12 | for rec_a, rec_b in zip(SeqIO.parse(in_1, 'fastq'), SeqIO.parse(in_2, 'fastq')): 13 | if (len(rec_a.seq) > minlen) and (len(rec_b.seq) > minlen): 14 | SeqIO.write(rec_a, out_1, 'fastq') 15 | SeqIO.write(rec_b, out_2, 'fastq') 16 | 17 | def main(): 18 | '''main function''' 19 | parser = argparse.ArgumentParser( 20 | description='filter fastq file by reads length') 21 | parser.add_argument('-1', '--read1', help='paired-end fastq file one') 22 | parser.add_argument('-2', '--read2', help='paired-end fastq file two') 23 | parser.add_argument('-l', '--minlen', type=int, default=80, 24 | help='remove reads if length < min-len') 25 | parser.add_argument('-p', '--prefix', 26 | help='output prefix') 27 | args = parser.parse_args() 28 | 29 | filter_pe_fasq_by_len(args.read1, args.read2, args.minlen, args.prefix) 30 | 31 | if __name__ == '__main__': 32 | 
main() 33 | -------------------------------------------------------------------------------- /metapi/visualization/dada2_stats_barplot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(ggplot2) 4 | 5 | 6 | dada2_stats_barplot <- function(df, stack=FALSE, pretty=FALSE) 7 | { 8 | df <- df %>% dplyr::arrange(`non-chimeric`) 9 | df_l <- df %>% 10 | dplyr::select("sample-id", "input", "filtered", "denoised", "non-chimeric") %>% 11 | tidyr::pivot_longer(!"sample-id", names_to="step", values_to="count") %>% 12 | dplyr::mutate(step=factor(step, 13 | levels=c("input", "filtered", "denoised", "non-chimeric")), 14 | `sample-id`=factor(`sample-id`, 15 | levels=df$`sample-id`)) 16 | 17 | position = position_dodge(0.8) 18 | if (stack) { position = "stack" } 19 | 20 | if (pretty) { 21 | p <- 22 | ggpubr::ggbarplot(df_l, x="sample-id", y="count", 23 | fill="step", color="step", x.text.angle=90, 24 | stat="identity", position=position) 25 | } else { 26 | p <- 27 | ggplot(df_l, aes(x=`sample-id`, y=count)) + 28 | geom_bar(aes(color=step, fill=step), 29 | stat="identity", position=position, width=0.7) + 30 | theme_classic() + 31 | theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5, 32 | size=12, color="black"), 33 | axis.text.y=element_text(size=12, color="black")) 34 | } 35 | 36 | print(p) 37 | return(p) 38 | } -------------------------------------------------------------------------------- /scripts/fasta_length_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from Bio.SeqIO.FastaIO import SimpleFastaParser 3 | import argparse 4 | """get each sequence length from a fasta file and pring it to a file, then plot""" 5 | 6 | def gen_fa_len_tab(fa_file, len_out): 7 | with open(len_out, 'w') as out_handle: 8 | with open(fa_file, 'r') as in_handle: 9 | for title, seq in SimpleFastaParser(in_handle): 10 | #out_handle.write(title + "\t" + str(len(seq))) 11 | # just print id and seq length 12 | out_handle.write(title.split(' ')[0] + "\t" + str(len(seq)) + "\n") 13 | 14 | # megahit contigs header contains contigs length info 15 | def gen_fa_len_tab_megahit(fa_file, len_out): 16 | with open(len_out, 'w') as out_handle: 17 | with open(fa_file, 'r') as in_handle: 18 | for title, seq in SimpleFastaParser(in_handle): 19 | # maybe wrong 20 | len = title.split(' ')[-1].split('=')[-1] 21 | out_handle.write(title + "\t" + len + "\n") 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser(description='get fasta length info') 25 | parser.add_argument('--fasta', type=str, help='fasta file') 26 | parser.add_argument('--out', type=str, help='fasta length output file') 27 | args = parser.parse_args() 28 | 29 | gen_fa_len_tab(args.fasta, args.out) 30 | 31 | # fasta input must contigs file which was assemblyed by megahit 32 | # gen_fa_len_tab(args.fasta, args.out) 33 | 34 | if __name__ == '__main__': 35 | main() -------------------------------------------------------------------------------- /scripts/kraken2_reads_merger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from glob import glob 7 | 8 | 9 | def merger_reads(inputdir, outputdir): 10 | merger = {} 11 | for i in glob(inputdir.rstrip("/") + "/*/*.1.fq.gz"): 12 | taxid = int(os.path.basename(i).split(".")[1]) 13 | if taxid in merger: 14 | merger[taxid].append(i) 15 | else: 16 | merger[taxid] = [i] 17 | 18 
| for taxid in merger: 19 | r1_str = " ".join(merger[taxid]) 20 | r2_str = r1_str.replace("1.fq.gz", "2.fq.gz") 21 | r1 = os.path.join(outputdir, "%d.1.fq.gz" % taxid) 22 | r2 = os.path.join(outputdir, "%d.2.fq.gz" % taxid) 23 | if len(merger[taxid]) > 1: 24 | cmd = 'cat %s > %s && rm -rf %s && cat %s > %s && rm -rf %s' % (r1_str, r1, r1_str, r2_str, r2, r2_str) 25 | print(cmd) 26 | else: 27 | cmd = 'mv %s %s && mv %s %s' % (r1_str, r1, r2_str, r2) 28 | print(cmd) 29 | 30 | 31 | def main(args_): 32 | parser = argparse.ArgumentParser("merge kraken2 partition reads of many samples") 33 | parser.add_argument( 34 | '-i', 35 | '--input_dir', 36 | help='a directory containing many sample-specific directories' 37 | ) 38 | parser.add_argument( 39 | '-o', 40 | '--output_dir', 41 | help='output directory' 42 | ) 43 | 44 | args = parser.parse_args(args_) 45 | merger_reads(args.input_dir, args.output_dir) 46 | 47 | 48 | if __name__ == '__main__': 49 | main(sys.argv[1:]) 50 | -------------------------------------------------------------------------------- /scripts/animf_cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import pandas as pd 5 | import drep 6 | 7 | def check_drep_exists(): 8 | try: 9 | from drep import argumentParser 10 | print("drep version: %s" % argumentParser.version()) 11 | except ImportError: 12 | print("drep is not installed") 13 | 14 | 15 | def cluster(Bdb, Cdb, work_dir): 16 | Ndb = pd.DataFrame() 17 | for bdb, name in drep.d_cluster.iteratre_clusters(Bdb, Cdb): 18 | genome_list = bdb["location"].tolist() 19 | anin_folder = os.path.join(work_dir, "ANImf_files") 20 | 21 | org_lengths = {} 22 | files = [] 23 | deltafiles = [] 24 | 25 | # genome1_vs_genome2.delta 26 | # genome1_vs_genome2.filtered.delta 27 | for g1 in genome_list: 28 | cur_folder = os.path.join(anin_folder, os.path.basename(g1)) 29 | org_lengths[os.path.basename(g1)] = \ 30 | drep.d_filter.calc_fasta_length(g1) 31 | for g2 in genome_list: 32 | file_name = "{0}/{1}_vs_{2}".format( 33 | cur_folder, 34 | os.path.basename(g1), 35 | os.path.basename(g2) 36 | ) 37 | deltafiles.append(file_name + ".filtered.delta") 38 | df = drep.d_cluster.process_deltafiles(deltafiles, 39 | org_lengths, 40 | coverage_method="larger") 41 | 42 | 43 | 44 | 45 | def main(): 46 | pass 47 | 48 | 49 | if __name__ == '__main__': 50 | main() -------------------------------------------------------------------------------- /scripts/find_path.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | def find_path(dir, suffix): 5 | path = {} 6 | for f in os.listdir(dir): 7 | if f.endswith(suffix): 8 | key = f.rstrip(suffix) 9 | path[key] = os.path.join(dir, f) 10 | return path 11 | 12 | def find_path_tag(dir, tag): 13 | if tag == "raw": 14 | r1 = {} 15 | r2 = {} 16 | for f in os.listdir(dir): 17 | if f.endswith("1.fq.gz"): 18 | key = f.rstrip(".|-|_" + "1.fq.gz") 19 | r1[key] = os.path.join(dir, f) 20 | if f.endswith("2.fq.gz"): 21 | key = f.rstrip(".|-|_" + "2.fq.gz") 22 | r2[key] = os.path.join(dir, f) 23 | return (r1, r2) 24 | elif tag == "clean" or tag == "rmhost": 25 | r1 = {} 26 | r2 = {} 27 | rs = {} 28 | rt = {} 29 | for f in os.listdir(dir): 30 | if f.endswith(tag + ".1.fq.gz"): 31 | key = f.rstrip(".|-|_" + tag + ".1.fq.gz") 32 | r1[key] = os.path.join(dir, f) 33 | if f.endswith(tag + ".2.fq.gz"): 34 | key = f.rstrip(".|-|_" + tag + ".2.fq.gz") 35 | r2[key] = os.path.join(dir, f) 36 
| if f.endswith(tag + ".single.fq.gz"): 37 | key = f.rstrip(".|-|_" + tag + ".single.fq.gz") 38 | rs[key] = os.path.join(dir, f) 39 | if f.endswith(tag + ".stat_out"): 40 | key = f.rstrip(".|-|_" + tag + ".stat_out") 41 | rt[key] = os.path.join(dir, f) 42 | return (r1, r2, rs, rt) 43 | -------------------------------------------------------------------------------- /metapi/wrappers/prokka_wrapper.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import time 4 | import subprocess 5 | 6 | 7 | PROKKA_SUFFIX = ["err", "log", "faa", "ffn", "fna", "fsa", 8 | "gbk", "gff", "sqn", "tbl", "tsv", "txt"] 9 | 10 | bin_list = glob.glob(snakemake.input["mags_dir"] + "/*.fa.gz") 11 | gff_count = 0 12 | 13 | for bin_fa in bin_list: 14 | bin_id = os.path.basename(os.path.splitext(os.path.splitext(bin_fa)[0])[0]) 15 | output_dir = os.path.join(snakemake.params["output_dir"], bin_id) 16 | gff_file = os.path.join(output_dir, bin_id + ".gff") 17 | 18 | subprocess.run(f'''echo "\nProcessing {bin_fa}\n" >> {snakemake.log}''', shell=True) 19 | 20 | # https://github.com/tseemann/prokka/pull/130 21 | # Uncompressing 1000's of gzip'ed fasta files just to run them through prokka can be a bit of pain. 22 | subprocess.run( 23 | f''' 24 | prokka <(zcat {bin_fa}) \ 25 | --force \ 26 | --centre X \ 27 | --compliant \ 28 | --cpus {snakemake.threads} \ 29 | --outdir {output_dir} \ 30 | --locustag {bin_id} \ 31 | --prefix {bin_id} \ 32 | --kingdom {snakemake.params["kingdom"]} \ 33 | 2>> {snakemake.log} 34 | ''', shell=True, executable="/bin/bash") 35 | 36 | if os.path.exists(gff_file): 37 | gff_count += 1 38 | 39 | if gff_count == len(bin_list): 40 | subprocess.run(f'''touch {snakemake.output["done"]}''', shell=True) 41 | 42 | for suffix in PROKKA_SUFFIX: 43 | prokka_f = os.path.join(output_dir, f'''{bin_id}.{suffix}''') 44 | if os.path.exists(prokka_f): 45 | subprocess.run(f'''pigz -f {prokka_f}''', shell=True) -------------------------------------------------------------------------------- /scripts/taxonomy_info_covert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import csv 4 | import os 5 | 6 | def parse_lca_classify(taxonomy_csv, output): 7 | # taxonomy = ['superkingdom', 'phylum', 'order', 'class', 'family', 'genus', 'species'] 8 | headers = ["ID", "status", "lineage"] 9 | rows = [] 10 | with open(taxonomy_csv, 'r') as csv_h: 11 | f_csv = csv.DictReader(csv_h) 12 | # print(type(f_csv)) 13 | for row in f_csv: 14 | row_dict = {} 15 | row_dict["ID"] = os.path.basename(row["ID"]) 16 | row_dict["status"] = row["status"] 17 | row_dict["lineage"] = row["superkingdom"] + ";" + \ 18 | row["phylum"] + ";" + \ 19 | row["order"] + ";" + \ 20 | row["class"] + ";" + \ 21 | row["family"] + ";" + \ 22 | row["genus"] + ";" + \ 23 | row["species"] 24 | rows.append(row_dict) 25 | 26 | with open(output, 'w') as csv_out: 27 | csv_f = csv.DictWriter(csv_out, headers) 28 | csv_f.writeheader() 29 | csv_f.writerows(rows) 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description="convert sourmash lca classify results to metacoder input") 33 | parser.add_argument('-csv', type=str, help="sourmash lca classify results csv file") 34 | parser.add_argument('-out', type=str, help='converted csv file') 35 | args = parser.parse_args() 36 | parse_lca_classify(args.csv, args.out) 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- 
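The converter above collapses the per-rank columns of a sourmash lca classify table into one semicolon-joined lineage string. A minimal, self-contained sketch of that idea follows; the rank order is taken from the comment in the script, while the function name and the example row are fabricated for illustration only and are not part of this repository.

import csv

# Ranks as listed in the comment of scripts/taxonomy_info_covert.py (assumed order).
RANKS = ["superkingdom", "phylum", "order", "class", "family", "genus", "species"]

def flatten_lineage(row, ranks=RANKS, sep=";"):
    """Join rank columns of one csv.DictReader row into a single lineage string."""
    # Missing ranks become empty fields, so the output always has len(ranks) parts.
    return sep.join(row.get(rank, "") for rank in ranks)

if __name__ == "__main__":
    # Hypothetical classification row, only for demonstration.
    example = {
        "ID": "bin.1.fa", "status": "found",
        "superkingdom": "Bacteria", "phylum": "Bacteroidetes",
        "order": "Bacteroidales", "class": "Bacteroidia",
        "family": "Bacteroidaceae", "genus": "Bacteroides",
        "species": "Bacteroides fragilis",
    }
    print(flatten_lineage(example))
    # Bacteria;Bacteroidetes;Bacteroidales;Bacteroidia;Bacteroidaceae;Bacteroides;Bacteroides fragilis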
/metapi/tooler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import concurrent.futures 5 | import pandas as pd 6 | 7 | 8 | def parse(stats_file): 9 | if os.path.exists(stats_file): 10 | try: 11 | df = pd.read_csv(stats_file, sep="\t") 12 | except pd.errors.EmptyDataError: 13 | print("%s is empty, please check" % stats_file) 14 | return None 15 | 16 | if not df.empty: 17 | return df 18 | else: 19 | return None 20 | else: 21 | print("%s does not exist" % stats_file) 22 | return None 23 | 24 | 25 | def merge(input_list, func, workers, **kwargs): 26 | df_list = [] 27 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 28 | for df in executor.map(func, input_list): 29 | if df is not None: 30 | df_list.append(df) 31 | 32 | df_ = pd.concat(df_list) 33 | 34 | if "output" in kwargs: 35 | df_.to_csv(kwargs["output"], sep="\t", index=False) 36 | return df_ 37 | 38 | 39 | def merge2(input_list, func, workers, **kwargs): 40 | df1_list = [] 41 | df2_list = [] 42 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 43 | for df1, df2 in executor.map(func, input_list): 44 | if df1 is not None: 45 | df1_list.append(df1) 46 | if df2 is not None: 47 | df2_list.append(df2) 48 | 49 | df_1 = pd.concat(df1_list) 50 | df_2 = pd.concat(df2_list) 51 | 52 | if "output_1" in kwargs: 53 | df_1.to_csv(kwargs["output_1"], sep="\t", index=False) 54 | if "output_2" in kwargs: 55 | df_2.to_csv(kwargs["output_2"], sep="\t", index=False) 56 | 57 | return df_1, df_2 58 | -------------------------------------------------------------------------------- /metapi/snakefiles/gene_wf.smk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env snakemake 2 | 3 | import sys 4 | import metapi 5 | import pandas as pd 6 | from snakemake.utils import min_version 7 | 8 | min_version("7.0") 9 | 10 | shell.executable("bash") 11 | 12 | METAPI_DIR = metapi.__path__[0] 13 | WRAPPER_DIR = os.path.join(METAPI_DIR, "wrappers") 14 | 15 | 16 | RMHOST_DO = any([ 17 | config["params"]["rmhost"]["bwa"]["do"], 18 | config["params"]["rmhost"]["bowtie2"]["do"]]) 19 | 20 | 21 | TRIMMING_DO = any([ 22 | config["params"]["trimming"]["sickle"]["do"], 23 | config["params"]["trimming"]["fastp"]["do"], 24 | config["params"]["trimming"]["trimmomatic"]["do"]]) 25 | 26 | 27 | ASSEMBLERS = [] 28 | if config["params"]["assembly"]["megahit"]["do"]: 29 | ASSEMBLERS += ["megahit"] 30 | if config["params"]["assembly"]["idba_ud"]["do"]: 31 | ASSEMBLERS += ["idba_ud"] 32 | if config["params"]["assembly"]["metaspades"]["do"]: 33 | ASSEMBLERS += ["metaspades"] 34 | if config["params"]["assembly"]["spades"]["do"]: 35 | ASSEMBLERS += ["spades"] 36 | 37 | 38 | SAMPLES, DATA_TYPE = metapi.parse_samples(config["params"]["samples"]) 39 | 40 | 41 | include: "../rules/raw.smk" 42 | include: "../rules/trimming.smk" 43 | include: "../rules/rmhost.smk" 44 | include: "../rules/qcreport.smk" 45 | include: "../rules/assembly.smk" 46 | include: "../rules/predict_scaftigs.smk" 47 | include: "../rules/dereplicate_cds.smk" 48 | include: "../rules/upload.smk" 49 | 50 | 51 | rule all: 52 | input: 53 | rules.raw_all.input, 54 | rules.trimming_all.input, 55 | rules.rmhost_all.input, 56 | rules.qcreport_all.input, 57 | rules.assembly_all.input, 58 | rules.predict_scaftigs_gene_all.input, 59 | rules.dereplicate_gene_all.input, 60 | rules.upload_all.input 61 | 
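gene_wf.smk above assembles its ASSEMBLERS list from per-assembler boolean "do" flags under config["params"]["assembly"]. The sketch below mirrors that selection logic outside Snakemake so it can be run and tested in isolation; the config dict and the function name are made-up examples, not part of a real metapi configuration.

# Illustrative sketch of the assembler-selection logic used in gene_wf.smk.
# The config dict here is a fabricated example for demonstration only.
config = {
    "params": {
        "assembly": {
            "megahit": {"do": True},
            "idba_ud": {"do": False},
            "metaspades": {"do": True},
            "spades": {"do": False},
        }
    }
}

def select_assemblers(config):
    """Return the assemblers whose 'do' flag is set, in a fixed, stable order."""
    order = ["megahit", "idba_ud", "metaspades", "spades"]
    assembly = config["params"]["assembly"]
    return [name for name in order if assembly.get(name, {}).get("do", False)]

if __name__ == "__main__":
    print(select_assemblers(config))   # ['megahit', 'metaspades']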
-------------------------------------------------------------------------------- /tests/test_spades/Snakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env snakemake 2 | 3 | rule download_reads: 4 | output: 5 | r1 = "test/reads/ecoli_1K.1.fq.gz", 6 | r2 = "test/reads/ecoli_1K.2.fq.gz" 7 | threads: 8 | 1 9 | shell: 10 | ''' 11 | curl -o test/reads/ecoli_1K.1.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_1.fq.gz 12 | curl -o test/reads/ecoli_1K.2.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_2.fq.gz 13 | ''' 14 | 15 | 16 | rule prepare_samples_tsv: 17 | input: 18 | r1 = "test/reads/ecoli_1K.1.fq.gz", 19 | r2 = "test/reads/ecoli_1K.2.fq.gz" 20 | output: 21 | "test/samples.tsv" 22 | threads: 23 | 1 24 | shell: 25 | ''' 26 | fd -t f fq.gz $(pwd)/test/reads | \ 27 | sort | uniq | paste - - | \ 28 | awk 'BEGIN{{print "sample_id\tassembly_group\tbinning_group\tfq1\tfq2"}};{{print "ecoli_1K\tecoli_1K\tecoli_1K\t" $0}}' \ 29 | > {output} 30 | ''' 31 | 32 | 33 | rule metapi_init: 34 | input: 35 | "test/samples.tsv" 36 | output: 37 | "test/config.yaml" 38 | conda: 39 | "metapi.yaml" 40 | shell: 41 | ''' 42 | pushd test 43 | metapi init -d . -s $(basename {input}) -b assembly --assembler spades 44 | popd 45 | ''' 46 | 47 | 48 | rule metapi_run_assembly: 49 | input: 50 | "test/config.yaml" 51 | output: 52 | "test/results/04.assembly/report/assembly_stats_spades.tsv" 53 | conda: 54 | "metapi.yaml" 55 | shell: 56 | ''' 57 | pushd test 58 | metapi mag_wf assembly_all --run-local --use-conda 59 | popd 60 | ''' 61 | 62 | 63 | rule all: 64 | input: 65 | "test/results/04.assembly/report/assembly_stats_spades.tsv" 66 | -------------------------------------------------------------------------------- /metapi/rules/qcreport.smk: -------------------------------------------------------------------------------- 1 | STEPS = ["raw"] 2 | if TRIMMING_DO: 3 | STEPS += ["trimming"] 4 | if RMHOST_DO: 5 | STEPS += ["rmhost"] 6 | 7 | SAMPLESDIR = os.path.join(config["output"][STEPS[-1]]) 8 | 9 | if config["params"]["qcreport"]["do"]: 10 | rule qcreport_summary: 11 | input: 12 | expand(os.path.join(config["output"]["qcreport"], "{step}_stats.tsv"), 13 | step=STEPS) 14 | output: 15 | summary_l = os.path.join(config["output"]["qcreport"], "qc_stats_l.tsv"), 16 | summary_w = os.path.join(config["output"]["qcreport"], "qc_stats_w.tsv") 17 | priority: 18 | 30 19 | threads: 20 | config["params"]["qcreport"]["seqkit"]["threads"] 21 | run: 22 | df = metapi.merge(input, metapi.parse, threads) 23 | df = metapi.compute_host_rate(df, STEPS, SAMPLES_ID_LIST, allow_miss_samples=True, output=output.summary_l) 24 | metapi.qc_summary_merge(df, output=output.summary_w) 25 | 26 | 27 | rule qcreport_plot: 28 | input: 29 | rules.qcreport_summary.output 30 | output: 31 | os.path.join(config["output"]["qcreport"], "qc_reads_num_barplot.pdf") 32 | priority: 33 | 30 34 | run: 35 | df = metapi.parse(input[0]) 36 | metapi.qc_bar_plot(df, "seaborn", output=output[0]) 37 | 38 | 39 | rule qcreport_all: 40 | input: 41 | os.path.join(config["output"]["qcreport"], "qc_stats_l.tsv"), 42 | os.path.join(config["output"]["qcreport"], "qc_stats_w.tsv"), 43 | os.path.join(config["output"]["qcreport"], "qc_reads_num_barplot.pdf") 44 | 45 | else: 46 | rule qcreport_summary: 47 | input: 48 | 49 | 50 | rule qcreport_plot: 51 | input: 52 | 53 | 54 | rule qcreport_all: 55 | input: 56 | 57 | 58 | localrules: 59 | 
qcreport_summary, 60 | qcreport_plot, 61 | qcreport_all -------------------------------------------------------------------------------- /metapi/profiles/generic/slurm_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import subprocess as sp 4 | import shlex 5 | import sys 6 | import time 7 | import logging 8 | logger = logging.getLogger("__name__") 9 | 10 | STATUS_ATTEMPTS = 20 11 | 12 | jobid = sys.argv[1] 13 | 14 | for i in range(STATUS_ATTEMPTS): 15 | try: 16 | sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid))) 17 | res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} 18 | break 19 | except sp.CalledProcessError as e: 20 | logger.error("sacct process error") 21 | logger.error(e) 22 | except IndexError as e: 23 | pass 24 | # Try getting job with scontrol instead in case sacct is misconfigured 25 | try: 26 | sctrl_res = sp.check_output(shlex.split("scontrol -o show job {}".format(jobid))) 27 | m = re.search("JobState=(\w+)", sctrl_res.decode()) 28 | res = {jobid: m.group(1)} 29 | break 30 | except sp.CalledProcessError as e: 31 | logger.error("scontrol process error") 32 | logger.error(e) 33 | if i >= STATUS_ATTEMPTS - 1: 34 | print("failed") 35 | exit(0) 36 | else: 37 | time.sleep(1) 38 | 39 | status = res[jobid] 40 | 41 | if (status == "BOOT_FAIL"): 42 | print("failed") 43 | elif (status == "OUT_OF_MEMORY"): 44 | print("failed") 45 | elif (status.startswith("CANCELLED")): 46 | print("failed") 47 | elif (status == "COMPLETED"): 48 | print("success") 49 | elif (status == "DEADLINE"): 50 | print("failed") 51 | elif (status == "FAILED"): 52 | print("failed") 53 | elif (status == "NODE_FAIL"): 54 | print("failed") 55 | elif (status == "PREEMPTED"): 56 | print("failed") 57 | elif (status == "TIMEOUT"): 58 | print("failed") 59 | # Unclear whether SUSPENDED should be treated as running or failed 60 | elif (status == "SUSPENDED"): 61 | print("failed") 62 | else: 63 | print("running") 64 | -------------------------------------------------------------------------------- /scripts/get_prodigal_gbk_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # email: zhujie@genomics.cn 3 | # license: GPL V3 4 | import re 5 | 6 | gbklist = "./gene.coordinate.gbk.pathlist.new" 7 | out = open("./gene.coordinate.stat.out.new", 'w') 8 | out.write("ID\tpartial=00\tpartial=01\tpartial=10\tpartial=11\ttotal_len\ttotal_num\tavg_length\n") 9 | 10 | with open(gbklist, 'r') as path_handler: 11 | for gbkpath in path_handler: 12 | genenum = {} 13 | gene_total_len = 0 14 | gene_total_num = 0 15 | gene_avg_len = 0 16 | partial = ['partial=00', 'partial=01', 'partial=10', 'partial=11'] 17 | genenum['partial=00'] = 0 18 | genenum['partial=01'] = 0 19 | genenum['partial=10'] = 0 20 | genenum['partial=11'] = 0 21 | 22 | with open(gbkpath.strip(), 'r') as gbk_handler: 23 | first = next(gbk_handler) 24 | id = re.search(r'(.*?)seqhdr="(CL\d+_L\d+_\d+)_scaffold(.*)', first).group(2) 25 | genenum['id'] = id 26 | for line in gbk_handler: 27 | for tag in partial: 28 | if re.search(tag, line): 29 | genenum[tag] += 1 30 | gene_total_num += 1 31 | if re.search("CDS\s+(complement\()?(<)?(\d+)\.\.(>)?(\d+)(\))", line): 32 | len = re.search("CDS\s+(complement\()?(<)?(\d+)\.\.(>)?(\d+)(\))", line) 33 | gene_total_len += int(len.group(5)) - int(len.group(3)) + 1 34 | 35 | gene_avg_len = round(float(gene_total_len) / 
float(gene_total_num), 6) 36 | 37 | out.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%f\n" % ( 38 | genenum['id'], 39 | genenum['partial=00'], 40 | genenum['partial=01'], 41 | genenum['partial=10'], 42 | genenum['partial=11'], 43 | gene_total_len, 44 | gene_total_num, 45 | gene_avg_len)) 46 | 47 | out.close() 48 | -------------------------------------------------------------------------------- /metapi/wrappers/vamb/concatenate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import argparse 6 | import gzip 7 | import vamb 8 | 9 | parser = argparse.ArgumentParser( 10 | description="""Creates the input FASTA file for Vamb. 11 | Input should be one or more FASTA files, each from a sample-specific assembly. 12 | If keepnames is False, resulting FASTA can be binsplit with separator 'C'.""", 13 | formatter_class=argparse.RawDescriptionHelpFormatter, 14 | add_help=False, 15 | ) 16 | 17 | parser.add_argument("outpath", help="Path to output FASTA file") 18 | parser.add_argument("inpaths", help="Paths to input FASTA file(s)", nargs="+") 19 | parser.add_argument( 20 | "-m", 21 | dest="minlength", 22 | metavar="", 23 | type=int, 24 | default=2000, 25 | help="Discard sequences below this length [2000]", 26 | ) 27 | parser.add_argument( 28 | "--keepnames", action="store_true", help="Do not rename sequences [False]" 29 | ) 30 | parser.add_argument("--nozip", action="store_true", help="Do not gzip output [False]") 31 | 32 | if len(sys.argv) == 1 or sys.argv[1] in ("-h", "--help"): 33 | parser.print_help() 34 | sys.exit() 35 | 36 | args = parser.parse_args() 37 | 38 | # Check inputs 39 | for path in args.inpaths: 40 | if not os.path.isfile(path): 41 | raise FileNotFoundError(path) 42 | 43 | if os.path.exists(args.outpath): 44 | raise FileExistsError(args.outpath) 45 | 46 | parent = os.path.dirname(args.outpath) 47 | if parent != "" and not os.path.isdir(parent): 48 | raise NotADirectoryError( 49 | f'Output file cannot be created: Parent directory "{parent}" is not an existing directory' 50 | ) 51 | 52 | # Run the code. 
Compressing DNA is easy, this is not much bigger than level 9, but 53 | # many times faster 54 | filehandle = ( 55 | open(args.outpath, "w") 56 | if args.nozip 57 | else gzip.open(args.outpath, "wt", compresslevel=1) 58 | ) 59 | vamb.vambtools.concatenate_fasta( 60 | filehandle, args.inpaths, minlength=args.minlength, rename=(not args.keepnames) 61 | ) 62 | filehandle.close() 63 | -------------------------------------------------------------------------------- /metapi/rules/simulate.smk: -------------------------------------------------------------------------------- 1 | if config["params"]["simulate"]["do"]: 2 | rule simulate_short_reads: 3 | input: 4 | genomes = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "genome") 5 | output: 6 | r1 = os.path.join(config["output"]["simulate"], 7 | "short_reads/{sample}.simulate.1.fq.gz"), 8 | r2 = os.path.join(config["output"]["simulate"], 9 | "short_reads/{sample}.simulate.2.fq.gz"), 10 | abunf = os.path.join(config["output"]["simulate"], 11 | "abundance/{sample}.simulate.abundance.txt") 12 | log: 13 | os.path.join(config["output"]["simulate"], "logs/{sample}.iss.log") 14 | benchmark: 15 | os.path.join(config["output"]["simulate"], "benchmark/iss/{sample}.iss.benchmark.txt") 16 | conda: 17 | config["envs"]["simulate"] 18 | params: 19 | output_prefix = os.path.join(config["output"]["simulate"], 20 | "short_reads/{sample}"), 21 | model = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "model")[0], 22 | reads_num = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "reads_num")[0], 23 | abundance = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "abundance") 24 | threads: 25 | config["params"]["simulate"]["threads"] 26 | script: 27 | "../wrappers/simulate_reads.py" 28 | 29 | 30 | rule simulate_all: 31 | input: 32 | expand([ 33 | os.path.join(config["output"]["simulate"], 34 | "short_reads/{sample}.simulate.{read}.fq.gz"), 35 | os.path.join(config["output"]["simulate"], 36 | "abundance/{sample}.simulate.abundance.txt")], 37 | read=["1", "2"], 38 | sample=SAMPLES.index.unique()) 39 | 40 | else: 41 | rule simulate_all: 42 | input: 43 | 44 | 45 | localrules: 46 | simulate_all -------------------------------------------------------------------------------- /metapi/profiles/slurm/slurm-status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import subprocess as sp 4 | import shlex 5 | import sys 6 | import time 7 | import logging 8 | from CookieCutter import CookieCutter 9 | 10 | logger = logging.getLogger("__name__") 11 | 12 | STATUS_ATTEMPTS = 20 13 | 14 | jobid = sys.argv[1] 15 | 16 | cluster = CookieCutter.get_cluster_option() 17 | 18 | for i in range(STATUS_ATTEMPTS): 19 | try: 20 | sacct_res = sp.check_output(shlex.split(f"sacct {cluster} -P -b -j {jobid} -n")) 21 | res = { 22 | x.split("|")[0]: x.split("|")[1] 23 | for x in sacct_res.decode().strip().split("\n") 24 | } 25 | break 26 | except sp.CalledProcessError as e: 27 | logger.error("sacct process error") 28 | logger.error(e) 29 | except IndexError as e: 30 | logger.error(e) 31 | pass 32 | # Try getting job with scontrol instead in case sacct is misconfigured 33 | try: 34 | sctrl_res = sp.check_output( 35 | shlex.split(f"scontrol {cluster} -o show job {jobid}") 36 | ) 37 | m = re.search(r"JobState=(\w+)", sctrl_res.decode()) 38 | res = {jobid: m.group(1)} 39 | break 40 | except sp.CalledProcessError as e: 41 | logger.error("scontrol process error") 42 | 
logger.error(e) 43 | if i >= STATUS_ATTEMPTS - 1: 44 | print("failed") 45 | exit(0) 46 | else: 47 | time.sleep(1) 48 | 49 | status = res[jobid] 50 | 51 | if status == "BOOT_FAIL": 52 | print("failed") 53 | elif status == "OUT_OF_MEMORY": 54 | print("failed") 55 | elif status.startswith("CANCELLED"): 56 | print("failed") 57 | elif status == "COMPLETED": 58 | print("success") 59 | elif status == "DEADLINE": 60 | print("failed") 61 | elif status == "FAILED": 62 | print("failed") 63 | elif status == "NODE_FAIL": 64 | print("failed") 65 | elif status == "PREEMPTED": 66 | print("failed") 67 | elif status == "TIMEOUT": 68 | print("failed") 69 | elif status == "SUSPENDED": 70 | print("running") 71 | else: 72 | print("running") 73 | -------------------------------------------------------------------------------- /scripts/asm_status_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import shutil 5 | import argparse 6 | 7 | STATSWRAPPER_TEMPLATE = '''{stats} \ 8 | in={input_list} \ 9 | minscaf={minscaf} > {output}''' 10 | 11 | 12 | class statswrapper: 13 | def __init__(self, input_list, minscaf, output): 14 | self.stats = shutil.which("statswrapper.sh") 15 | self.input_list = ",".join(input_list) 16 | self.minscaf = minscaf 17 | self.output = output 18 | 19 | 20 | def gen_shell(ilist, mlen, split, prefix, output): 21 | files = open(ilist, 'r').readlines() 22 | total = len(files) 23 | assert total >= split, "can't split" 24 | step = total // split 25 | m = total % split 26 | count = 0 27 | sub_files = [] 28 | cmds = [] 29 | for i in range(0, total, step): 30 | count += 1 31 | if count <= split: 32 | sub_files = [f.strip() for f in files[i:(i + step)]] 33 | output_ = "%s.%d.tsv" % (prefix, count) 34 | cmd = STATSWRAPPER_TEMPLATE.format_map( 35 | vars(statswrapper(sub_files, mlen, output_))) 36 | cmds.append(cmd) 37 | 38 | if (count > split) and (m > 0): 39 | sub_files += [f.strip() for f in files[(total - m):total]] 40 | output_ = "%s.%d.tsv" % (prefix, split) 41 | cmd = STATSWRAPPER_TEMPLATE.format_map( 42 | vars(statswrapper(sub_files, mlen, output_))) 43 | cmds[split - 1] = cmd 44 | 45 | with open(output, 'w') as oh: 46 | for i in cmds: 47 | oh.write(i + "\n") 48 | 49 | 50 | def main(): 51 | parser = argparse.ArgumentParser("assembler status wrapper") 52 | parser.add_argument('-l', '--list', type=str, help='input assembly file list') 53 | parser.add_argument('-m', '--min_len', type=int, default=0, help='minimal contig/scaffold length') 54 | parser.add_argument('-s', '--split', type=int, default=1, help='split input file') 55 | parser.add_argument('-p', '--prefix', type=str, default="asm_stats", help="assembly status output prefix") 56 | parser.add_argument('-o', '--output', type=str, default=sys.stdout, help='write cmd to file, default: stdout') 57 | args = parser.parse_args() 58 | 59 | gen_shell(args.list, args.min_len, args.split, args.prefix, args.output) 60 | 61 | 62 | if __name__ == '__main__': 63 | main() 64 | -------------------------------------------------------------------------------- /scripts/cut_up_fasta_concoct.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This scipt comes from CONCOCT 5 | Let it support Python 3 6 | Cut up fasta file in non-overlapping or overlapping parts of equal length. 
7 | """ 8 | import argparse 9 | from Bio import SeqIO 10 | import gzip 11 | 12 | def cut_up_fasta(fastfiles, chunk_size, overlap, merge_last): 13 | for ff in fastfiles: 14 | if ff.strip().endswith(".gz"): 15 | fa_handle = gzip.open(ff.strip(), 'rt') 16 | else: 17 | fa_handle = open(ff.strip(), 'r') 18 | for record in SeqIO.parse(fa_handle, "fasta"): 19 | if (not merge_last and len(record.seq) > chunk_size) or (merge_last and len(record.seq) >= 2 * chunk_size): 20 | i = 0 21 | for split_seq in chunks(record.seq, chunk_size, overlap, merge_last): 22 | print(">%s.%i\n%s" % (record.id, i, split_seq)) 23 | i = i + 1 24 | else: 25 | print(">%s\n%s" % (record.id, record.seq)) 26 | 27 | 28 | def chunks(l, n, o, merge_last): 29 | """ Yield successive n-sized chunks from l with given overlap o between the 30 | chunks. 31 | """ 32 | assert n > o 33 | 34 | if not merge_last: 35 | for i in range(0, len(l), n - o): 36 | yield l[i:i + n] 37 | else: 38 | for i in range(0, len(l) - n + 1, n - o): 39 | yield l[i:i + n] if i + n + n - o <= len(l) else l[i:] 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser(description=__doc__, 44 | formatter_class=argparse.RawDescriptionHelpFormatter) 45 | parser.add_argument( 46 | "contigs", nargs="+", help="Fasta files with contigs\n") 47 | parser.add_argument("-c", "--chunk_size", default=1999, type=int, help="Chunk size\n") 48 | parser.add_argument("-o", "--overlap_size", default=1900, type=int, help="Overlap size\n") 49 | parser.add_argument("-m", "--merge_last", default=False, action="store_true", help="Concatenate final part to last contig\n") 50 | args = parser.parse_args() 51 | cut_up_fasta(args.contigs, args.chunk_size, args.overlap_size, args.merge_last) 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | from setuptools import setup 5 | 6 | exec(open("metapi/__about__.py").read()) 7 | 8 | if sys.argv[-1] == "publish": 9 | os.system("python setup.py sdist upload") 10 | sys.exit() 11 | 12 | with open("README.md") as f: 13 | long_description = f.read() 14 | 15 | packages = ["metapi"] 16 | package_data = { 17 | "metapi": [ 18 | "metapi/config/*.yaml", 19 | "metapi/envs/*.yaml", 20 | "metapi/snakefiles/*.smk", 21 | "metapi/rules/*.smk", 22 | "metapi/wrappers/*.py", 23 | "metapi/data/*", 24 | "metapi/*.py", 25 | ] 26 | } 27 | data_files = [(".", ["LICENSE", "README.md"])] 28 | 29 | entry_points = {"console_scripts": ["metapi=metapi.corer:main"]} 30 | 31 | requires = [ 32 | req.strip() 33 | for req in open("requirements.txt", "r").readlines() 34 | if not req.startswith("#") 35 | ] 36 | 37 | classifiers = [ 38 | "Development Status :: 3 - Alpha", 39 | "Environment :: Console", 40 | "Intended Audience :: Developers", 41 | "Intended Audience :: Science/Research", 42 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 43 | "Natural Language :: English", 44 | "Operating System :: OS Independent", 45 | "Programming Language :: Python :: 3.7", 46 | "Programming Language :: Python :: 3.8", 47 | "Programming Language :: Python :: 3.9", 48 | "Programming Language :: Python :: 3.10", 49 | "Topic :: Scientific/Engineering :: Bio-Informatics", 50 | ] 51 | 52 | setup( 53 | name="metapi", 54 | version=__version__, 55 | author=__author__, 56 | author_email="alienchuj@gmail.com", 57 | url="https://github.com/ohmeta/metapi", 58 | description="a pipeline to 
construct a genome catalogue from metagenomics data", 59 | long_description_content_type="text/markdown", 60 | long_description=long_description, 61 | entry_points=entry_points, 62 | packages=packages, 63 | package_data=package_data, 64 | data_files=data_files, 65 | include_package_data=True, 66 | install_requires=requires, 67 | license="GPLv3+", 68 | classifiers=classifiers, 69 | ) 70 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/lsf_config.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from collections import OrderedDict 3 | from itertools import chain 4 | from typing import TextIO, Union, List, Any, Dict 5 | 6 | import yaml 7 | 8 | 9 | class Config: 10 | def __init__(self, data: Union[dict, None] = None): 11 | self._data = dict() 12 | if data is not None: 13 | for key, value in data.items(): 14 | self._data[key] = self.concatenate_params(value) 15 | 16 | def __bool__(self) -> bool: 17 | return bool(self._data) 18 | 19 | def __contains__(self, item) -> bool: 20 | return item in self._data 21 | 22 | def get(self, key: str, default: Any = None) -> Any: 23 | return self._data.get(key, default) 24 | 25 | @staticmethod 26 | def args_to_dict(args: str) -> Dict[str, str]: 27 | """Converts a string into a dictionary where key/value pairs are consecutive 28 | elements of the string. 29 | Eg '-J "2" -q 3' --> {'-J': '2', '-q': '3'} 30 | """ 31 | args_iter = shlex.shlex(args, posix=True) 32 | args_iter.whitespace_split = True 33 | return OrderedDict(zip(args_iter, args_iter)) 34 | 35 | @staticmethod 36 | def concatenate_params(params: Union[List[str], str]) -> str: 37 | if isinstance(params, str): 38 | return params 39 | return " ".join(filter(None, params)) 40 | 41 | def default_params(self) -> str: 42 | return self.get("__default__", "") 43 | 44 | def params_for_rule(self, rulename: str) -> str: 45 | """Loads default + rule-specific arguments. 46 | Arguments specified for a rule override default-specified arguments. 47 | Shlex-joining is required to properly pass quoted escapes in yaml 48 | to the shell. 49 | """ 50 | default_params = self.args_to_dict(self.default_params()) 51 | rule_params = self.args_to_dict(self.get(rulename, "")) 52 | default_params.update(rule_params) 53 | return " ".join(map(shlex.quote, chain.from_iterable(default_params.items()))) 54 | 55 | @staticmethod 56 | def from_stream(stream: TextIO) -> "Config": 57 | data = yaml.safe_load(stream) 58 | return Config(data) 59 | -------------------------------------------------------------------------------- /metapi/profiles/slurm/slurm-submit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Snakemake SLURM submit script. 
4 | """ 5 | from snakemake.utils import read_job_properties 6 | 7 | import slurm_utils 8 | from CookieCutter import CookieCutter 9 | 10 | # cookiecutter arguments 11 | SBATCH_DEFAULTS = CookieCutter.SBATCH_DEFAULTS 12 | CLUSTER = CookieCutter.get_cluster_option() 13 | CLUSTER_CONFIG = CookieCutter.CLUSTER_CONFIG 14 | ADVANCED_ARGUMENT_CONVERSION = CookieCutter.get_advanced_argument_conversion() 15 | 16 | RESOURCE_MAPPING = { 17 | "time": ("time", "runtime", "walltime"), 18 | "mem": ("mem", "mem_mb", "ram", "memory"), 19 | "mem-per-cpu": ("mem-per-cpu", "mem_per_cpu", "mem_per_thread"), 20 | "nodes": ("nodes", "nnodes"), 21 | } 22 | 23 | # parse job 24 | jobscript = slurm_utils.parse_jobscript() 25 | job_properties = read_job_properties(jobscript) 26 | 27 | sbatch_options = {} 28 | cluster_config = slurm_utils.load_cluster_config(CLUSTER_CONFIG) 29 | 30 | # 1) sbatch default arguments and cluster 31 | sbatch_options.update(slurm_utils.parse_sbatch_defaults(SBATCH_DEFAULTS)) 32 | sbatch_options.update(slurm_utils.parse_sbatch_defaults(CLUSTER)) 33 | 34 | # 2) cluster_config defaults 35 | sbatch_options.update(cluster_config["__default__"]) 36 | 37 | # 3) Convert resources (no unit conversion!) and threads 38 | sbatch_options.update( 39 | slurm_utils.convert_job_properties(job_properties, RESOURCE_MAPPING) 40 | ) 41 | 42 | # 4) cluster_config for particular rule 43 | sbatch_options.update(cluster_config.get(job_properties.get("rule"), {})) 44 | 45 | # 5) cluster_config options 46 | sbatch_options.update(job_properties.get("cluster", {})) 47 | 48 | # 6) Advanced conversion of parameters 49 | if ADVANCED_ARGUMENT_CONVERSION: 50 | sbatch_options = slurm_utils.advanced_argument_conversion(sbatch_options) 51 | 52 | # 7) Format pattern in snakemake style 53 | sbatch_options = slurm_utils.format_values(sbatch_options, job_properties) 54 | 55 | # ensure sbatch output dirs exist 56 | for o in ("output", "error"): 57 | slurm_utils.ensure_dirs_exist(sbatch_options[o]) if o in sbatch_options else None 58 | 59 | # submit job and echo id back to Snakemake (must be the only stdout) 60 | print(slurm_utils.submit_job(jobscript, **sbatch_options)) 61 | -------------------------------------------------------------------------------- /scripts/aggregate_genomecov.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | 4 | def aggregate(cov): 5 | ''' 6 | bedtools genomecov -ibam sample.mapped.sorted.bam -g contigs_c10K.len > sample_cov.txt 7 | produce a histogram of coverage of the exons throughout the genome 8 | 9 | output format explain: 10 | 1. chromosome(or entire genome) 11 | 2. depth of coverage from features in input file 12 | 3. number of bases on chromosome(or genome) with depth equal to column 2 13 | 4. size of chromosome(or entire genome) in base pairs 14 | 5. 
fraction of baes on chromosome(or entire genome) with depth equal to column 2 15 | so column_5 = column_3 / column_4 16 | all sum(column_3{column_1}) = column_4{column_1} 17 | all sum(column_5{column_1}) = 1 18 | 19 | k119_2 1 30 399 0.075188 20 | k119_2 2 27 399 0.0676692 21 | k119_2 3 151 399 0.378446 22 | k119_2 4 79 399 0.197995 23 | k119_2 5 54 399 0.135338 24 | k119_2 6 39 399 0.0977444 25 | k119_2 7 19 399 0.047619 26 | k119_3 0 387 473 0.818182 27 | k119_3 1 86 473 0.181818 28 | k119_4 4 1 340 0.00294118 29 | ''' 30 | with open(cov, 'r') as in_handle: 31 | cov_num = {} 32 | chr_len = {} 33 | chr_list = [] 34 | for line in in_handle: 35 | chr, depth, num, len, frac = line.strip().split('\t') 36 | if chr not in chr_len: 37 | chr_len[chr] = int(len) 38 | cov_num[chr] = int(depth) * int(num) 39 | chr_list.append(chr) 40 | else: 41 | cov_num[chr] += int(depth) * int(num) 42 | for chr_name in chr_list: 43 | print("%s,%f" % (chr_name, cov_num[chr_name] / chr_len[chr_name])) 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser(description='aggregate the output of bedtools') 47 | parser.add_argument('-cov', type=str, help='input coverage file') 48 | args = parser.parse_args() 49 | 50 | aggregate(args.cov) 51 | 52 | if __name__ == '__main__': 53 | main() 54 | 55 | # awk -F'\t' '{l[$1]=l[$1]+($2*$3);r[$1]=$4} END {for (i in l){print i","(l[i]/r[i])}}' -------------------------------------------------------------------------------- /metapi/profiles/sge/sge-status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import subprocess as sp 4 | import shlex 5 | import sys 6 | import time 7 | import logging 8 | 9 | logger = logging.getLogger("__name__") 10 | logger.setLevel(40) 11 | 12 | STATUS_ATTEMPTS = 20 13 | 14 | jobid = int(sys.argv[1]) 15 | job_status = "running" 16 | 17 | # WARNING this currently has no support for task array jobs 18 | 19 | for i in range(STATUS_ATTEMPTS): 20 | # first try qstat to see if job is running 21 | # we can use `qstat -s pr -u "*"` to check for all running and pending jobs 22 | try: 23 | qstat_res = sp.check_output(shlex.split(f"qstat -s pr")).decode().strip() 24 | 25 | # skip the header using [2:] 26 | res = { 27 | int(x.split()[0]) : x.split()[4] for x in qstat_res.splitlines()[2:] 28 | } 29 | 30 | # job is in an unspecified error state 31 | if "E" in res[jobid]: 32 | job_status = "failed" 33 | break 34 | 35 | job_status = "running" 36 | break 37 | 38 | except sp.CalledProcessError as e: 39 | logger.error("qstat process error") 40 | logger.error(e) 41 | except KeyError as e: 42 | # if the job has finished it won't appear in qstat and we should check qacct 43 | # this will also provide the exit status (0 on success, 128 + exit_status on fail) 44 | # Try getting job with scontrol instead in case sacct is misconfigured 45 | try: 46 | qacct_res = sp.check_output(shlex.split(f"qacct -j {jobid}")) 47 | 48 | exit_code = int(re.search("exit_status ([0-9]+)", qacct_res.decode()).group(1)) 49 | 50 | if exit_code == 0: 51 | job_status = "success" 52 | break 53 | 54 | if exit_code != 0: 55 | job_status = "failed" 56 | break 57 | 58 | except sp.CalledProcessError as e: 59 | logger.warning("qacct process error") 60 | logger.warning(e) 61 | if i >= STATUS_ATTEMPTS - 1: 62 | job_status = "failed" 63 | break 64 | else: 65 | # qacct can be quite slow to update on large servers 66 | time.sleep(5) 67 | pass 68 | 69 | print(job_status) 70 | 
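Like the SLURM and generic profiles, sge-status.py implements Snakemake's cluster-status contract: the script is called with a job id and must print exactly one of running, success, or failed on stdout. The sketch below isolates the qacct exit-status mapping so the decision logic can be exercised without a scheduler; the function name and the report text are fabricated for illustration and are not part of this repository.

import re

def status_from_qacct(qacct_output: str) -> str:
    """Map a qacct report to the status words Snakemake expects on stdout."""
    # qacct reports 0 on success and 128 + exit_status on failure; if the field
    # is absent the job is assumed to be still running (or qacct not yet updated).
    match = re.search(r"exit_status\s+([0-9]+)", qacct_output)
    if match is None:
        return "running"
    return "success" if int(match.group(1)) == 0 else "failed"

if __name__ == "__main__":
    # Hypothetical qacct output, only for demonstration.
    fake_report = "jobname      test_job\nexit_status  0\n"
    print(status_from_qacct(fake_report))   # success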
-------------------------------------------------------------------------------- /scripts/contigs_from_sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | from Bio.SeqIO.FastaIO import SimpleFastaParser 5 | 6 | def contigs_from_sample(contigs_len, sc_out): 7 | info = {} 8 | #count = 0 9 | with open(contigs_len, 'r') as handle: 10 | for line in handle: 11 | key = '_'.join(line.split("_")[:3]) 12 | len = int(line.split("\t")[-1]) 13 | if key not in info: 14 | info[key] = {} 15 | info[key]["num"] = 1 16 | info[key]["len"] = len 17 | else: 18 | info[key]["num"] += 1 19 | info[key]["len"] += len 20 | #count += 1 21 | #if count == 10000: 22 | # break 23 | with open(sc_out, 'w') as out: 24 | out.write("sample_name\ttotal_contigs_num\ttotal_contigs_len\n") 25 | for key in info: 26 | out.write(key + "\t" + str(info[key]["num"]) + "\t" + 27 | str(info[key]["len"]) + "\n") 28 | 29 | def contigs_from_sample_list(contigs_list, sc_out): 30 | info = {} 31 | with open(contigs_list, 'r') as contigs_handle: 32 | for contigs_path in contigs_handle: 33 | key = os.path.basename(contigs_path.strip()).split(".")[0] 34 | if key not in info: 35 | info[key] = {} 36 | info[key]["num"] = 0 37 | info[key]["num_gt2kb"] = 0 38 | info[key]["len"] = 0 39 | info[key]["len_gt2kb"] = 0 40 | with open(contigs_path.strip(), 'r') as contigs_fa: 41 | for title, seq in SimpleFastaParser(contigs_fa): 42 | info[key]["num"] += 1 43 | info[key]["len"] += len(seq) 44 | if len(seq) >= 2000: 45 | info[key]["num_gt2kb"] += 1 46 | info[key]["len_gt2kb"] += len(seq) 47 | with open(sc_out, 'w') as out: 48 | out.write("sample_name\ttotal_contigs_num\ttotal_contigs_num_gt2kb\ttotal_contigs_len\ttotal_contigs_len_gt2kb\n") 49 | for key in info: 50 | out.write("%s\t%d\t%d\t%d\t%d\n" % (key, info[key]["num"], info[key]["num_gt2kb"], info[key]["len"], info[key]["len_gt2kb"])) 51 | 52 | def main(): 53 | #contigs_from_sample(sys.argv[1], sys.argv[2]) 54 | contigs_from_sample_list(sys.argv[1], sys.argv[2]) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/OSLayer.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import time 4 | import uuid 5 | from pathlib import Path 6 | from typing import Tuple, List 7 | 8 | if not __name__.startswith("tests.src."): 9 | sys.path.append(str(Path(__file__).parent.absolute())) 10 | from CookieCutter import CookieCutter 11 | else: 12 | from .CookieCutter import CookieCutter 13 | 14 | stdout = str 15 | stderr = str 16 | 17 | 18 | class TailError(Exception): 19 | pass 20 | 21 | 22 | class OSLayer: 23 | """ 24 | This class provides an abstract layer to communicating with the OS. 25 | Its main purpose is to enable OS operations mocking, so we don't actually need to 26 | make file operations or create processes. 
27 | """ 28 | 29 | @staticmethod 30 | def mkdir(directory: Path): 31 | directory.mkdir(parents=True, exist_ok=True) 32 | 33 | @staticmethod 34 | def remove_file(file: Path): 35 | if file.is_file(): 36 | file.unlink() 37 | 38 | @staticmethod 39 | def run_process(cmd: str) -> Tuple[stdout, stderr]: 40 | completed_process = subprocess.run( 41 | cmd, check=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE 42 | ) 43 | return ( 44 | completed_process.stdout.decode().strip(), 45 | completed_process.stderr.decode().strip(), 46 | ) 47 | 48 | @staticmethod 49 | def print(string: str): 50 | print(string) 51 | 52 | @staticmethod 53 | def get_uuid4_string() -> str: 54 | return str(uuid.uuid4()) 55 | 56 | @staticmethod 57 | def tail(path: str, num_lines: int = 10) -> List[bytes]: 58 | if not Path(path).exists(): 59 | # allow for filesystem latency 60 | time.sleep(CookieCutter.get_latency_wait()) 61 | if not Path(path).exists(): 62 | raise FileNotFoundError("{} does not exist.".format(path)) 63 | 64 | process = subprocess.Popen( 65 | ["tail", "-n", str(num_lines), path], 66 | stdout=subprocess.PIPE, 67 | stderr=subprocess.PIPE, 68 | ) 69 | exit_code = process.wait() 70 | if exit_code != 0: 71 | raise TailError( 72 | "Failed to execute the tail command on the file {} due to the " 73 | "following error:\n{}".format(path, process.stderr.read().decode()) 74 | ) 75 | return process.stdout.readlines() 76 | -------------------------------------------------------------------------------- /metapi/wrappers/simulate_reads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import gzip 6 | import subprocess 7 | from Bio import SeqIO 8 | 9 | 10 | def simulate_short_reads( 11 | genomes, output_prefix, r1, r2, abunf, model, reads_num, abundance, threads, logf, 12 | ): 13 | if len(abundance) != 0: 14 | with open(abunf, "w") as outh: 15 | for (g, a) in zip(genomes, abundance): 16 | inh = gzip.open(g, "rt") if g.endswith(".gz") else open(g, "r") 17 | genome = [] 18 | total_len = 0 19 | for record in SeqIO.parse(inh, "fasta"): 20 | total_len += len(record.seq) 21 | genome.append((record.id, len(record.seq))) 22 | for s in genome: 23 | outh.write("%s\t%f\n" % 24 | (s[0], float(a) * s[1] / total_len)) 25 | inh.close() 26 | 27 | args = ( 28 | ["iss", "generate", "--cpus", str(threads), "--genomes"] 29 | + genomes 30 | + ["--n_reads", reads_num, "--model", model, "--output", output_prefix] 31 | ) 32 | 33 | if len(abundance) != 0: 34 | args += ["--abundance_file", abunf] 35 | print(" ".join(args)) 36 | env = os.environ.copy() 37 | proc = subprocess.Popen( 38 | args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, encoding="utf-8", 39 | ) 40 | output, error = proc.communicate() 41 | 42 | with open(logf, "w") as logh: 43 | logh.write(error) 44 | 45 | if proc.returncode == 0: 46 | if len(abundance) == 0: 47 | default_abunf = output_prefix + "_abundance.txt" 48 | if os.path.exists(default_abunf): 49 | os.rename(default_abunf, abunf) 50 | subprocess.run(f"pigz -p {threads} {output_prefix}_R1.fastq", shell=True) 51 | subprocess.run(f"pigz -p {threads} {output_prefix}_R2.fastq", shell=True) 52 | os.rename(f"{output_prefix}_R1.fastq.gz", r1) 53 | os.rename(f"{output_prefix}_R2.fastq.gz", r2) 54 | else: 55 | sys.exit(1) 56 | 57 | 58 | simulate_short_reads( 59 | snakemake.input["genomes"], 60 | snakemake.params["output_prefix"], 61 | snakemake.output["r1"], 62 | snakemake.output["r2"], 63 | snakemake.output["abunf"], 64 
| snakemake.params["model"], 65 | snakemake.params["reads_num"], 66 | snakemake.params["abundance"], 67 | snakemake.threads, 68 | str(snakemake.log)) 69 | 70 | 71 | -------------------------------------------------------------------------------- /scripts/fastq_contig_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | import csv 5 | 6 | def gen_size_tsv(fqlist, ctglist, tsvout): 7 | '''gen data size tsv out''' 8 | fq_size = {} 9 | ctg_size = {} 10 | file_size = {} 11 | file_size["header"] = ["fq_1", "fq_2", "fq_s", "contig", "sample_name"] 12 | file_size["body"] = [] 13 | 14 | with open(fqlist, 'r') as fq_handle, open(ctglist, 'r') as ctg_handle: 15 | for (fq_line, ctg_line) in zip(fq_handle, ctg_handle): 16 | (reads_a, reads_b, reads_s) = fq_line.strip().split() 17 | fq_name = os.path.basename(reads_a).split('.')[0] 18 | ctg_name = os.path.basename(ctg_line).split('.')[0] 19 | if fq_name not in fq_size: 20 | fq_size[fq_name] = {} 21 | fq_size[fq_name]["fq_1"] = os.path.getsize(reads_a) 22 | fq_size[fq_name]["fq_2"] = os.path.getsize(reads_b) 23 | fq_size[fq_name]["fq_s"] = os.path.getsize(reads_s) 24 | if ctg_name not in ctg_size: 25 | ctg_size[ctg_name] = {} 26 | ctg_size[ctg_name] = os.path.getsize(ctg_line.strip()) 27 | 28 | assert sorted(fq_size.keys()) == sorted(ctg_size.keys()) 29 | 30 | for key in ctg_size: 31 | file_size_ = {} 32 | file_size_["sample_name"] = key 33 | file_size_["fq_1"] = fq_size[key]["fq_1"] 34 | file_size_["fq_2"] = fq_size[key]["fq_2"] 35 | file_size_["fq_s"] = fq_size[key]["fq_s"] 36 | file_size_["contig"] = ctg_size[key] 37 | file_size["body"].append(file_size_) 38 | 39 | with open(tsvout, 'w') as out_handle: 40 | f_tsv = csv.DictWriter(out_handle, file_size["header"], delimiter='\t') 41 | f_tsv.writeheader() 42 | f_tsv.writerows(file_size["body"]) 43 | 44 | 45 | def main(): 46 | '''main function''' 47 | parser = argparse.ArgumentParser( 48 | description='''research relationships between fastq size and contigs size: 49 | Usage: python fastq_contig_size_relationship.py --fqlist ./212S_rmhost_fqgz.pathlist.paired --ctglist ./212S_assembly_contigs.pathlist --tsvout fq_contigs_size.ts 50 | ''') 51 | parser.add_argument('--fqlist', type=str, 52 | help='rmhost fastq file path list') 53 | parser.add_argument('--ctglist', type=str, 54 | help='contigs file path list') 55 | parser.add_argument('--tsvout', type=str, 56 | help='tsv out put') 57 | args = parser.parse_args() 58 | gen_size_tsv(args.fqlist, args.ctglist, args.tsvout) 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /metapi/profiles/generic/scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import sys, os 5 | from subprocess import Popen, PIPE 6 | import yaml 7 | 8 | 9 | def eprint(*args, **kwargs): 10 | print(*args, file=sys.stderr, **kwargs) 11 | 12 | 13 | # let snakemake read job_properties 14 | from snakemake.utils import read_job_properties 15 | 16 | 17 | 18 | jobscript = sys.argv[1] 19 | job_properties = read_job_properties(jobscript) 20 | 21 | #default paramters defined in cluster_spec (accessed via snakemake read_job_properties) 22 | cluster_param= job_properties["cluster"] 23 | 24 | if job_properties["type"]=='single': 25 | cluster_param['name'] = job_properties['rule'] 26 | elif job_properties["type"]=='group': 27 | 
cluster_param['name'] = job_properties['groupid'] 28 | else: 29 | raise NotImplementedError(f"Don't know what to do with job_properties['type']=={job_properties['type']}") 30 | 31 | 32 | # don't overwrite default parameters if defined in rule (or config file) 33 | if ('threads' in job_properties) and ('threads' not in cluster_param): 34 | cluster_param["threads"] = job_properties["threads"] 35 | for res in ['time','mem']: 36 | if (res in job_properties["resources"]) and (res not in cluster_param): 37 | cluster_param[res] = job_properties["resources"][res] 38 | 39 | # time in hours 40 | if "time" in cluster_param: 41 | cluster_param["time"]=int(cluster_param["time"]*60) 42 | 43 | 44 | # check which system you are on and load command command_options 45 | key_mapping_file=os.path.join(os.path.dirname(__file__),"key_mapping.yaml") 46 | command_options=yaml.load(open(key_mapping_file), 47 | Loader=yaml.BaseLoader) 48 | system= command_options['system'] 49 | command= command_options[system]['command'] 50 | 51 | key_mapping= command_options[system]['key_mapping'] 52 | 53 | # construct command: 54 | for key in key_mapping: 55 | if key in cluster_param: 56 | command+=" " 57 | command+=key_mapping[key].format(cluster_param[key]) 58 | 59 | command+=' {}'.format(jobscript) 60 | 61 | eprint("submit command: "+command) 62 | 63 | p = Popen(command.split(' '), stdout=PIPE, stderr=PIPE) 64 | output, error = p.communicate() 65 | if p.returncode != 0: 66 | raise Exception("Job can't be submitted\n"+output.decode("utf-8")+error.decode("utf-8")) 67 | else: 68 | res= output.decode("utf-8") 69 | 70 | if system=='lsf': 71 | import re 72 | match = re.search(r"Job <(\d+)> is submitted", res) 73 | jobid = match.group(1) 74 | 75 | elif system=='pbs': 76 | jobid= res.strip().split('.')[0] 77 | 78 | else: 79 | jobid= int(res.strip().split()[-1]) 80 | 81 | print(jobid) 82 | -------------------------------------------------------------------------------- /scripts/split_fx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # please see http://biopython.org/wiki/Split_large_file 3 | import argparse 4 | import os 5 | import errno 6 | 7 | from Bio import SeqIO 8 | 9 | 10 | def batch_iterator(iterator, batch_size): 11 | """Returns lists of length batch_size. 12 | 13 | This can be used on any iterator, for example to batch up 14 | SeqRecord objects from Bio.SeqIO.parse(...), or to batch 15 | Alignment objects from Bio.AlignIO.parse(...), or simply 16 | lines from a file handle. 17 | 18 | This is a generator function, and it returns lists of the 19 | entries from the supplied iterator. Each list will have 20 | batch_size entries, although the final list may be shorter. 
21 | """ 22 | 23 | entry = True 24 | while entry: 25 | batch = [] 26 | while len(batch) < batch_size: 27 | try: 28 | # entry = iterator.next() 29 | entry = next(iterator) 30 | except StopIteration: 31 | entry = None 32 | if entry is None: 33 | break 34 | batch.append(entry) 35 | if batch: 36 | yield batch 37 | 38 | 39 | # TODO 40 | # def split_fastq() 41 | # def split_alignment() 42 | 43 | 44 | def split_fasta(fa_file, batch_size, outdir, onedir): 45 | record_iter = SeqIO.parse(open(fa_file, 'r'), "fasta") 46 | i = 0 47 | for i, batch in enumerate(batch_iterator(record_iter, batch_size), start = 1): 48 | if onedir: 49 | splitfa = os.path.join(outdir, "split_%i.fa" % (i)) 50 | else: 51 | splitdir = os.path.join(outdir, "split_" + str(i)) 52 | try: 53 | os.makedirs(splitdir) 54 | except OSError as e: 55 | if e.errno != errno.EEXIST: 56 | raise 57 | splitfa = os.path.join(splitdir, "split_%i.fa" % (i)) 58 | 59 | with open(splitfa, 'w') as out_h: 60 | count = SeqIO.write(batch, out_h, "fasta") 61 | print("wrote %i records to %s" % (count, splitfa)) 62 | return i 63 | 64 | 65 | def main(): 66 | """split large fasta/fastq file by seq size""" 67 | parser = argparse.ArgumentParser(description='split large fasta/fastq file by seq size') 68 | parser.add_argument('-f', type=str, help='input file, a large fasta or fastq file') 69 | parser.add_argument('-n', type=int, help='each splited file base size', default=1000) 70 | parser.add_argument('-outdir', type=str, help='a directory store splited file') 71 | 72 | args = parser.parse_args() 73 | split_fasta(args.f, args.n, args.outdir, False) 74 | 75 | if __name__ == '__main__': 76 | main() -------------------------------------------------------------------------------- /scripts/megahit_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: liuxing 3 | # Email: liuxing2@genomics.cn 4 | 5 | if [[ $# -ne 8 ]];then 6 | echo 7 | echo "usage: $0 -l FastaqFileList -o OutputDirPath -d HdfsOutputPath -n NumberOfTasks 8 | -l fastaq file list, please make a list including all fastq file, path one sample per line 9 | and seperate the read1 read2 and singleRead with space or table e.g: read1.fq read2.fq singleRead.fq 10 | -o output directory path, the directory that you would write the run script and assembly result 11 | -d HDFS output path, e.g: /user/liuxing2/megahitout 12 | -n the number of tasks, equal to the number of the samples " 13 | echo 14 | else 15 | while [[ -n "$1" ]] 16 | do 17 | case "$1" in 18 | -l) fqfilelist="$2" 19 | shift ;; 20 | -o) outpath="$2" 21 | shift ;; 22 | -d) dfsoutpath="$2" 23 | shift ;; 24 | -n) maps="$2" 25 | shift ;; 26 | esac 27 | shift 28 | done 29 | if [[ ! 
-d $outpath ]];then 30 | mkdir $outpath 31 | fi 32 | 33 | echo "while read LINE 34 | do 35 | if [[ -n \$LINE ]];then 36 | echo \$LINE; 37 | read1=\`echo \$LINE| awk '{print \$2}'\` 38 | read2=\`echo \$LINE| awk '{print \$3}'\` 39 | reads=\`echo \$LINE| awk '{print \$4}'\` 40 | base=\`basename \$read1\` 41 | prefix=\${base%%.*} 42 | outputfilename=\${prefix}.megahit_asm 43 | /hwfssz1/ST_META/CD/zhujie/program/bioenv/bin/megahit -1 \$read1 -2 \$read2 -r \$reads -o ${outpath}/\$outputfilename --out-prefix \$prefix 44 | fi 45 | done" >${outpath}/megahit.sh 46 | 47 | echo "/hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/bin/hadoop fs -rm -r -skipTrash $dfsoutpath 48 | /hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/bin/hadoop jar /hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/jars/hadoop-streaming-2.6.0-cdh5.11.1.jar -D mapreduce.job.name=\"megahit\" -D mapreduce.job.maps=$maps -D mapreduce.job.reduces=0 -D mapreduce.map.memory.mb=25600 -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat -input file:$fqfilelist -output $dfsoutpath -mapper \"sh megahit.sh\" -file ${outpath}/megahit.sh 49 | 50 | /hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/bin/hadoop fs -rm -r -skipTrash $dfsoutpath" >${outpath}/megahit_hadoopsubmit.sh 51 | fi 52 | 53 | -------------------------------------------------------------------------------- /scripts/get_bin_id_by_ccsh.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import csv 4 | import os 5 | import re 6 | 7 | 8 | def get_bin_id(checkmout_list, out_tsv, completeness, contamination): 9 | headers = [ 10 | "sample_id", "bin_id", "marker_lineage", "genomes", "markers", 11 | "marker_sets", "completeness", "contamination", "strain_heterogeneity" 12 | ] 13 | samples_bin_info = [] 14 | with open(checkmout_list, "r") as list_handle: 15 | for checkmout in list_handle: 16 | with open(checkmout.strip(), 'r') as checkmout_handle: 17 | print("processing %s" % checkmout.strip()) 18 | sample_id = os.path.basename(checkmout.strip()).split('.')[0] 19 | next(checkmout_handle) 20 | next(checkmout_handle) 21 | next(checkmout_handle) 22 | for info in checkmout_handle: 23 | if info.strip().startswith("R0"): 24 | info_l = re.split(r'\s+', info.strip()) 25 | if (float(info_l[-2]) < contamination) and (float( 26 | info_l[-3]) > completeness): 27 | bin_info = {} 28 | bin_info['sample_id'] = sample_id 29 | bin_info["bin_id"] = info_l[0] 30 | bin_info[ 31 | "marker_lineage"] = info_l[1] + " " + info_l[2] 32 | bin_info["genomes"] = info_l[3] 33 | bin_info["markers"] = info_l[4] 34 | bin_info["marker_sets"] = info_l[5] 35 | bin_info["completeness"] = info_l[-3] 36 | bin_info["contamination"] = info_l[-2] 37 | bin_info["strain_heterogeneity"] = info_l[-1] 38 | samples_bin_info.append(bin_info) 39 | with open(out_tsv, 'w') as out_handle: 40 | f_tsv = csv.DictWriter(out_handle, headers, delimiter="\t") 41 | f_tsv.writeheader() 42 | f_tsv.writerows(samples_bin_info) 43 | 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser( 47 | description='''get bin id by completeness cutoff and contamination 48 | cutoff''') 49 | parser.add_argument('-l', type=str, help='checkmout list of many samples') 50 | parser.add_argument( 51 | '-o', 52 | type=str, 53 | help='bin id and completeness, contamination output file') 54 | parser.add_argument( 55 | '-c1', type=float, help='completeness cutoff', default=70.0) 56 | parser.add_argument( 57 | '-c2', type=float, 
help='contamination cutoff', default=30.0) 58 | args = parser.parse_args() 59 | get_bin_id(args.l, args.o, args.c1, args.c2) 60 | 61 | 62 | if __name__ == '__main__': 63 | main() 64 | -------------------------------------------------------------------------------- /scripts/insert_size_ploter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import pandas as pd 5 | from glob import glob 6 | import os 7 | import re 8 | from plotnine import * 9 | 10 | 11 | def parse_bam_stats(bam_stats_list): 12 | insert_size_df = pd.DataFrame() 13 | bam_stats_list_ = [] 14 | if re.search(r'\*', bam_stats_list[0]): 15 | bam_stats_list_ = glob(bam_stats_list[0]) 16 | else: 17 | bam_stats_list_ = bam_stats_list 18 | 19 | for bam_stats_file in bam_stats_list_: 20 | df = pd.DataFrame(columns=["insert_size", "pairs_total", 21 | "inward_oriented_pairs", 22 | "outward_oriented_pairs", 23 | "other_pairs", "sample_id"]) 24 | sample_id = os.path.basename(bam_stats_file).split(".")[0] 25 | with open(bam_stats_file, 'r') as ih: 26 | for line in ih: 27 | if line.startswith("IS"): 28 | line_list = re.split(r'\s+', line.strip()) 29 | df = df.append({"sample_id": sample_id, 30 | "insert_size": line_list[1], 31 | "pairs_total": line_list[2], 32 | "inward_oriented_pairs": line_list[3], 33 | "outward_oriented_pairs": line_list[4], 34 | "other_pairs": line_list[5]}, ignore_index=True) 35 | 36 | insert_size_df = pd.concat([insert_size_df, df]) 37 | return insert_size_df 38 | 39 | 40 | def plot_insert_size(insert_size_df, outpdf): 41 | df_l = insert_size_df.melt(id_vars=["insert_size", "sample_id"], 42 | value_vars=["pairs_total", 43 | "inward_oriented_pairs", 44 | "outward_oriented_pairs", 45 | "other_pairs"], 46 | var_name="type", 47 | value_name="count") 48 | is_plot = (ggplot(df_l, aes(x='insert_size', y='count')) 49 | + geom_point(aes(fill='type', colour='type'), size=0.2) 50 | + facet_wrap('~sample_id', scales='free') 51 | + ggtitle('insert size distribution')) 52 | is_plot.save(outpdf, width=16, height=16) 53 | 54 | 55 | def main(): 56 | parser = argparse.ArgumentParser('plot insert size for samtools bamstats') 57 | parser.add_argument('-i', nargs='*', help='bamstats file list, separated by spaces') 58 | parser.add_argument('-o', type=str, help='insert size plot output, pdf format') 59 | 60 | args = parser.parse_args() 61 | 62 | df = parse_bam_stats(args.i) 63 | plot_insert_size(df, args.o) 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /scripts/megahit_sge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 2 | import shutil 3 | import os 4 | from datetime import datetime 5 | import argparse 6 | 7 | from asub import submit_job 8 | 9 | # TODO 10 | #def assembly(fqlist): 11 | 12 | def coassembly(fqlist, thread, outdir, prefix, queue, project, resource): 13 | r1 = [] 14 | r2 = [] 15 | with open(fqlist, 'r') as in_handle: 16 | for line in in_handle: 17 | fq_1, fq_2 = line.strip().split("\t") 18 | r1.append(os.path.abspath(fq_1)) 19 | r2.append(os.path.abspath(fq_2)) 20 | pe1 = ",".join(r1) 21 | pe2 = ",".join(r2) 22 | coasm_shell = "%s -1 %s -2 %s -t %d --out-dir %s --out-prefix %s\n" % (shutil.which("megahit"), pe1, pe2, thread, outdir, prefix) 23 | print(coasm_shell) 24 | 25 | with open("./megahit_coasm.sh", 'w') as sh_h: 26 | sh_h.write(coasm_shell) 27 | with 
open("./megahit_coasm_submit.sh", 'w') as sge_h: 28 | sge_h.write("qsub -cwd -q %s -P %s -l %s megahit_coasm.sh\n" % (queue, project, resource)) 29 | 30 | ''' 31 | jobname = "megahit_coasm" + "_" + datetime.now().strftime("%Y%m%d%H%M%S") 32 | logdir = jobname + "_qsub" 33 | if os.path.exists(logdir): 34 | os.remove(logdir) 35 | os.makedirs(logdir) 36 | 37 | jobfile = os.path.join(logdir, jobname + "_1.sh") 38 | with open(jobfile, 'w') as out_handle: 39 | out_handle.write(coasm_shell) 40 | 41 | submit_job(jobname, 1, queue, project, resource, logdir) 42 | ''' 43 | 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser(description='using megahit to do assembly or coassembly') 47 | parser.add_argument('-asm', action='store_true', help='do assembly', default=False) 48 | parser.add_argument('-coasm', action='store_true', help='do coassembly', default=False) 49 | parser.add_argument('-fqlist', type=str, help='clean pair-ended reads, each line format: reads_1.fq.gz reads_2.fq.gz') 50 | parser.add_argument('-thread', type=int, help="number of CPU threads, at least 2 if GPU enabled. [# of logical processors]", default=8) 51 | parser.add_argument('-outdir', type=str, help='output directory', default="coasm_results") 52 | parser.add_argument('-prefix', type=str, help='coassembly prefix', default="megahit_coasm.out") 53 | parser.add_argument('-queue', type=str, help='submit queue', default='st.q') 54 | parser.add_argument('-project', type=str, help='project id', default='F16ZQSB1SY2779') 55 | parser.add_argument('-resource',type=str, help='resourse requirment', default='vf=30G,p=8') 56 | 57 | args = parser.parse_args() 58 | 59 | assert int(args.resource.split("=")[2]) == args.thread, "please let p number equal thread number" 60 | 61 | #if args.asm: 62 | # assembly(args.fqlist) 63 | 64 | if args.coasm: 65 | coassembly(args.fqlist, args.thread, args.outdir, args.prefix, args.queue, args.project, args.resource) 66 | 67 | if __name__ == '__main__': 68 | main() -------------------------------------------------------------------------------- /scripts/split_mummer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import errno 4 | import os 5 | import shutil 6 | 7 | from asub import submit_job 8 | from split_fx import split_fasta 9 | 10 | 11 | def gen_job(qry_fa, min_cluster, split_num, split_dir, job_dir, results_dir): 12 | nucmer = shutil.which("nucmer") 13 | for i in range(1, split_num + 1): 14 | # split/split_1.fa 15 | # job/mummer_1.sh 16 | # split/split_2.fa 17 | # job/mummer_2.sh 18 | # results/nucmer_1.delta 19 | job_sh = os.path.join(job_dir, "mummer_%i.sh" % (i)) 20 | ref_fa = os.path.join(split_dir, "split_%i.fa" % (i)) 21 | prefix = os.path.join(results_dir, "nucmer_%i" % (i)) 22 | with open(job_sh, 'w') as job_h: 23 | job_h.write("%s -maxmatch -c %d %s %s -p %s\n" % (nucmer, min_cluster, ref_fa, qry_fa, prefix)) 24 | 25 | # TODO 26 | # def merge(): 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser(description='''split reference, submit mummer array job to SGE, finally merge mummer results''') 30 | parser.add_argument('-ref', type=str, help='reference fasta file') 31 | parser.add_argument('-qry', type=str, help='query fasta file') 32 | parser.add_argument('-c', type=int, help='Sets the minimum length of a cluster of matches, default: 65', default=65) 33 | parser.add_argument('-size', type=int, help='how many seq records split into a group, default: 10000', default=10000) 34 | 
parser.add_argument('-outdir', type=str, help='output directory, default: ./', default="./") 35 | parser.add_argument('-queue', type=str, help='submit queue, default: st.q', default='st.q') 36 | parser.add_argument('-project', type=str, help='project id, default: F16ZQSB1SY2779', default='F16ZQSB1SY2779') 37 | parser.add_argument('-resource',type=str, help='resourse requirment, default: vf=1G,p=1', default='vf=1G,p=1') 38 | args = parser.parse_args() 39 | 40 | # make split, job, results dirs 41 | split_dir = os.path.join(os.path.abspath(args.outdir), "split") 42 | try: 43 | os.makedirs(split_dir) 44 | except OSError as e: 45 | if e.errno != errno.EEXIST: 46 | raise 47 | job_dir = os.path.join(os.path.abspath(args.outdir), "job") 48 | try: 49 | os.makedirs(job_dir) 50 | except OSError as e: 51 | if e.errno != errno.EEXIST: 52 | raise 53 | results_dir = os.path.join(os.path.abspath(args.outdir), "results") 54 | try: 55 | os.makedirs(results_dir) 56 | except OSError as e: 57 | if e.errno != errno.EEXIST: 58 | raise 59 | qry_fa = os.path.abspath(args.qry) 60 | 61 | # split reference fasta 62 | split_num = split_fasta(args.ref, args.size, split_dir, True) 63 | gen_job(qry_fa, args.c, split_num, split_dir, job_dir, results_dir) 64 | submit_job("mummer", split_num, args.queue, args.project, args.resource, job_dir) 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /scripts/assembly_info.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | library(ggplot2) 3 | library(dplyr) 4 | library(tidyr) 5 | library(readr) 6 | library(stringr) 7 | library(argparser) 8 | library(here) 9 | 10 | parse_asm <- function(path_f) 11 | { 12 | return 13 | read_delim(path_f, delim = '\t') %>% 14 | arrange(scaf_L50) %>% 15 | select( 16 | filename, n_scaffolds, scaf_bp, 17 | scaf_N50, scaf_L50, 18 | scaf_N90, scaf_L90, 19 | scaf_max, scaf_n_gt50K, scaf_pct_gt50K, 20 | gc_avg, gc_std) 21 | } 22 | 23 | average_asm <- function(asm_df) 24 | { 25 | return 26 | asm_df %>% 27 | select( 28 | n_scaffolds, scaf_bp, 29 | scaf_N50, scaf_L50, 30 | scaf_N90, scaf_L90, scaf_max, 31 | scaf_n_gt50K, scaf_pct_gt50K, 32 | gc_avg, gc_std) %>% 33 | summarise( 34 | n_scaffolds_average = mean(n_scaffolds), 35 | scaf_bp_average = mean(scaf_bp), 36 | scaf_N50_average = mean(scaf_N50), 37 | scaf_L50_average = mean(scaf_L50), 38 | scaf_N90_average = mean(scaf_N90), 39 | scaf_L90_average = mean(scaf_L90), 40 | scaf_max_average = mean(scaf_max), 41 | scaf_n_gt50K_average = mean(scaf_n_gt50K), 42 | scaf_pct_gt50K_average = mean(scaf_pct_gt50K), 43 | gc_avg_average = mean(gc_avg), 44 | gc_std_average = mean(gc_std)) %>% 45 | gather(key, value) %>% 46 | mutate(value_human = value / 1000) 47 | } 48 | 49 | asm_boxplot <- function(df, title) 50 | { 51 | p <- 52 | df %>% 53 | gather(key, value, -filename) %>% 54 | mutate(key = factor( 55 | key, 56 | levels = c( 57 | "n_scaffolds", "scaf_bp", 58 | "scaf_N50", "scaf_L50", 59 | "scaf_N90", "scaf_L90", 60 | "scaf_max", "scaf_n_gt50K", "scaf_pct_gt50K", 61 | "gc_avg", "gc_std"))) %>% 62 | ggplot(., aes(key, value)) + 63 | geom_boxplot(aes(fill = key), outlier.size = 0.5) + 64 | geom_jitter(size = 1, width = 0.25) + 65 | facet_wrap(~ key, scales = "free") + 66 | theme( 67 | panel.grid = element_blank(), 68 | axis.text.x = element_blank(), 69 | axis.ticks.x = element_blank(), 70 | axis.title = element_blank(), 71 | legend.title = element_blank()) + 72 | ggtitle(title) 
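# Note: the free scales per facet keep metrics with very different ranges (scaf_bp vs. gc_avg)
# readable in one figure, and the jittered points show the per-sample values behind each boxplot.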
73 | return(p) 74 | } 75 | 76 | parser <- arg_parser("plot assembly statistics") %>% 77 | add_argument("--assembly_info", help="assembly statistics info table") %>% 78 | add_argument("--pdf", help="assembly statistics plot", default="assembly_statistics.pdf") 79 | 80 | args <- parse_args(parser) 81 | asm_df <- parse_asm(args$assembly_info) 82 | average_asm_df <- average_asm(asm_df) 83 | plot <- asm_boxplot(asm_df, "8 soil and 2 wood samples megahit assembly statistics") 84 | ggsave(args$pdf, plot, width = 10, height = 10) 85 | -------------------------------------------------------------------------------- /metapi/predictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import gzip 5 | import re 6 | import sys 7 | import argparse 8 | from Bio import SeqIO 9 | 10 | 11 | def parse_gff(gff_file, min_len): 12 | ''' 13 | extract pep id under the requirement of the minimal contig length cutoff from the GFF file generated by prodigal 14 | ''' 15 | save = False 16 | min_len = int(min_len) 17 | pep_id_list = [] 18 | with gzip.open(gff_file, "rt") as ih: 19 | for line in ih: 20 | seq_len = 0 21 | 22 | if line.startswith("##") or line.startswith("# Model Data") or line.strip() == '"': 23 | continue 24 | 25 | elif line.startswith("# Sequence Data"): 26 | line_split = line.strip().split(";") 27 | for token in line_split: 28 | if "seqlen=" in token: 29 | seq_len = int(token[token.find("=") + 1:]) 30 | if seq_len < min_len: 31 | save = False 32 | else: 33 | save = True 34 | elif save: 35 | line_split = re.split("\\s+", line.strip()) 36 | seq_id = line_split[0] 37 | trans_id = line_split[-1].split(";")[0].split("_")[-1] 38 | pep_id = f'''{seq_id}_{trans_id}''' 39 | pep_id_list.append(pep_id) 40 | else: 41 | continue 42 | return pep_id_list 43 | 44 | 45 | def extract_faa(faa_file, pep_id_list, out_file, assembly_group=None): 46 | if os.path.dirname(out_file) != "": 47 | os.makedirs(os.path.dirname(out_file), exist_ok=True) 48 | 49 | with gzip.open(out_file, "wt") as oh: 50 | with gzip.open(faa_file, "rt") as ih: 51 | for seq in SeqIO.parse(ih, "fasta"): 52 | if seq.id in pep_id_list: 53 | if assembly_group is not None: 54 | seq.id = f'''{assembly_group}C{seq.id}''' 55 | seq.name = f'''{assembly_group}C{seq.name}''' 56 | seq.description = f'''{assembly_group}C{seq.description}''' 57 | SeqIO.write(seq, oh, "fasta") 58 | 59 | 60 | def main(): 61 | parser = argparse.ArgumentParser("PEP extractor") 62 | parser.add_argument("--faa-file", dest="faa_file", type=str, required=True, help="protein file, gzipped") 63 | parser.add_argument("--gff-file", dest="gff_file", type=str, required=True, help="gff file, gzipped") 64 | parser.add_argument("--min-contig", dest="min_contig", default=2000, type=int, help="minimal contig length, default: 2000") 65 | parser.add_argument("--out-file", dest="out_file", type=str, required=True, help="output protein file, gzipped") 66 | args = parser.parse_args() 67 | 68 | pep_id_list = parse_gff(args.gff_file, args.min_contig) 69 | if len(pep_id_list) > 0: 70 | extract_faa(args.faa_file, pep_id_list, args.out_file) 71 | else: 72 | sys.exit("Emplty protein file after contigs length control") 73 | 74 | 75 | if __name__ == '__main__': 76 | main() -------------------------------------------------------------------------------- /metapi/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from metapi.configer import metaconfig 4 | from 
metapi.configer import parse_yaml 5 | from metapi.configer import update_config 6 | from metapi.configer import custom_help_formatter 7 | 8 | from metapi.tooler import parse 9 | from metapi.tooler import merge 10 | 11 | from metapi.simulator import parse_genomes 12 | from metapi.simulator import get_simulate_info 13 | from metapi.simulator import simulate_short_reads 14 | 15 | from metapi.sampler import HEADERS 16 | from metapi.sampler import parse_samples 17 | from metapi.sampler import get_reads 18 | from metapi.sampler import get_sample_id 19 | from metapi.sampler import get_sample_id_ 20 | from metapi.sampler import get_samples_id_by_assembly_group 21 | from metapi.sampler import get_samples_id_by_binning_group 22 | from metapi.sampler import get_samples_id_by_assembly_and_binning_group 23 | from metapi.sampler import get_assembly_group_by_binning_group 24 | from metapi.sampler import get_binning_group_by_assembly_group 25 | from metapi.sampler import get_multibinning_group_by_assembly_group 26 | 27 | from metapi.sampler import get_raw_input_list 28 | from metapi.sampler import get_raw_input_dict 29 | 30 | from metapi.sampler import get_samples_for_assembly_list 31 | from metapi.sampler import get_samples_for_assembly_dict 32 | from metapi.sampler import get_samples_for_assembly_megahit 33 | from metapi.sampler import get_samples_for_assembly_idba_ud 34 | from metapi.sampler import get_samples_for_assembly_spades 35 | from metapi.sampler import get_samples_for_assembly_plass 36 | from metapi.sampler import get_samples_for_assembly_opera_ms 37 | from metapi.sampler import get_samples_for_metaquast 38 | 39 | from metapi.sampler import get_samples_bax 40 | from metapi.sampler import get_samples_bax_multi 41 | from metapi.sampler import get_samples_bax_multi_all 42 | from metapi.sampler import get_samples_scaftigs 43 | 44 | from metapi.qcer import change 45 | from metapi.qcer import compute_host_rate 46 | from metapi.qcer import qc_summary_merge 47 | from metapi.qcer import qc_bar_plot 48 | from metapi.qcer import parse_fastp_json 49 | 50 | from metapi.assembler import assembler_init 51 | from metapi.assembler import parse_assembly 52 | from metapi.assembler import parse_assembly_spades_params 53 | 54 | from metapi.aligner import flagstats_summary 55 | 56 | from metapi.predictor import parse_gff 57 | from metapi.predictor import extract_faa 58 | 59 | from metapi.binner import get_binning_info 60 | from metapi.binner import generate_mags 61 | from metapi.binner import extract_mags_report 62 | from metapi.binner import combine_jgi 63 | 64 | from metapi.checkmer import checkm_prepare 65 | from metapi.checkmer import checkm_reporter 66 | 67 | from metapi.classifier import demultiplex 68 | from metapi.classifier import gtdbtk_prepare_from_mags 69 | from metapi.classifier import gtdbtk_prepare_from_genes 70 | 71 | from metapi.taxonomyer import refine_taxonomy 72 | 73 | from metapi.uploader import gen_samples_info 74 | from metapi.uploader import gen_info 75 | 76 | from metapi.__about__ import __version__, __author__ 77 | 78 | name = "metapi" 79 | -------------------------------------------------------------------------------- /scripts/metapi_config_update.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | from metapi import configer 6 | 7 | 8 | def update_config( 9 | workdir, 10 | rmhost_host_fasta, 11 | rmhost_bwa_index, 12 | rmhost_bowtie2_index, 13 | kraken2_db, 14 | prof_index_metadata, 15 | 
prof_taxonomy, 16 | prof_jgi_index, 17 | project_id, 18 | ): 19 | 20 | conf_file = os.path.join(workdir, "config.yaml") 21 | cluster_file = os.path.join(workdir, "cluster.yaml") 22 | conf_file_up = os.path.join(workdir, "config_update.yaml") 23 | cluster_file_up = os.path.join(workdir, "cluster_update.yaml") 24 | 25 | conf = configer.parse_yaml(os.path.join(workdir, "config.yaml")) 26 | cluster = configer.parse_yaml(os.path.join(workdir, "cluster.yaml")) 27 | 28 | conf["params"]["rmhost"]["host_fasta"] = rmhost_host_fasta 29 | conf["params"]["rmhost"]["bwa"]["index_prefix"] = rmhost_bwa_index 30 | conf["params"]["rmhost"]["bowtie2"]["index_prefix"] = rmhost_bowtie2_index 31 | conf["params"]["classify"]["kraken2"]["database"] = kraken2_db 32 | conf["params"]["profiling"]["jgi"]["index_metadata"] = prof_index_metadata 33 | conf["params"]["profiling"]["jgi"]["taxonomy"] = prof_taxonomy 34 | conf["params"]["profiling"]["jgi"]["index_prefix"] = prof_jgi_index 35 | 36 | cluster["__default__"]["project"] = project_id 37 | 38 | configer.update_config(conf_file, conf_file_up, conf, remove=False) 39 | os.rename(conf_file_up, conf_file) 40 | 41 | configer.update_config(cluster_file, cluster_file_up, cluster, remove=False) 42 | os.rename(cluster_file_up, cluster_file) 43 | 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser("update metapi config.yaml") 47 | parser.add_argument("-d", "--workdir", type=str, help="work dir", default="./") 48 | parser.add_argument("-a", "--rmhost_host_fasta", type=str, help="rmhost host fasta") 49 | parser.add_argument( 50 | "-i", "--rmhost_bwa_index", type=str, help="rmhost bwa index prefix" 51 | ) 52 | parser.add_argument( 53 | "-I", "--rmhost_bowtie2_index", type=str, help="rmhost bowtie2 index prefix" 54 | ) 55 | parser.add_argument("-k", "--kraken2_db", type=str, help="kraken2 database") 56 | parser.add_argument( 57 | "-m", "--profiling_index_metadata", type=str, help="profiling index metadata" 58 | ) 59 | parser.add_argument( 60 | "-t", "--profiling_taxonomy", type=str, help="profiling taxonomy" 61 | ) 62 | parser.add_argument( 63 | "-j", "--profiling_jgi_index", type=str, help="profiling jgi index prefix" 64 | ) 65 | parser.add_argument("-p", "--project_id", type=str, help="project id") 66 | args = parser.parse_args() 67 | 68 | update_config( 69 | args.workdir, 70 | args.rmhost_host_fasta, 71 | args.rmhost_bwa_index, 72 | args.rmhost_bowtie2_index, 73 | args.kraken2_db, 74 | args.profiling_index_metadata, 75 | args.profiling_taxonomy, 76 | args.profiling_jgi_index, 77 | args.project_id, 78 | ) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /metapi/wrappers/prodigal_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import glob 4 | import os 5 | import stat 6 | import sys 7 | import subprocess 8 | import concurrent.futures 9 | 10 | import pandas as pd 11 | from checkm import prodigal 12 | 13 | 14 | def run_prodigal(input_list): 15 | bin_fa = os.path.abspath(input_list[0]) 16 | output_dir = os.path.abspath(input_list[1]) 17 | 18 | bin_id = os.path.basename(os.path.splitext(os.path.splitext(bin_fa)[0])[0]) 19 | 20 | pep_file = os.path.join(output_dir, bin_id + ".faa") 21 | cds_file = os.path.join(output_dir, bin_id + ".ffn") 22 | gff_file = os.path.join(output_dir, bin_id + ".gff") 23 | 24 | pep_file_gz = pep_file + ".gz" 25 | cds_file_gz = cds_file + ".gz" 26 | gff_file_gz = gff_file + ".gz" 
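# Note: ProdigalRunner below is CheckM's wrapper around prodigal rather than a direct
# subprocess call; overriding its aaGeneFile/ntGeneFile/gffFile attributes points the
# predicted proteins, CDS and GFF at the per-bin paths built above, and the value it
# returns is the translation table CheckM judged best (the checks further down expect 4 or 11).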
27 | 28 | prodigal_runner = prodigal.ProdigalRunner(output_dir) 29 | prodigal_runner.aaGeneFile = pep_file 30 | prodigal_runner.ntGeneFile = cds_file 31 | prodigal_runner.gffFile = gff_file 32 | 33 | best_translation_table = prodigal_runner.run(bin_fa, True) 34 | 35 | if os.path.exists(pep_file) and (os.path.getsize(pep_file) > 0): 36 | subprocess.run(f'''pigz -f {pep_file}''', shell=True) 37 | if os.path.exists(cds_file) and (os.path.getsize(cds_file) > 0): 38 | subprocess.run(f'''pigz -f {cds_file}''', shell=True) 39 | if os.path.exists(gff_file) and (os.path.getsize(gff_file) > 0): 40 | subprocess.run(f'''pigz -f {gff_file}''', shell=True) 41 | else: 42 | subprocess.run(f'''rm -rf {pep_file}''', shell=True) 43 | subprocess.run(f'''rm -rf {cds_file}''', shell=True) 44 | subprocess.run(f'''rm -rf {gff_file}''', shell=True) 45 | 46 | if best_translation_table in [4, 11]: 47 | if (os.path.exists(pep_file_gz)) and (os.path.exists(cds_file_gz)) and (os.path.exists(gff_file_gz)) and (os.stat(pep_file_gz)[stat.ST_SIZE]) > 0: 48 | return (bin_id, bin_fa, pep_file_gz, best_translation_table) 49 | else: 50 | return None 51 | else: 52 | if (os.path.exists(pep_file_gz)) and (os.path.exists(cds_file_gz)) and (os.path.exists(gff_file_gz)) and (os.stat(pep_file_gz)[stat.ST_SIZE]) > 0: 53 | return (bin_id, bin_fa, pep_file_gz, f"unknown: {best_translation_table}") 54 | else: 55 | return None 56 | 57 | 58 | workers = int(sys.argv[1]) 59 | input_mags_dir = os.path.dirname(sys.argv[2]) 60 | output_done = sys.argv[3] 61 | output_dir = os.path.dirname(output_done) 62 | 63 | bin_list = glob.glob(input_mags_dir + "/*.fa.gz") 64 | 65 | input_list = [] 66 | for bin_fa in bin_list: 67 | input_list.append((bin_fa, output_dir)) 68 | 69 | table_list = [] 70 | 71 | 72 | subprocess.run(f'''rm -rf {output_dir}''', shell=True) 73 | subprocess.run(f'''mkdir -p {output_dir}''', shell=True) 74 | 75 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 76 | for table_df in executor.map(run_prodigal, input_list): 77 | if table_df is not None: 78 | table_list.append(table_df) 79 | 80 | table_df = pd.DataFrame(table_list, columns=["bin_id", "bin_file", "pep_file", "best_translation_table"]) 81 | table_df.to_csv(output_done, sep="\t", index=False) -------------------------------------------------------------------------------- /scripts/merge_checkm_out.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import pandas as pd 5 | import re 6 | from glob import glob 7 | import sys 8 | from pprint import pprint 9 | 10 | 11 | def merge(checkm_list, sort_by): 12 | df = pd.DataFrame() 13 | if re.search(r'\*', checkm_list[0]): 14 | checkm_list_ = glob(checkm_list[0]) 15 | else: 16 | checkm_list_ = checkm_list 17 | for checkm_file in checkm_list_: 18 | checkm_df = pd.DataFrame(columns=["bin_id", "marker_lineage", 19 | "genomes", "markers", "marker_sets", 20 | "0", "1", "2", "3", "4", "5+", 21 | "completeness", "contamination", "strain_heterogeneity"]) 22 | with open(checkm_file, 'r') as ih: 23 | print("analysis %s" % checkm_file) 24 | next(ih), next(ih), next(ih) 25 | for line in ih: 26 | if not line.startswith("--"): 27 | line_list = re.split(r'\s+', line.strip()) 28 | checkm_df = checkm_df.append({"bin_id": line_list[0], 29 | "marker_lineage": "-".join(line_list[1:3]), 30 | "genomes": line_list[3], 31 | "markers": line_list[4], 32 | "marker_sets": line_list[5], 33 | "0": line_list[6], 34 | "1": line_list[7], 35 | "2": 
line_list[8], 36 | "3": line_list[9], 37 | "4": line_list[10], 38 | "5+": line_list[11], 39 | "completeness": line_list[12], 40 | "contamination": line_list[13], 41 | "strain_heterogeneity": line_list[14]}, ignore_index=True) 42 | df = pd.concat([df, checkm_df]) 43 | if sort_by == "completeness": 44 | df = df.sort_values(by=["completeness", "contamination", "strain_heterogeneity"], 45 | ascending=[False, True, True]) 46 | else: 47 | df = df.sort_values(by="bin_id") 48 | return df 49 | 50 | 51 | def main(): 52 | parser = argparse.ArgumentParser("merge many checkm out txt to one") 53 | parser.add_argument('-l', '--list', nargs='*', help='checkm out txt list, separated by spaces') 54 | parser.add_argument('-o', '--output', default=sys.stdout, 55 | help='merge results, if not specific it, will print stdout') 56 | parser.add_argument('-s', '--sort', choices=['bin_id', 'completeness'], default="completeness", 57 | help='sort merged checkm output') 58 | args = parser.parse_args() 59 | 60 | df = merge(args.list, args.sort) 61 | df.to_csv(args.output, sep='\t', index=False) 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /metapi/simulator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import gzip 5 | import sys 6 | import subprocess 7 | import pandas as pd 8 | from Bio import SeqIO 9 | 10 | 11 | def parse_genomes(samples_tsv, output_dir, check_samples=False): 12 | header = ["id", "genome", "abundance", "reads_num", "model"] 13 | 14 | genomes_df = pd.read_csv(samples_tsv, sep="\t").set_index("id", drop=False) 15 | 16 | cancel = False 17 | for i in header: 18 | if i not in genomes_df.columns: 19 | cancel = True 20 | print(f'Error: {i} not in {genomes_df.columns} header') 21 | 22 | for i in genomes_df.index.unique(): 23 | if "." 
in i: 24 | cancel = True 25 | print('Error: sample id %s contains ".", please remove all "."' % i) 26 | 27 | if cancel: 28 | sys.exit(1) 29 | 30 | genomes_df["fq1"] = genomes_df.apply( 31 | lambda x: os.path.join( 32 | output_dir, "short_reads/%s.simulate.1.fq.gz" % x["id"], 33 | ), 34 | axis=1, 35 | ) 36 | genomes_df["fq2"] = genomes_df.apply( 37 | lambda x: os.path.join( 38 | output_dir, "short_reads/%s.simulate.2.fq.gz" % x["id"], 39 | ), 40 | axis=1, 41 | ) 42 | return genomes_df 43 | 44 | 45 | def simulate_short_reads( 46 | genomes, output_prefix, r1, r2, abunf, model, reads_num, abundance, threads, logf, 47 | ): 48 | if len(abundance) != 0: 49 | with open(abunf, "w") as outh: 50 | for (g, a) in zip(genomes, abundance): 51 | inh = gzip.open(g, "rt") if g.endswith(".gz") else open(g, "r") 52 | genome = [] 53 | total_len = 0 54 | for record in SeqIO.parse(inh, "fasta"): 55 | total_len += len(record.seq) 56 | genome.append((record.id, len(record.seq))) 57 | for s in genome: 58 | outh.write("%s\t%f\n" % 59 | (s[0], float(a) * s[1] / total_len)) 60 | inh.close() 61 | 62 | args = ( 63 | ["iss", "generate", "--cpus", str(threads), "--genomes"] 64 | + genomes 65 | + ["--n_reads", reads_num, "--model", model, "--output", output_prefix] 66 | ) 67 | 68 | if len(abundance) != 0: 69 | args += ["--abundance_file", abunf] 70 | print(" ".join(args)) 71 | env = os.environ.copy() 72 | proc = subprocess.Popen( 73 | args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, encoding="utf-8", 74 | ) 75 | output, error = proc.communicate() 76 | 77 | with open(logf, "w") as logh: 78 | logh.write(error) 79 | 80 | if proc.returncode == 0: 81 | if len(abundance) == 0: 82 | default_abunf = output_prefix + "_abundance.txt" 83 | if os.path.exists(default_abunf): 84 | os.rename(default_abunf, abunf) 85 | subprocess.run(f"pigz -p {threads} {output_prefix}_R1.fastq", shell=True) 86 | subprocess.run(f"pigz -p {threads} {output_prefix}_R2.fastq", shell=True) 87 | os.rename(f"{output_prefix}_R1.fastq.gz", r1) 88 | os.rename(f"{output_prefix}_R2.fastq.gz", r2) 89 | else: 90 | sys.exit(1) 91 | 92 | 93 | def get_simulate_info(genomes_df, wildcards, col): 94 | return genomes_df.loc[[wildcards.sample], col].dropna().tolist() 95 | -------------------------------------------------------------------------------- /scripts/clstr_szie_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #from Bio.SeqIO.FastaIO import SimpleFastaParser 3 | import argparse 4 | import re 5 | 6 | pattern = re.compile(r'\d+\t(\d+)[a-z]{2}, >(.+)\.\.\. \*') 7 | #pattern = re.compile(r'\d+\t(\d+)[a-z]{2},\s>(.+)\.\.\.\s\*') 8 | #pattern = re.compile(r'\d+\t(\d+)nt, >(.+)\.\.\. \*') 9 | #pattern = re.compile(r'\d+\t(\d+)nt,\s>(.+)\.\.\.\s\*') 10 | 11 | # this parser base code comes from Bio.SeqIO.FastaIO.SimpleFastaParser :) 12 | def cdhit_clstr_parser(handle): 13 | """Generator function to iterate over cdhit clstr records (as string tuple) 14 | 15 | >Cluster 0 16 | 0 1131322nt, >k119_12676... * 17 | 1 84315nt, >k119_210239... at -/99.66% 18 | 2 73592nt, >k119_187067... at +/99.86% 19 | 3 70665nt, >k119_160147... at -/99.32% 20 | 4 66352nt, >k119_217379... at +/99.89% 21 | 5 63337nt, >k119_125106... at +/99.28% 22 | 6 63232nt, >k119_150147... at -/99.80% 23 | 7 59840nt, >k119_197728... at +/99.04% 24 | 8 59306nt, >k119_59391... at -/99.00% 25 | >Cluster 5343379 26 | 0 2000nt, >k119_192744... * 27 | >Cluster 5343380 28 | 0 2000nt, >k119_222307... 
* 29 | >Cluster 5343381 30 | 0 2000nt, >k119_232332... * 31 | >Cluster 5343382 32 | 0 2000nt, >k119_241124... * 33 | >Cluster 5343383 34 | 0 2000nt, >k119_253638... * 35 | 36 | """ 37 | #Skip any text before the first record (e.g. blank lines, comments) 38 | seq_id = "" 39 | seq_len = 0 40 | clstr_size = 0 41 | while True: 42 | line = handle.readline() 43 | if line == "": 44 | return # Premature end of file, or just empty? 45 | if line[0] == ">": 46 | break 47 | 48 | while True: 49 | clstr_size = 0 50 | if line[0] != ">": 51 | raise ValueError( 52 | "Records in cdhit cluster file(fasta format) should start with '>' character") 53 | clstr_name = line[1:].rstrip() 54 | line = handle.readline() 55 | while True: 56 | if not line: 57 | break 58 | if line[0] == ">": 59 | break 60 | # lines contain many cluster records 61 | #lines.append(line.rstrip()) 62 | clstr_size += 1 63 | matches = re.search(pattern, line) 64 | if matches: 65 | seq_len = matches.group(1) 66 | seq_id = matches.group(2) 67 | 68 | line = handle.readline() 69 | yield clstr_name, seq_id, seq_len, clstr_size 70 | 71 | if not line: 72 | return # StopIteration 73 | 74 | assert False, "Should not reach this line" 75 | 76 | def clstr_size_tab(clstr_file, clstr_size_out): 77 | with open(clstr_size_out, 'w') as out_handle: 78 | out_handle.write("cluster_name\tcluster_size\tsequence_id\tsequence_length\n") 79 | with open(clstr_file, 'r') as clstr_handle: 80 | for clstr_name, seq_id, seq_len, clstr_size in cdhit_clstr_parser(clstr_handle): 81 | clstr_name = "cluster_" + clstr_name.split(' ')[1] 82 | out_handle.write(clstr_name + "\t" + str(clstr_size) + "\t" + seq_id + "\t" + str(seq_len) + "\n") 83 | 84 | def main(): 85 | parser = argparse.ArgumentParser(description='parse cdhit cluster file and get cluste size distribution') 86 | parser.add_argument('--clstr', type=str, help='cluster file') 87 | parser.add_argument('--out', type=str, help='cluster size distribution') 88 | args = parser.parse_args() 89 | 90 | clstr_size_tab(args.clstr, args.out) 91 | 92 | if __name__ == '__main__': 93 | main() 94 | 95 | 96 | -------------------------------------------------------------------------------- /metapi/wrappers/gtdbtk_postprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import subprocess 4 | import concurrent.futures 5 | from pprint import pprint 6 | 7 | 8 | def parse(stats_file): 9 | if os.path.exists(stats_file): 10 | try: 11 | df = pd.read_csv(stats_file, sep="\t") 12 | except pd.errors.EmptyDataError: 13 | print("%s is empty, please check" % stats_file) 14 | return None 15 | 16 | if not df.empty: 17 | return df 18 | else: 19 | return None 20 | else: 21 | print("%s is not exists" % stats_file) 22 | return None 23 | 24 | 25 | def merge(input_list, func, workers, **kwargs): 26 | df_list = [] 27 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 28 | for df in executor.map(func, input_list): 29 | if df is not None: 30 | df_list.append(df) 31 | 32 | df_ = pd.concat(df_list) 33 | 34 | if "output" in kwargs: 35 | df_.to_csv(kwargs["output"], sep="\t", index=False) 36 | return df_ 37 | 38 | 39 | threads = int(snakemake.threads) 40 | 41 | gtdb_done_list = snakemake.input["gtdb_done"] 42 | 43 | gtdb_to_ncbi_script = snakemake.params["gtdb_to_ncbi_script"] 44 | metadata_archaea = snakemake.params["metadata_archaea"] 45 | metadata_bacteria = snakemake.params["metadata_bacteria"] 46 | 47 | table_gtdb = snakemake.output["table_gtdb"] 48 | 
table_ncbi = snakemake.output["table_ncbi"] 49 | table_all = snakemake.output["table_all"] 50 | 51 | os.makedirs(os.path.dirname(table_all), exist_ok=True) 52 | 53 | gtdb_list = [] 54 | ncbi_list = [] 55 | 56 | for i in gtdb_done_list: 57 | out_dir = os.path.dirname(i) 58 | archaea_tsv = os.path.join(out_dir, "gtdbtk.archaea.summary.tsv") 59 | bacteria_tsv = os.path.join(out_dir, "gtdbtk.bacteria.summary.tsv") 60 | 61 | if os.path.exists(archaea_tsv): 62 | gtdb_list.append(archaea_tsv) 63 | if os.path.exists(bacteria_tsv): 64 | gtdb_list.append(bacteria_tsv) 65 | 66 | gtdb_to_ncbi_summary = os.path.join(out_dir, "gtdbtk.ncbi.summary.tsv") 67 | gtdb_to_ncbi_log = os.path.join(out_dir, "gtdbtk.to.ncbi.log") 68 | 69 | archaea_cmd = "--ar53_metadata_file" 70 | if "ar122" in os.path.realpath(archaea_tsv): 71 | archaea_cmd = "--ar122_metadata_file" 72 | 73 | bacteria_cmd = "--bac120_metadata_file" 74 | 75 | gtdb_to_ncbi_cmd = \ 76 | f''' 77 | python {gtdb_to_ncbi_script} \ 78 | --gtdbtk_output_dir {out_dir} \ 79 | --output_file {gtdb_to_ncbi_summary} \ 80 | {archaea_cmd} {metadata_archaea} \ 81 | {bacteria_cmd} {metadata_bacteria} \ 82 | > {gtdb_to_ncbi_log} 83 | ''' 84 | subprocess.run(gtdb_to_ncbi_cmd, shell=True) 85 | 86 | if os.path.exists(gtdb_to_ncbi_summary): 87 | ncbi_list.append(gtdb_to_ncbi_summary) 88 | 89 | 90 | if len(gtdb_list) > 0: 91 | table_gtdb_df = merge(gtdb_list, parse, threads, output=table_gtdb) 92 | else: 93 | print(f"No {table_gtdb} generate") 94 | 95 | if len(ncbi_list) > 0: 96 | table_ncbi_df = merge(ncbi_list, parse, threads, output=table_ncbi) 97 | else: 98 | print(f"No {table_ncbi} generate") 99 | 100 | 101 | table_gtdb_df = table_gtdb_df.rename(columns={"classification": "GTDB classification"}) 102 | pprint(table_gtdb_df) 103 | 104 | table_ncbi_df = table_ncbi_df.rename(columns={"Genome ID": "user_genome"}) 105 | pprint(table_ncbi_df) 106 | 107 | table_all_df = pd.merge( 108 | table_gtdb_df, table_ncbi_df, how="inner", 109 | on=["user_genome", "GTDB classification"])#\ 110 | 111 | table_all_df.to_csv(table_all, sep="\t", index=False) 112 | -------------------------------------------------------------------------------- /scripts/rename_fasta_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from Bio import SeqIO, bgzf 3 | from Bio.SeqIO.FastaIO import FastaIterator, FastaWriter 4 | import gzip 5 | import sys 6 | import os 7 | import argparse 8 | 9 | #with open(sys.argv[2], 'w') as fa_out: 10 | # with open(sys.argv[1], 'r') as fa_in: 11 | # for rec in SeqIO.parse(fa_in, 'fasta'): 12 | # (description, sample_name) = rec.description.split("\t") 13 | # rec.description = sample_name + "_" + description 14 | # rec.id = rec.description.split(' ')[0] 15 | # SeqIO.write(rec, fa_out, 'fasta') 16 | 17 | def change_header_sample(title): 18 | # title(total header) -> (id, name, description) 19 | # R0170300050_tooth_RA.contigs.fa 20 | #from > k119_1 flag=1 multi=7.0000 len=3284 R0170300050_tooth_RA 21 | #to > R0170300050_tooth_RA_k119_1 flag=1 multi=7.0000 len=3284 22 | (one_line, sample_name) = title.split("\t") 23 | id = sample_name + "_" + title.split(' ')[0] 24 | desc = id + ' ' + ' '.join(one_line.split(' ')[1:]) 25 | return id, "", desc 26 | 27 | def change_header_no_sample(title): 28 | # title(total header) -> (id, name, description) 29 | # R0170300050_tooth_RA.contigs.fa 30 | #from > k119_1 flag=1 multi=7.0000 len=3284 31 | #to > R0170300050_tooth_RA_k119_1 flag=1 multi=7.0000 len=3284 32 | id = 
sample_tag + "_" + title.split(' ')[0] 33 | desc = id + " " + " ".join(title.split(' ')[1:]) 34 | return id, "", desc 35 | 36 | 37 | ## rename header framework 38 | ## just change header_function 39 | def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz): 40 | if in_gz: 41 | in_h = gzip.open(fa_in, 'rt') 42 | else: 43 | in_h = open(fa_in, 'r') 44 | if gz: 45 | out_h = bgzf.BgzfWriter(fa_out, 'wb') 46 | else: 47 | out_h = open(fa_out, 'w') 48 | writer = FastaWriter(out_h) 49 | writer.write_header() 50 | for rec in FastaIterator(in_h, title2ids = header_function): 51 | writer.write_record(rec) 52 | writer.write_footer() 53 | out_h.close() 54 | in_h.close() 55 | 56 | def main(): 57 | ''' 58 | Why write this script ? 59 | Becaust megahit always generate knum_num format contigs id 60 | ''' 61 | parser = argparse.ArgumentParser(description='change fasta file header') 62 | parser.add_argument('-fa', type=str, help='fasta file path') 63 | parser.add_argument('-out', type=str, help='output') 64 | parser.add_argument('-rm', action='store_true', help='delete original fasta file', default=False) 65 | parser.add_argument('-gz', action='store_true', help='compress output fasta file', default=False) 66 | parser.add_argument('-mv', action='store_true', help="rename change id fasta file to original file", default=False) 67 | args = parser.parse_args() 68 | 69 | #assert not args.fa == args.out, "input file name can't equal to output file name" 70 | if (args.out == args.fa) or (not args.out): 71 | args.out = args.fa + ".changeid" 72 | if args.gz: 73 | if not args.out.endswith(".gz"): 74 | args.out = args.out + ".gz" 75 | 76 | in_gz = args.fa.endswith(".gz") 77 | #if args.fa.endswith(".gz"): 78 | # args.gz = True 79 | 80 | global sample_tag 81 | sample_tag = os.path.basename(args.fa).split(".")[0] 82 | 83 | abs_in = os.path.abspath(args.fa) 84 | abs_out = os.path.abspath(args.out) 85 | reheader_fasta(abs_in, abs_out, change_header_no_sample, in_gz, args.gz) 86 | 87 | if args.rm: 88 | os.remove(abs_in) 89 | if args.mv: 90 | if (not in_gz) and args.gz: 91 | abs_in = abs_in + ".gz" 92 | if in_gz and (not args.gz): 93 | abs_in = abs_in.rstrip(".gz") 94 | os.rename(abs_out, abs_in) 95 | 96 | if __name__ == '__main__': 97 | main() -------------------------------------------------------------------------------- /scripts/asub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # please see https://github.com/lh3/asub 3 | import argparse 4 | import fileinput 5 | import os 6 | import re 7 | import shutil 8 | import stat 9 | import subprocess 10 | import sys 11 | from datetime import datetime 12 | 13 | __author__ = 'Jie Zhu, Jiahui Zhu' 14 | __email__ = 'zhujie@genomics.cn, zhujiahui@genomics.cn' 15 | __version__ = '0.3.1' 16 | __date__ = 'Jun 19, 2018' 17 | 18 | 19 | def parse_job(job_name, job_file, a_job_line, logdir): 20 | with fileinput.input(files=job_file if not job_file is None else ('-', )) as in_h: 21 | job_num = 0 22 | for one_line in in_h: 23 | job_num += 1 24 | job_f = os.path.join(logdir, job_name.rstrip(".sh") + "_" + str(job_num) + ".sh") 25 | with open(job_f, 'w') as job_h: 26 | job_h.write(one_line) 27 | while fileinput.lineno() % a_job_line != 0: 28 | job_h.write(next(in_h)) 29 | #for i in range(1, a_job_line): 30 | # job_h.write(next(in_h)) 31 | return job_num 32 | 33 | 34 | def submit_job(job_name, total_job_num, queue, prj_id, resource, logdir): 35 | submit_f = os.path.join(os.path.curdir, job_name.rstrip(".sh") + "_submit.sh") 
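# Note on the submission model used below: parse_job() has already written one per-task
# script per job, and the single submit script generated here uses the SGE "#$ -t 1-N:1"
# array directive plus $SGE_TASK_ID so that a single qsub call dispatches all N tasks to
# their own scripts and log files.
# Hedged usage sketch (the command file name is hypothetical; flags come from main() below):
#   cat commands.sh | python asub.py -jobname demo -queue st.q -resource vf=1G,p=1
# With -jobline 1 (the default) each input line becomes one array task.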
36 | array_range = "1-" + str(total_job_num) + ":1" 37 | job_script = os.path.join(logdir, job_name.rstrip(".sh") + "_$SGE_TASK_ID.sh") 38 | num_proc = resource.split('=')[-1] 39 | with open(submit_f, 'w') as submit_h: 40 | submit_h.write('''#!/bin/bash\n\ 41 | #$ -clear 42 | #$ -S /bin/bash 43 | #$ -N %s 44 | #$ -cwd 45 | #$ -l %s 46 | #$ -binding linear:%s 47 | #$ -q %s 48 | #$ -P %s 49 | #$ -t %s 50 | jobscript=%s 51 | bash $jobscript\n''' % (job_name, resource, num_proc, queue, prj_id, array_range, job_script)) 52 | 53 | os.chmod(submit_f, stat.S_IRWXU) 54 | submit_cmd = shutil.which("qsub") + \ 55 | " -e " + os.path.join(logdir, job_name + "_\\$TASK_ID.e") + \ 56 | " -o " + os.path.join(logdir, job_name + "_\\$TASK_ID.o") + " " + submit_f 57 | print(submit_cmd) 58 | subprocess.call(submit_cmd, shell=True) 59 | 60 | def main(): 61 | '''a very simple script to submit an array job; you need to supply the real run commands''' 62 | parser = argparse.ArgumentParser(description='make submitting array jobs easy') 63 | parser.add_argument('-jobfile', nargs='*', help='job file to read, if empty, stdin is used') 64 | parser.add_argument('-jobname', type=str, help='job name', default='job') 65 | parser.add_argument('-jobline', type=int, help='set the number of lines to form a job', default=1) 66 | parser.add_argument('-queue', type=str, help='submit queue', default='st.q') 67 | parser.add_argument('-project', type=str, help='project id', default='F16ZQSB1SY2779') 68 | parser.add_argument('-resource', type=str, help='resource requirement', default='vf=1G,p=1') 69 | parser.add_argument('-logdir', type=str, help='array job log directory') 70 | args = parser.parse_args() 71 | 72 | assert re.match(r'vf=[\d\.]+\w,p=\d+', args.resource), "please specify memory usage and processor number" 73 | assert not re.match(r'^\d+', args.jobname), "array job name cannot start with a digit" 74 | assert args.jobline >= 1, "jobline must be at least 1" 75 | 76 | args.jobname += "_" + datetime.now().strftime("%Y%m%d%H%M%S") 77 | 78 | if not args.logdir: 79 | args.logdir = args.jobname + "_qsub" 80 | args.logdir = args.logdir.rstrip("/") + "/" 81 | 82 | if os.path.exists(args.logdir): 83 | shutil.rmtree(args.logdir) 84 | os.makedirs(args.logdir) 85 | 86 | total_job_num = parse_job(args.jobname, args.jobfile, args.jobline, args.logdir) 87 | submit_job(args.jobname, total_job_num, args.queue, args.project, args.resource, args.logdir) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /metapi/rules/upload.smk: -------------------------------------------------------------------------------- 1 | if config["upload"]["do"]: 2 | rule upload_generate_samples_info: 3 | input: 4 | config["params"]["samples"] 5 | output: 6 | os.path.join(config["output"]["upload"], "table/MIxS_Samples.xlsx") 7 | run: 8 | metapi.gen_samples_info(SAMPLES, output[0], config) 9 | 10 | 11 | rule upload_md5_short_reads: 12 | input: 13 | alignment_input_with_short_reads 14 | output: 15 | os.path.join(config["output"]["upload"], "md5/short_reads/{sample}.md5") 16 | shell: 17 | ''' 18 | md5sum {input} > {output} 19 | ''' 20 | 21 | 22 | rule upload_generate_run_info: 23 | input: 24 | expand(os.path.join( 25 | config["output"]["upload"], "md5/short_reads/{sample}.md5"), 26 | sample=SAMPLES_ID_LIST) 27 | output: 28 | os.path.join(config["output"]["upload"], "table/Experiment_Run.xlsx") 29 | threads: 30 | config["upload"]["threads"] 31 | run: 32 | metapi.gen_info(input, output[0], 
config, threads, "sequencing_run") 33 | 34 | 35 | rule upload_sequencing_all: 36 | input: 37 | os.path.join(config["output"]["upload"], "table/Experiment_Run.xlsx"), 38 | os.path.join(config["output"]["upload"], "table/MIxS_Samples.xlsx") 39 | 40 | 41 | localrules: 42 | upload_generate_samples_info, 43 | upload_generate_run_info, 44 | 45 | 46 | if len(ASSEMBLERS) != 0: 47 | rule upload_md5_scaftigs: 48 | input: 49 | os.path.join( 50 | config["output"]["assembly"], 51 | "scaftigs/{binning_group}.{assembly_group}.{assembler}/{binning_group}.{assembly_group}.{assembler}.scaftigs.fa.gz") 52 | output: 53 | os.path.join( 54 | config["output"]["upload"], 55 | "md5/scaftigs/{assembler}/{binning_group}.{assembly_group}.{assembler}.scaftigs.md5") 56 | shell: 57 | ''' 58 | md5sum {input} > {output} 59 | ''' 60 | 61 | 62 | rule upload_generate_assembly_info: 63 | input: 64 | expand(os.path.join( 65 | config["output"]["upload"], 66 | "md5/scaftigs/{{assembler}}/{binning_group}.{assembly_group}.{{assembler}}.scaftigs.md5"), 67 | zip, 68 | binning_group=ASSEMBLY_GROUP["binning_group"], 69 | assembly_group=ASSEMBLY_GROUP["assembly_group"]) 70 | output: 71 | os.path.join( 72 | config["output"]["upload"], 73 | "table/Genome_Assembly_{assembler}.xlsx") 74 | threads: 75 | config["upload"]["threads"] 76 | run: 77 | metapi.gen_info(input, output[0], config, threads, "assembly") 78 | 79 | 80 | rule upload_assembly_all: 81 | input: 82 | expand(os.path.join( 83 | config["output"]["upload"], 84 | "table/Genome_Assembly_{assembler}.xlsx"), 85 | assembler=ASSEMBLERS) 86 | 87 | 88 | localrules: 89 | upload_generate_assembly_info 90 | 91 | 92 | else: 93 | rule upload_assembly_all: 94 | input: 95 | 96 | else: 97 | rule upload_sequencing_all: 98 | input: 99 | 100 | 101 | rule upload_assembly_all: 102 | input: 103 | 104 | 105 | rule upload_all: 106 | input: 107 | rules.upload_sequencing_all.input, 108 | rules.upload_assembly_all.input#, 109 | 110 | 111 | localrules: 112 | upload_sequencing_all, 113 | upload_assembly_all, 114 | upload_all -------------------------------------------------------------------------------- /scripts/kraken2_demultiplex_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import sys 5 | import pandas as pd 6 | import pickle 7 | from pprint import pprint 8 | from taxadb.taxid import TaxID 9 | from taxadb.names import SciName 10 | 11 | 12 | def main(args_): 13 | parser = argparse.ArgumentParser("a summary of kraken2 demultiplex pickle") 14 | parser.add_argument( 15 | '--rank', 16 | choices=["superkingdom", "phylum", "class", "order", "family", "genus", "species"], 17 | default="genus", 18 | help='mini rank for merge' 19 | ) 20 | parser.add_argument( 21 | '--taxadb', 22 | type=str, 23 | help='taxonomy database' 24 | ) 25 | parser.add_argument( 26 | '-p', 27 | '--pickle_list', 28 | help='kraken2 demultiplex pickle list' 29 | ) 30 | parser.add_argument( 31 | '-o', 32 | '--summary_output' 33 | ) 34 | args = parser.parse_args(args_) 35 | 36 | LINEAGES = ["no_rank", "subspecies", "species", "genus", "family", 37 | "order", "class", "phylum", "superkingdom"] 38 | 39 | RANK = args.rank 40 | if not args.rank in LINEAGES[1:]: 41 | print("wrong rank %s" % args.rank) 42 | sys.exit(1) 43 | 44 | SUB_LINRAGES = LINEAGES[LINEAGES.index(RANK):] 45 | 46 | TAXID_DB = TaxID(dbtype='sqlite', dbname=args.taxadb) 47 | NAMES_DB = SciName(dbtype='sqlite', dbname=args.taxadb) 48 | 49 | 50 | def get_parent_taxid(tax_id, tax_name, 
level): 51 | if tax_id == 0: 52 | return "no_rank", 0, "unclassified" 53 | 54 | lineage_dict = TAXID_DB.lineage_id(tax_id, ranks=True) 55 | 56 | if lineage_dict is None: 57 | taxid = NAMES_DB.taxid(tax_name) 58 | if taxid is None: 59 | taxid = NAMES_DB.taxid(tax_name.split()[0]) 60 | if not taxid is None: 61 | lineage_dict = TAXID_DB.lineage_id(taxid, ranks=True) 62 | else: 63 | return "no_rank", tax_id, tax_name 64 | 65 | for rank in SUB_LINRAGES: 66 | if rank in lineage_dict: 67 | return rank, lineage_dict[rank], TAXID_DB.lineage_name(lineage_dict[rank], ranks=True)[rank] 68 | return "no_rank", tax_id, "unclassified" 69 | 70 | 71 | summary_dict = {"taxid": [], 72 | "taxa_name": [], 73 | "reads_count": [], 74 | "rank": [], 75 | "parent_taxid": [], 76 | "parent_taxa_name": []} 77 | 78 | with open(args.pickle_list, 'r') as ih: 79 | for line in ih: 80 | with open(line.strip(), 'rb') as ph: 81 | kk2_ranks_counter = pickle.load(ph) 82 | # pprint(kk2_ranks_counter) 83 | 84 | for taxid in kk2_ranks_counter: 85 | if taxid in summary_dict["taxid"]: 86 | summary_dict["reads_count"][summary_dict["taxid"].index(taxid)] += 2 * kk2_ranks_counter[taxid][1] 87 | else: 88 | summary_dict["taxid"].append(taxid) 89 | summary_dict["taxa_name"].append(kk2_ranks_counter[taxid][0]) 90 | summary_dict["reads_count"].append(2 * kk2_ranks_counter[taxid][1]) 91 | 92 | rank_, taxid_, taxaname_ = get_parent_taxid(taxid, kk2_ranks_counter[taxid][0], RANK) 93 | summary_dict["rank"].append(rank_) 94 | summary_dict["parent_taxid"].append(taxid_) 95 | summary_dict["parent_taxa_name"].append(taxaname_) 96 | 97 | summary_df = pd.DataFrame.from_dict(summary_dict) 98 | 99 | summary_df.to_csv(args.summary_output, index=False, sep='\t') 100 | 101 | 102 | if __name__ == '__main__': 103 | main(sys.argv[1:]) 104 | -------------------------------------------------------------------------------- /scripts/qc_report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import concurrent.futures 5 | import subprocess 6 | import argparse 7 | import os 8 | 9 | def get_reads(df, id_, col_): 10 | return df.loc[[id_], col_].dropna().tolist() 11 | 12 | 13 | def run_(tuple_): 14 | try: 15 | output = subprocess.check_output( 16 | tuple_[0], shell=True, stderr=subprocess.STDOUT, universal_newlines=True 17 | ) 18 | except subprocess.CalledProcessError as e: 19 | print(e.output) 20 | return None 21 | 22 | out_list = output.strip().split("\n") 23 | header = out_list[0].split("\t") 24 | data = [] 25 | 26 | for line in out_list[1:]: 27 | content = tuple(line.split("\t")) 28 | data.append(content) 29 | 30 | df = pd.DataFrame(data, columns=header) 31 | df["id"] = tuple_[1] 32 | df["step"] = tuple_[2] 33 | df["fq_type"] = tuple_[3] 34 | df["reads"] = tuple_[4] 35 | return df 36 | 37 | 38 | def run(cmd_list, workers): 39 | df_list = [] 40 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 41 | for df in executor.map(run_, cmd_list): 42 | if df is not None: 43 | df_list.append(df) 44 | df_ = pd.concat(df_list) 45 | return df_ 46 | 47 | 48 | def gen(fastq_list, step, fq_encoding, is_pe=True): 49 | fq_df = pd.read_csv(fastq_list, sep="\t").set_index("id") 50 | cmd_list = [] 51 | 52 | for i in fq_df.index.unique(): 53 | fq1_list = get_reads(fq_df, i, "fq1") 54 | if is_pe: 55 | fq2_list = get_reads(fq_df, i, "fq2") 56 | 57 | if is_pe: 58 | if len(fq1_list) == 1: 59 | cmd = "seqkit stats -a -T -b -j 1 -E %s %s %s" % ( 60 | fq_encoding, 61 | 
fq1_list[0], 62 | fq2_list[0], 63 | ) 64 | cmd_list.append((cmd, i, step, "pe", ["fq1", "fq2"])) 65 | else: 66 | cmd_1 = "cat %s | seqkit stats -a -T -b -j 1 -E %s" % ( 67 | " ".join(fq1_list), 68 | fq_encoding, 69 | ) 70 | cmd_list.append((cmd_1, i, step, "pe", ["fq1"])) 71 | cmd_2 = "cat %s | seqkit stats -a -T -b -j 1 -E %s" % ( 72 | " ".join(fq2_list), 73 | fq_encoding, 74 | ) 75 | cmd_list.append((cmd_2, i, step, "pe", ["fq2"])) 76 | else: 77 | cmd = "cat %s | seqkit stats -a -T -b -j 1 -E %s" % ( 78 | " ".join(fq1_list), 79 | fq_encoding, 80 | ) 81 | cmd_list.append((cmd, i, step, "se", ["fq1"])) 82 | return cmd_list 83 | 84 | 85 | def main(): 86 | parser = argparse.ArgumentParser(description="generate quality control report from raw, trimming, rmhost data") 87 | parser.add_argument("--raw_list", help="raw data list, headers: id fq1 fq2") 88 | parser.add_argument("--trimming_list", help="trimming data list, headers: id fq1 fq2") 89 | parser.add_argument("--rmhost_list", help="rmhost data list, headers: id fq1 fq2") 90 | parser.add_argument("--is_se", action='store_true', default=False, help='default: is_pe') 91 | parser.add_argument("--fq_encoding", help="fastq quality encoding, default: sanger", default="sanger") 92 | parser.add_argument("--threads", help="threads, default: 8", default=8) 93 | parser.add_argument("--output", help="qc report output") 94 | 95 | args = parser.parse_args() 96 | 97 | cmd_raw = gen(args.raw_list, "raw", args.fq_encoding, not args.is_se) 98 | cmd_trimming = gen(args.trimming_list, "trimming", args.fq_encoding, not args.is_se) 99 | cmd_rmhost = gen(args.rmhost_list, "rmhost", args.fq_encoding, not args.is_se) 100 | 101 | cmd = cmd_raw + cmd_trimming + cmd_rmhost 102 | 103 | df = run(cmd, args.threads) 104 | df.to_csv(args.output, sep='\t', index=False) 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /scripts/clean_statout_to_matrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ## Metagenomics Institute of BGI Research 3 | ## zhujie@genomics.cn 4 | ## 2017-11-29 5 | ## GPL-V3 6 | 7 | import os 8 | import argparse 9 | 10 | ##TODO 11 | ## clean and SE 12 | 13 | def parse_pe_clean_statout(handle, min_l, max_l): 14 | header_list = [str(i) for i in range(min_l, max_l + 1)] 15 | value_dict = {} 16 | for key in header_list: 17 | value_dict[key] = 0 18 | 19 | ## total info 20 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()): 21 | header_list.append(key) 22 | value_dict[key] = value 23 | 24 | ## reads_1 info 25 | tag = True 26 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()): 27 | if tag: 28 | tag = False 29 | else: 30 | key = key + "_1" 31 | header_list.append(key) 32 | value_dict[key] = value 33 | 34 | ## reads_2 info 35 | tag = True 36 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()): 37 | if tag: 38 | tag = False 39 | else: 40 | key = key + "_2" 41 | header_list.append(key) 42 | value_dict[key] = value 43 | 44 | ## reads_single info 45 | tag = True 46 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()): 47 | if tag: 48 | tag = False 49 | else: 50 | key = key + "_single" 51 | header_list.append(key) 52 | value_dict[key] = value 53 | 54 | ## length info 55 | next(handle) 56 | total_filter_base = 0 57 | total_filter_reads = 0 58 | 
total_filter_reads_len_gt80 = 0 59 | L80 = 0 60 | for line in handle: 61 | line_list = line.strip().split() 62 | reads_len = line_list[0] 63 | reads_num = line_list[1] 64 | total_filter_reads += int(reads_num) 65 | total_filter_base += int(reads_len) * int(reads_num) 66 | if (int(reads_len) >= 80): 67 | total_filter_reads_len_gt80 += int(reads_num) 68 | value_dict[str(reads_len)] = str(reads_num) 69 | 70 | L80 = total_filter_reads_len_gt80 / total_filter_reads 71 | header_list.append("total_filter_base") 72 | value_dict["total_filter_base"] = str(total_filter_base) 73 | header_list.append("total_filter_reads") 74 | value_dict["total_filter_reads"] = str(total_filter_reads) 75 | header_list.append("L80") 76 | value_dict["L80"] = str(L80) 77 | 78 | return (value_dict, header_list) 79 | 80 | def gen_len_matrix(dirname, min_l, max_l): 81 | no_header = True 82 | for fl in os.listdir(dirname): 83 | if fl.endswith("stat_out"): 84 | sample_name = fl.split("/")[-1].split(".")[0] 85 | statout = os.path.join(dirname, fl) 86 | with open(statout, 'r') as h: 87 | tuple_ = parse_pe_clean_statout(h, min_l, max_l) 88 | if no_header: 89 | header = "sample_name\t" + "\t".join(tuple_[1]) 90 | print(header) 91 | body = sample_name + "\t" + "\t".join([tuple_[0][key] for key in tuple_[1]]) 92 | print(body) 93 | no_header = False 94 | else: 95 | body = sample_name + "\t" + "\t".join([tuple_[0][key] for key in tuple_[1]]) 96 | print(body) 97 | 98 | def main(): 99 | parser = argparse.ArgumentParser("convert many clean statout to a matrix\n \ 100 | e.g: python clean_statout_to_matrix.py ../data/clean_statout -m 100 -n 30 > ../data/length_clean_statout.tsv\n") 101 | parser.add_argument("-d", "--statout_dir", help="a directory contain many samples clean statout file") 102 | parser.add_argument("-m", "--max_len", type=int, help="max reads length") 103 | parser.add_argument("-n", "--min_len", type=int, help="min reads length") 104 | args = parser.parse_args() 105 | gen_len_matrix(args.statout_dir, args.min_len, args.max_len) 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /scripts/extract_bins_from_mgs_profile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio.Alphabet import generic_dna 4 | from Bio import SeqIO 5 | import argparse 6 | import re 7 | import os 8 | import time 9 | 10 | 11 | def take_second(elem): 12 | return elem[1] 13 | 14 | 15 | def parse_mgs(mgs_profile): 16 | bins = {} 17 | bins_num = [] 18 | with open(mgs_profile, 'r') as ih: 19 | for line in ih: 20 | line_list = re.split(r"\s+|,", line.strip(",|\n")) 21 | bin_id = line_list[0] 22 | contigs_count = int(line_list[1]) 23 | bins[bin_id] = [] 24 | bins_num.append((bin_id, contigs_count)) 25 | for contig_id in line_list[2:]: 26 | bins[bin_id].append(contig_id) 27 | bins_num.sort(key=take_second, reverse=True) 28 | return bins, bins_num 29 | 30 | 31 | def extract(contigs_list, bins, bins_num, head, tail, outdir): 32 | files = [] 33 | all_count = 0 34 | with open(contigs_list, 'r') as ih: 35 | for line in ih: 36 | files.append(line.strip()) 37 | 38 | begin = time.time() 39 | records = SeqIO.index_db(":memory:", files, "fasta", generic_dna) 40 | end = time.time() 41 | print("index db: %.2f s" % (end - begin)) 42 | 43 | begin = time.time() 44 | 45 | if head is not None: 46 | if head > len(bins_num): 47 | count = len(bins_num) 48 | else: 49 | count = head 50 | all_count += count 51 | for i in 
range(count): 52 | bin_id = bins_num[i][0] 53 | with open(os.path.join(outdir, bin_id + ".fa"), 'w') as oh: 54 | for contig_id in bins[bin_id]: 55 | if contig_id in records: 56 | SeqIO.write(records[contig_id], oh, 'fasta') 57 | else: 58 | print("%s: contig %s not found" % (bin_id, contig_id)) 59 | 60 | if tail is not None: 61 | if tail > len(bins_num): 62 | count = len(bins_num) 63 | else: 64 | count = tail 65 | all_count += count 66 | for i in range(count): 67 | bin_id = bins_num[-(i+1)][0] 68 | with open(os.path.join(outdir, bin_id + ".fa"), 'w') as oh: 69 | for contig_id in bins[bin_id]: 70 | if contig_id in records: 71 | SeqIO.write(records[contig_id], oh, 'fasta') 72 | else: 73 | print("%s: contig %s not found" % (bin_id, contig_id)) 74 | 75 | if (head is None) and (tail is None): 76 | for bin_id in bins: 77 | all_count += 1 78 | with open(os.path.join(outdir, bin_id + ".fa"), 'w') as oh: 79 | for contig_id in bins[bin_id]: 80 | if contig_id in records: 81 | SeqIO.write(records[contig_id], oh, 'fasta') 82 | else: 83 | print("%s: contig %s not found" % (bin_id, contig_id)) 84 | 85 | records.close() 86 | end = time.time() 87 | 88 | print("extract %d bins: %.2f s" % (all_count, end - begin)) 89 | 90 | 91 | def main(): 92 | parser = argparse.ArgumentParser("get bins fasta from mgs contigs/scaffolds profile") 93 | parser.add_argument('-p', '--profile', type=str, help='mgs contigs/scaffolds profile') 94 | parser.add_argument('-l', '--contigs_list', type=str, help='assembly contigs/scaffolds fasta path list') 95 | parser.add_argument('-o', '--outdir', type=str, help='bins output dir') 96 | parser.add_argument('--head', type=int, default=None, help='number of head bins (most contigs) to extract') 97 | parser.add_argument('--tail', type=int, default=None, help='number of tail bins (fewest contigs) to extract') 98 | 99 | args = parser.parse_args() 100 | if not os.path.exists(args.outdir): 101 | os.makedirs(args.outdir, exist_ok=True) 102 | 103 | (bins, bins_num) = parse_mgs(args.profile) 104 | 105 | if (args.head is not None) and (args.tail is not None): 106 | assert args.head + args.tail <= len(bins_num), "head + tail exceeds the number of bins" 107 | 108 | extract(args.contigs_list, bins, bins_num, args.head, args.tail, args.outdir) 109 | 110 | 111 | if __name__ == '__main__': 112 | main() 113 | -------------------------------------------------------------------------------- /scripts/filter_pe_fastq_by_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import glob 5 | import os 6 | 7 | # see: http://goo.gl/kTQMs 8 | SYMBOLS = { 9 | 'customary' : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'), 10 | 'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'iotta'), 11 | 'iec' : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'), 12 | 'iec_ext' : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi'), 13 | } 14 | 15 | def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'): 16 | """ 17 | Convert n bytes into a human readable string based on format. 
18 | symbols can be either "customary", "customary_ext", "iec" or "iec_ext", 19 | see: http://goo.gl/kTQMs 20 | 21 | >>> bytes2human(0) 22 | '0.0 B' 23 | >>> bytes2human(0.9) 24 | '0.0 B' 25 | >>> bytes2human(1) 26 | '1.0 B' 27 | >>> bytes2human(1.9) 28 | '1.0 B' 29 | >>> bytes2human(1024) 30 | '1.0 K' 31 | >>> bytes2human(1048576) 32 | '1.0 M' 33 | >>> bytes2human(1099511627776127398123789121) 34 | '909.5 Y' 35 | 36 | >>> bytes2human(9856, symbols="customary") 37 | '9.6 K' 38 | >>> bytes2human(9856, symbols="customary_ext") 39 | '9.6 kilo' 40 | >>> bytes2human(9856, symbols="iec") 41 | '9.6 Ki' 42 | >>> bytes2human(9856, symbols="iec_ext") 43 | '9.6 kibi' 44 | 45 | >>> bytes2human(10000, "%(value).1f %(symbol)s/sec") 46 | '9.8 K/sec' 47 | 48 | >>> # precision can be adjusted by playing with %f operator 49 | >>> bytes2human(10000, format="%(value).5f %(symbol)s") 50 | '9.76562 K' 51 | """ 52 | n = int(n) 53 | if n < 0: 54 | raise ValueError("n < 0") 55 | symbols = SYMBOLS[symbols] 56 | prefix = {} 57 | for i, s in enumerate(symbols[1:]): 58 | prefix[s] = 1 << (i+1)*10 59 | for symbol in reversed(symbols[1:]): 60 | if n >= prefix[symbol]: 61 | value = float(n) / prefix[symbol] 62 | return format % locals() 63 | return format % dict(symbol=symbols[0], value=n) 64 | 65 | def human2bytes(s): 66 | """ 67 | Attempts to guess the string format based on default symbols 68 | set and return the corresponding bytes as an integer. 69 | When unable to recognize the format ValueError is raised. 70 | 71 | >>> human2bytes('0 B') 72 | 0 73 | >>> human2bytes('1 K') 74 | 1024 75 | >>> human2bytes('1 M') 76 | 1048576 77 | >>> human2bytes('1 Gi') 78 | 1073741824 79 | >>> human2bytes('1 tera') 80 | 1099511627776 81 | 82 | >>> human2bytes('0.5kilo') 83 | 512 84 | >>> human2bytes('0.1 byte') 85 | 0 86 | >>> human2bytes('1 k') # k is an alias for K 87 | 1024 88 | >>> human2bytes('12 foo') 89 | Traceback (most recent call last): 90 | ... 
91 | ValueError: can't interpret '12 foo' 92 | """ 93 | init = s 94 | num = "" 95 | while s and s[0:1].isdigit() or s[0:1] == '.': 96 | num += s[0] 97 | s = s[1:] 98 | num = float(num) 99 | letter = s.strip() 100 | for name, sset in SYMBOLS.items(): 101 | if letter in sset: 102 | break 103 | else: 104 | if letter == 'k': 105 | # treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs 106 | sset = SYMBOLS['customary'] 107 | letter = letter.upper() 108 | else: 109 | raise ValueError("can't interpret %r" % init) 110 | prefix = {sset[0]:1} 111 | for i, s in enumerate(sset[1:]): 112 | prefix[s] = 1 << (i+1)*10 113 | return int(num * prefix[letter]) 114 | 115 | 116 | def main(): 117 | FASTQ_SIZE = human2bytes(sys.argv[2]) 118 | samples = {} 119 | for i in glob.glob(sys.argv[1].rstrip("/") + "/*.1.fq.gz"): 120 | if os.path.getsize(i) >= FASTQ_SIZE: 121 | sample_id = os.path.basename(i).split(".")[0] 122 | samples[sample_id] = [os.path.abspath(i), os.path.abspath(i.replace("1.fq.gz", "2.fq.gz"))] 123 | 124 | print("id\tfq1\tfq2") 125 | for i in samples: 126 | print("%s\t%s\t%s" % (i, samples[i][0], samples[i][1])) 127 | 128 | 129 | if __name__ == '__main__': 130 | main() 131 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/memory_units.py: -------------------------------------------------------------------------------- 1 | import re 2 | from enum import Enum 3 | from typing import Union 4 | from collections import namedtuple 5 | 6 | 7 | class InvalidSuffix(Exception): 8 | pass 9 | 10 | 11 | class InvalidPower(Exception): 12 | pass 13 | 14 | 15 | class InvalidMemoryString(Exception): 16 | pass 17 | 18 | 19 | Scale = namedtuple("Scale", ["power", "metric_suffix"]) 20 | 21 | 22 | SCALE_MAP = { 23 | "B": Scale(0, "B"), 24 | "K": Scale(1, "KB"), 25 | "M": Scale(2, "MB"), 26 | "G": Scale(3, "GB"), 27 | "T": Scale(4, "TB"), 28 | "P": Scale(5, "PB"), 29 | "E": Scale(6, "EB"), 30 | "Z": Scale(7, "ZB"), 31 | } 32 | 33 | 34 | class Unit(Enum): 35 | BYTES = SCALE_MAP["B"] 36 | KILO = SCALE_MAP["K"] 37 | MEGA = SCALE_MAP["M"] 38 | GIGA = SCALE_MAP["G"] 39 | TERA = SCALE_MAP["T"] 40 | PETA = SCALE_MAP["P"] 41 | EXA = SCALE_MAP["E"] 42 | ZETTA = SCALE_MAP["Z"] 43 | 44 | @staticmethod 45 | def from_suffix(suffix: str) -> "Unit": 46 | first_letter = suffix[0].upper() 47 | if first_letter not in SCALE_MAP: 48 | valid_suffixes = ",".join( 49 | scale.metric_suffix for scale in SCALE_MAP.values() 50 | ) 51 | raise InvalidSuffix( 52 | "{suffix}. Valid suffixes are: {valid_suffixes}".format( 53 | suffix=suffix, valid_suffixes=valid_suffixes 54 | ) 55 | ) 56 | return Unit(SCALE_MAP[first_letter]) 57 | 58 | @staticmethod 59 | def from_power(power: int) -> "Unit": 60 | valid_powers = [] 61 | for scale in SCALE_MAP.values(): 62 | if scale.power == power: 63 | return Unit(scale) 64 | valid_powers.append(scale.power) 65 | 66 | raise InvalidPower( 67 | "{power}. 
Valid powers are: {valid}".format( 68 | power=power, valid=",".join(str(p) for p in valid_powers) 69 | ) 70 | ) 71 | 72 | @property 73 | def power(self) -> int: 74 | return self.value.power 75 | 76 | @property 77 | def suffix(self) -> str: 78 | return self.value.metric_suffix 79 | 80 | 81 | Number = Union[int, float] 82 | 83 | 84 | class Memory: 85 | def __init__(self, value: Number = 1, unit: Unit = Unit.BYTES): 86 | self.value = value 87 | self.unit = unit 88 | self._decimal_scaling_factor = 1000 89 | self._binary_scaling_factor = 1024 90 | 91 | def __eq__(self, other: "Memory") -> bool: 92 | return self.bytes() == other.bytes() 93 | 94 | def __repr__(self) -> str: 95 | val = ( 96 | int(self.value) 97 | if isinstance(self.value, int) or self.value.is_integer() 98 | else self.value 99 | ) 100 | return "{val}{sfx}".format(val=val, sfx=self.suffix) 101 | 102 | @property 103 | def power(self) -> int: 104 | return self.unit.power 105 | 106 | @property 107 | def suffix(self) -> str: 108 | return self.unit.suffix 109 | 110 | def _scaling_factor(self, decimal: bool = True) -> int: 111 | return self._decimal_scaling_factor if decimal else self._binary_scaling_factor 112 | 113 | def bytes(self, decimal_multiples: bool = True) -> float: 114 | scaling_factor = self._scaling_factor(decimal_multiples) 115 | return float(self.value * (scaling_factor ** self.power)) 116 | 117 | def to(self, unit: Unit, decimal_multiples: bool = True) -> "Memory": 118 | scaling_factor = self._scaling_factor(decimal_multiples) ** unit.power 119 | size = self.bytes(decimal_multiples) 120 | size /= scaling_factor 121 | 122 | return Memory(size, unit) 123 | 124 | @staticmethod 125 | def from_str(s: str) -> "Memory": 126 | valid_suffixes = "".join(scale.metric_suffix for scale in SCALE_MAP.values()) 127 | regex = re.compile( 128 | r"^(?P<size>[0-9]*\.?[0-9]+)\s*(?P<sfx>[{}]B?)?$".format(valid_suffixes), 129 | re.IGNORECASE, 130 | ) 131 | match = regex.search(s) 132 | 133 | if not match: 134 | raise InvalidMemoryString("{s} is an invalid memory string.".format(s=s)) 135 | 136 | size = float(match.group("size")) 137 | suffix = match.group("sfx") or "B" 138 | unit = Unit.from_suffix(suffix) 139 | 140 | return Memory(size, unit) 141 | -------------------------------------------------------------------------------- /metapi/rules/binning_report.smk: -------------------------------------------------------------------------------- 1 | if len(BINNERS_CHECKM) != 0: 2 | rule binning_report: 3 | input: 4 | lambda wildcards: get_binning_done(wildcards, [wildcards.binner_checkm]) 5 | output: 6 | directory( 7 | os.path.join( 8 | config["output"]["binning"], 9 | "report/{assembler}/{binner_checkm}/{binning_group}.{assembly_group}")) 10 | params: 11 | binning_group = "{binning_group}", 12 | assembly_group = "{assembly_group}", 13 | assembler = "{assembler}", 14 | binner = "{binner_checkm}" 15 | priority: 16 | 35 17 | run: 18 | import glob 19 | 20 | shell('''rm -rf {output}''') 21 | shell('''mkdir -p {output}''') 22 | 23 | bin_list = glob.glob(os.path.dirname(input[0]) + "/*.fa.gz") 24 | header_list = [ 25 | "binning_group", "assembly_group", "bin_id", "bin_file", "assembler", "binner", 26 | "chr", "length", "#A", "#C", "#G", "#T", 27 | "#2", "#3", "#4", "#CpG", "#tv", "#ts", "#CpG-ts"] 28 | header_name = "\\t".join(header_list) 29 | 30 | for bin_fa in bin_list: 31 | bin_id = os.path.basename(os.path.splitext(os.path.splitext(bin_fa)[0])[0]) 32 | bin_file = os.path.abspath(bin_fa) 33 | header_content = "\\t".join([params.binning_group, 
params.assembly_group, bin_id, bin_file, params.assembler, params.binner]) 34 | stats_file = os.path.join(output[0], bin_id + ".seqtk.comp.tsv.gz") 35 | 36 | shell( 37 | ''' 38 | seqtk comp %s | \ 39 | awk \ 40 | 'BEGIN \ 41 | {{print "%s"}}; \ 42 | {{print "%s" "\t" $0}}' | \ 43 | gzip -c > %s 44 | ''' % (bin_fa, header_name, header_content, stats_file)) 45 | 46 | 47 | rule binning_report_merge: 48 | input: 49 | expand(os.path.join( 50 | config["output"]["binning"], 51 | "report/{{assembler}}/{{binner_checkm}}/{binning_group}.{assembly_group}"), 52 | zip, 53 | binning_group=ASSEMBLY_GROUP["binning_group"], 54 | assembly_group=ASSEMBLY_GROUP["assembly_group"]) 55 | output: 56 | summary = os.path.join( 57 | config["output"]["binning"], 58 | "report/assembly_stats_{assembler}.{binner_checkm}.tsv.gz") 59 | params: 60 | min_length = config["params"]["assembly"]["report"]["min_length"], 61 | len_ranges = config["params"]["assembly"]["report"]["len_ranges"] 62 | threads: 63 | config["params"]["binning"]["threads"] 64 | run: 65 | import glob 66 | comp_list = [] 67 | for i in input: 68 | comp_list += glob.glob(i + "/*.seqtk.comp.tsv.gz") 69 | 70 | if len(comp_list) != 0: 71 | metapi.assembler_init( 72 | params.len_ranges, 73 | ["binning_group", "assembly_group", "bin_id", "bin_file", "assembler", "binner"]) 74 | comp_list_ = [(j, params.min_length) for j in comp_list] 75 | metapi.merge( 76 | comp_list_, metapi.parse_assembly, 77 | threads, output=output.summary) 78 | else: 79 | shell('''touch {output.summary}''') 80 | 81 | 82 | rule binning_report_all: 83 | input: 84 | expand(os.path.join( 85 | config["output"]["binning"], 86 | "report/assembly_stats_{assembler}.{binner_checkm}.tsv.gz"), 87 | assembler=ASSEMBLERS, 88 | binner_checkm=BINNERS_CHECKM) 89 | 90 | else: 91 | rule binning_report_all: 92 | input: 93 | 94 | 95 | rule binning_all: 96 | input: 97 | rules.binning_metabat2_all.input, 98 | rules.binning_maxbin2_all.input, 99 | rules.binning_concoct_all.input, 100 | rules.binning_graphbin2_all.input, 101 | rules.binning_vamb_all.input, 102 | rules.binning_semibin_all.input, 103 | rules.binning_dastools_all.input, 104 | rules.binning_report_all.input 105 | 106 | 107 | localrules: 108 | binning_report_all, 109 | binning_all -------------------------------------------------------------------------------- /scripts/post_assembly_binning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | 5 | import pandas as pd 6 | 7 | 8 | def codegen(samples_tsv, output_dir): 9 | samples = pd.read_csv(samples_tsv, sep='\t').set_index("bin_id", drop=False) 10 | 11 | os.makedirs(output_dir, exist_ok=True) 12 | 13 | index_dir = os.path.join(output_dir, "00.index") 14 | os.makedirs(index_dir, exist_ok=True) 15 | 16 | mapping_dir = os.path.join(output_dir, "01.mapping") 17 | os.makedirs(mapping_dir, exist_ok=True) 18 | 19 | asm_dir = os.path.join(output_dir, "02.assembly") 20 | os.makedirs(asm_dir, exist_ok=True) 21 | 22 | checkm_asm_dir = os.path.join(output_dir, "03.checkm_asm") 23 | os.makedirs(checkm_asm_dir, exist_ok=True) 24 | checkm_asm_input_dir = os.path.join(checkm_asm_dir, "input") 25 | os.makedirs(checkm_asm_input_dir, exist_ok=True) 26 | checkm_asm_output_dir = os.path.join(checkm_asm_dir, "output") 27 | os.makedirs(checkm_asm_output_dir, exist_ok=True) 28 | 29 | with open(os.path.join(output_dir, "step1.index.sh"), 'w') as oh1, \ 30 | open(os.path.join(output_dir, "step2.mapping.sh"), 'w') as oh2, \ 31 | 
open(os.path.join(output_dir, "step3.assembly_spades.sh"), 'w') as oh3, \ 32 | open(os.path.join(output_dir, "step3.assembly_shovill_spades.sh"), 'w') as oh4, \ 33 | open(os.path.join(output_dir, "step3.assembly_shovill_megahit.sh"), 'w') as oh5, \ 34 | open(os.path.join(output_dir, "step3.assembly_shovill_velvet.sh"), 'w') as oh6, \ 35 | open(os.path.join(output_dir, "step3.assembly_shovill_skesa.sh"), 'w') as oh7, \ 36 | open(os.path.join(output_dir, "step4.links_asm_fa.sh"), 'w') as oh8, \ 37 | open(os.path.join(output_dir, "step5.checkm_lineage_wf.sh"), 'w') as oh9: 38 | 39 | for bin_id in samples.index: 40 | # index 41 | prefix = os.path.join(index_dir, bin_id) 42 | cmd = "bwa index %s -p %s\n" % (samples.loc[bin_id, "bins_fna_path"], prefix) 43 | oh1.write(cmd) 44 | 45 | # mapping and extract reads 46 | r1 = os.path.join(mapping_dir, "%s.r1.fq.gz" % bin_id) 47 | r2 = os.path.join(mapping_dir, "%s.r2.fq.gz" % bin_id) 48 | stat = os.path.join(mapping_dir, "%s-flagstat.txt" % bin_id) 49 | cmd = "bwa mem -t 8 %s %s %s | tee >(samtools flagstat -@8 - > %s) | samtools fastq -@8 -F 12 -n -1 %s -2 %s -\n" % ( 50 | prefix, samples.loc[bin_id, "fq1"], samples.loc[bin_id, "fq2"], stat, r1, r2) 51 | oh2.write(cmd) 52 | 53 | # assembly 54 | ## spades 55 | bins_asm_dir = os.path.join(asm_dir, bin_id + ".spades_out") 56 | cmd = "spades.py -1 %s -2 %s -k 21,29,39,59,79,99 --threads 8 -o %s\n" % (r1, r2, bins_asm_dir) 57 | oh3.write(cmd) 58 | asm_fa = os.path.join(bins_asm_dir, "scaffolds.fasta") 59 | asm_fa_ = os.path.join(checkm_asm_input_dir, bin_id + ".spades.fa") 60 | oh8.write("ln -s %s %s\n" % (asm_fa, asm_fa_)) 61 | 62 | ## shovill 63 | ### spades or megahit or velvet or skesa 64 | for assembler, file_handle in zip(["spades", "megahit", "velvet", "skesa"], [oh4, oh5, oh6, oh7]): 65 | bins_asm_dir = os.path.join(asm_dir, bin_id + ".shovill_%s_out" % assembler) 66 | cmd = "shovill --cpus 8 --ram 20 --keepfiles --assembler %s --outdir %s --R1 %s --R2 %s\n" % (assembler, bins_asm_dir, r1, r2) 67 | file_handle.write(cmd) 68 | asm_fa = os.path.join(bins_asm_dir, "contigs.fa") 69 | asm_fa_ = os.path.join(checkm_asm_input_dir, bin_id + "." 
+ assembler + ".fa") 70 | oh8.write("ln -s %s %s\n" % (asm_fa, asm_fa_)) 71 | 72 | checkm_asm_out = os.path.join(checkm_asm_dir, "checkm_asm.txt") 73 | checkm_asm_log = os.path.join(checkm_asm_dir, "checkm_asm.log") 74 | checkm_asm_cmd = "checkm lineage_wf -f %s -t 8 --pplacer_threads 8 -x fa %s/ %s/ 2>%s\n" % \ 75 | (checkm_asm_out, checkm_asm_input_dir, checkm_asm_output_dir, checkm_asm_log) 76 | oh9.write(checkm_asm_cmd) 77 | 78 | 79 | def main(): 80 | parser = argparse.ArgumentParser(description='reassembly reads') 81 | parser.add_argument('-s', '--samples', type=str, help='metagenomics bins and paired reads list') 82 | parser.add_argument('-o', '--outdir', type=str, help='output directory') 83 | args = parser.parse_args() 84 | codegen(args.samples, args.outdir) 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /scripts/estimate_T2T_data_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import requests 5 | import xmltodict 6 | import argparse 7 | from rich import print 8 | from rich.console import Console 9 | 10 | # https://github.com/Textualize/rich/issues/67 11 | _console = Console() 12 | 13 | class RichArgumentParser(argparse.ArgumentParser): 14 | def _print_message(self, message, file=None): 15 | _console.print(message) 16 | 17 | def add_argument_group(self, *args, **kwargs): 18 | group = super().add_argument_group(*args, **kwargs) 19 | group.title = f"[cyan]{group.title.title()}[/cyan]" 20 | return group 21 | 22 | 23 | class RichRawTextHelpFormatter(argparse.RawTextHelpFormatter): 24 | def _split_lines(self, text, width): 25 | return [f"[yellow]{line}[/yellow]" for line in text.splitlines()] 26 | 27 | 28 | # see: http://goo.gl/kTQMs 29 | SYMBOLS = { 30 | 'customary' : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'), 31 | 'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'iotta'), 32 | 'iec' : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'), 33 | 'iec_ext' : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi'), 34 | } 35 | 36 | 37 | def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'): 38 | n = int(n) 39 | if n < 0: 40 | raise ValueError("n < 0") 41 | symbols = SYMBOLS[symbols] 42 | prefix = {} 43 | for i, s in enumerate(symbols[1:]): 44 | prefix[s] = 1 << (i+1)*10 45 | for symbol in reversed(symbols[1:]): 46 | if n >= prefix[symbol]: 47 | value = float(n) / prefix[symbol] 48 | return format % locals() 49 | return format % dict(symbol=symbols[0], value=n) 50 | 51 | 52 | def human2bytes(s): 53 | init = s 54 | num = "" 55 | while s and s[0:1].isdigit() or s[0:1] == '.': 56 | num += s[0] 57 | s = s[1:] 58 | if num != "": 59 | num = float(num) 60 | else: 61 | raise ValueError(f"can't covert {s} to float") 62 | letter = s.strip() 63 | #print(letter) 64 | for name, sset in SYMBOLS.items(): 65 | if letter in sset: 66 | break 67 | else: 68 | if (letter == 'k') or (letter == "m") or (letter == "g"): 69 | # treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs 70 | sset = SYMBOLS['customary'] 71 | letter = letter.upper() 72 | else: 73 | raise ValueError("can't interpret %r" % init) 74 | prefix = {sset[0]:1} 75 | for i, s in enumerate(sset[1:]): 76 | prefix[s] = 1 << (i+1)*10 77 | return int(num * prefix[letter]) 78 | 79 | 80 | def generate_xml(http_link): 81 | print(f'''Parsing: {http_link}\n''') 82 | r = 
requests.get(http_link) 83 | if "xml" in r.headers['content-type']: 84 | print(f'''Success: got XML document from the link: {http_link}\n''') 85 | return r.text 86 | else: 87 | print(f'''Error: can't get XML document from the link: {http_link}\nExiting\n''') 88 | return None 89 | 90 | 91 | def estimate_size(xml_str, output=None): 92 | xml_dict = xmltodict.parse(xml_str) 93 | if "ListBucketResult" in xml_dict: 94 | file_info_df = pd.DataFrame(xml_dict["ListBucketResult"]["Contents"])\ 95 | .astype({"Size": int})\ 96 | .sort_values(["Size", "Key"]) 97 | print(file_info_df) 98 | 99 | if output is not None: 100 | file_info_df.to_csv(output, sep="\t", index=False) 101 | 102 | total_size = sum(file_info_df["Size"]) 103 | total_size_human = bytes2human(total_size) 104 | print(f'''\nTotal file size is: {total_size}''') 105 | print(f'''\nTotal file size is: {total_size_human}''') 106 | else: 107 | print("\nError: failed to parse the XML document\nExiting") 108 | 109 | 110 | def main(): 111 | parser = RichArgumentParser("Estimate T2T data size") 112 | parser.add_argument("--http-link", dest="http_link", 113 | default="https://s3-us-west-2.amazonaws.com/human-pangenomics?/delimiter=/&prefix=T2T", 114 | help="T2T file/directory S3 link, default:\nhttps://s3-us-west-2.amazonaws.com/human-pangenomics?/delimiter=/&prefix=T2T") 115 | parser.add_argument("--output", dest="output", default=None, 116 | help="a tsv file containing the file information, default: None") 117 | args = parser.parse_args() 118 | 119 | xml_str = generate_xml(args.http_link) 120 | estimate_size(xml_str, args.output) 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /scripts/mapping_statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import csv 4 | import fileinput 5 | import os 6 | import re 7 | from decimal import * 8 | 9 | 10 | """ 11 | How to assess the quality of metagenomics assembly 12 | https://www.biostars.org/p/128629/#128639 13 | 14 | Brian Bushnell said: 15 | calculate the percentage of reads that map back to the assembly 16 | if only 50% of your reads map to the assembly, it is not very complete 17 | but if 95% of your reads map to the assembly, then even if it is 18 | somewhat fragmented, that's probably very good 19 | 20 | It might also help to look at the percentage of properly paired reads to 21 | detect any chimeras, something that seems especially relevant in a 22 | metagenome assembly 23 | 24 | the most useful tool is QUAST, a quality assessment tool for 25 | genome assemblies; this script calculates the mapping rate from samtools flagstat output 26 | 27 | http://genomebio.org/alignment-stats-bwa/ 28 | getting alignment stats out of bwa 29 | 30 | bwa mem -t 6 ref read.1.fq read.2.fq \ 31 | | samtools view -@6 -Sbh - \ 32 | | tee >(samtools flagstat - > stats.out) > aln.bam 33 | 34 | http://www.pnas.org/content/pnas/113/42/11901.full.pdf 35 | Deep sequencing of 10,000 human genomes (Amalio Telenti and J. Craig Venter) 36 | sequencing depth = read length × number of mapped reads / reference sequence length 37 | 38 | metabat_coverage 39 | concoct_coverage 40 | checkm_coverage 41 | """ 42 | 43 | 44 | def mapping_rate(flagstats, out_file, method): 45 | """ 46 | get alignment rate from sorted bam file 47 | samtools flagstat --threads 8 sample.sort.bam 48 | """ 49 | headers = [ 50 | 'sample_id', 'total_num', 'read_1_num', 'read_2_num', 'mapping_type', 51 | 'mapped_num', 'mapped_rate', 'paired_num', 'paired_rate', 52 | 'singletons_num', 
'singletons_rate', 'mate_mapped_num', 53 | 'mate_mapped_num_mapQge5' 54 | ] 55 | mapping_info = [] 56 | getcontext().prec = 8 57 | 58 | # with open(flagstat_list, 'r') as list_handle: 59 | if method == 1: 60 | list_handle = open(flagstats, 'r') 61 | if method == 2: 62 | list_handle = flagstats 63 | 64 | for flagstat_file in list_handle: 65 | info = {} 66 | info['sample_id'] = os.path.basename( 67 | flagstat_file.strip()).split('.')[0] 68 | stat_list = open(flagstat_file.strip(), 'r').readlines() 69 | info['total_num'] = stat_list[0].split(' ')[0] 70 | info['read_1_num'] = stat_list[6].split(' ')[0] 71 | info['read_2_num'] = stat_list[7].split(' ')[0] 72 | 73 | mapped = re.split(r'\(|\s+', stat_list[4]) 74 | info['mapped_num'] = mapped[0] 75 | info['mapped_rate'] = Decimal(mapped[5].rstrip('%')) / Decimal(100) 76 | 77 | paired = re.split(r'\(|\s+', stat_list[8]) 78 | info['paired_num'] = paired[0] 79 | paired_rate = paired[6].rstrip('%') 80 | if paired_rate != "N/A": 81 | info['paired_rate'] = Decimal(paired_rate) / Decimal(100) 82 | info['mapping_type'] = "paired-end" 83 | else: 84 | info['paired_rate'] = paired_rate 85 | info["mapping_type"] = "single-end" 86 | 87 | singletons = re.split(r'\(|\s+', stat_list[-3]) 88 | info['singletons_num'] = singletons[0] 89 | singletons_rate = singletons[5].rstrip('%') 90 | if singletons_rate != "N/A": 91 | info['singletons_rate'] = Decimal(singletons_rate) / Decimal(100) 92 | else: 93 | info['singletons_rate'] = singletons_rate 94 | 95 | info['mate_mapped_num'] = re.split(r'\(|\s+', stat_list[-2])[0] 96 | info['mate_mapped_num_mapQge5'] = re.split(r'\(|\s+', stat_list[-1])[0] 97 | mapping_info.append(info) 98 | 99 | with open(out_file, 'w') as out_handle: 100 | f_tsv = csv.DictWriter(out_handle, headers, delimiter='\t') 101 | f_tsv.writeheader() 102 | f_tsv.writerows(mapping_info) 103 | 104 | 105 | def main(): 106 | """main funciton""" 107 | parser = argparse.ArgumentParser( 108 | description='compute alignment rate based bam file') 109 | parser.add_argument( 110 | '-statlist', default=None, type=str, help='sorted bam file list') 111 | parser.add_argument( 112 | '-statfiles', default=None, nargs='*', help='sorted bam file') 113 | parser.add_argument( 114 | '-outfile', type=str, help='output alignment rate file') 115 | args = parser.parse_args() 116 | if args.statlist: 117 | method = 1 118 | mapping_rate(args.statlist, args.outfile, method) 119 | if args.statfiles: 120 | method = 2 121 | mapping_rate(args.statfiles, args.outfile, method) 122 | 123 | 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /metapi/checkmer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | import concurrent.futures 6 | import subprocess 7 | import pandas as pd 8 | import numpy as np 9 | from natsort import index_natsorted 10 | 11 | 12 | def checkm_prepare(gene_table, batch_num, mags_dir): 13 | os.makedirs(mags_dir, exist_ok=True) 14 | 15 | table_df = pd.read_csv(gene_table, sep="\t") 16 | table_df = table_df.sort_values( 17 | by="bin_id", 18 | key=lambda x: np.argsort( 19 | index_natsorted(table_df["bin_id"]))).reset_index(drop=True) 20 | 21 | batchid = -1 22 | if len(table_df) > 0: 23 | for batch in range(0, len(table_df), batch_num): 24 | batchid += 1 25 | table_split = table_df.iloc[batch:batch+batch_num, ] 26 | table_split.to_csv( 27 | os.path.join(mags_dir, f"mags_input.{batchid}.tsv"), 28 
| sep="\t", index=False, header=None) 29 | else: 30 | subprocess.run(f'''touch {os.path.join(mags_dir, "mags_input.0.tsv")}''', shell=True) 31 | 32 | 33 | def MIMAG_quality_level(row): 34 | """ 35 | https://doi.org/10.1038/nbt.3893 36 | """ 37 | if (row["completeness"] > 90.0) and (row["contamination"] < 5.0): 38 | return "high_quality" 39 | elif (row["completeness"] > 50.0) and (row["contamination"] < 10.0): 40 | return "medium_quality" 41 | else: 42 | return "low_quality" 43 | 44 | 45 | def SGB_quality_level(row): 46 | """ 47 | https://doi.org/10.1016/j.cell.2019.01.001 48 | """ 49 | if ( 50 | (row["strain_heterogeneity"] < 0.5) 51 | and (row["completeness"] > 90.0) 52 | and (row["contamination"] < 5.0) 53 | ): 54 | return "high_quality" 55 | elif (row["completeness"] > 50.0) and (row["contamination"] < 5.0): 56 | return "medium_quality" 57 | else: 58 | return "low_quality" 59 | 60 | 61 | def quality_score(row): 62 | """ 63 | https://doi.org/10.1038/s41586-019-0965-1 64 | """ 65 | return row["completeness"] - 5 * row["contamination"] 66 | 67 | 68 | def parse_checkm_table(checkm_table): 69 | if os.path.getsize(checkm_table) > 0: 70 | checkm_df = pd.read_csv(checkm_table, sep="\t") 71 | return checkm_df 72 | else: 73 | return None 74 | 75 | 76 | def checkm_reporter(checkm_list, output, threads): 77 | df_list = [] 78 | with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor: 79 | for df in executor.map(parse_checkm_table, checkm_list): 80 | if df is not None: 81 | df_list.append(df) 82 | 83 | df_ = pd.DataFrame( 84 | columns=[ 85 | "bin_id", 86 | "marker_lineage", 87 | "genomes", 88 | "markers", 89 | "marker_sets", 90 | "completeness", 91 | "contamination", 92 | "strain_heterogeneity", 93 | "MIMAG_quality_level", 94 | "SGB_quality_level", 95 | "quality_score"]) 96 | 97 | if len(df_list) >= 1: 98 | df_ = pd.concat(df_list).rename( 99 | columns={ 100 | "Bin Id": "bin_id", 101 | "Marker lineage": "marker_lineage", 102 | "# genomes": "genomes", 103 | "# markers": "markers", 104 | "# marker sets": "marker_sets", 105 | "Completeness": "completeness", 106 | "Contamination": "contamination", 107 | "Strain heterogeneity": "strain_heterogeneity", 108 | } 109 | ) 110 | 111 | if not df_.empty: 112 | df_["MIMAG_quality_level"] = df_.apply( 113 | lambda x: MIMAG_quality_level(x), axis=1) 114 | df_["SGB_quality_level"] = df_.apply( 115 | lambda x: SGB_quality_level(x), axis=1) 116 | df_["quality_score"] = df_.apply(lambda x: quality_score(x), axis=1) 117 | 118 | if output is not None: 119 | df_.to_csv(output, sep="\t", index=False) 120 | 121 | return df_ 122 | 123 | 124 | def main(): 125 | parser = argparse.ArgumentParser("CheckM reporter") 126 | parser.add_argument("--checkm_list", type=str, help="checkm out list") 127 | parser.add_argument("--output", type=str, required=True, 128 | help="checkm output file") 129 | parser.add_argument( 130 | "--threads", type=int, default=8, help="threads used on combine CheckM output" 131 | ) 132 | args = parser.parse_args() 133 | 134 | checkm_list = [l.strip() for l in open(args.checkm_list, "r")] 135 | checkm_reporter(checkm_list, args.output, args.threads) 136 | 137 | 138 | if __name__ == "__main__": 139 | main() 140 | -------------------------------------------------------------------------------- /scripts/t2d_abundance_merger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import concurrent.futures 5 | import os 6 | import sys 7 | import argparse 8 | 
9 | def global_init(index_metadata): 10 | global INDEX_METADATA__ 11 | INDEX_METADATA__ = pd.read_csv(index_metadata, sep='\t') 12 | 13 | 14 | def get_mgs_id(row): 15 | return "_".join(row["ID"].split("_")[0:-1]) 16 | 17 | 18 | def get_abun_df_hsx(abun_file): 19 | sample_id = os.path.basename(abun_file).split(".")[0] 20 | 21 | try: 22 | if os.path.exists(abun_file): 23 | abun = pd.read_csv(abun_file, sep='\t') 24 | else: 25 | print("%s does not exist" % abun_file) 26 | return None, None 27 | except pd.errors.EmptyDataError: 28 | print("%s is empty" % abun_file) 29 | return None, None 30 | 31 | abun["mgs_id"] = abun.apply(get_mgs_id, axis=1) 32 | 33 | count_df = abun.loc[:, ["mgs_id", "reads_pairs"]]\ 34 | .groupby("mgs_id")\ 35 | .agg({"reads_pairs": 'sum'})\ 36 | .rename(columns={"reads_pairs": sample_id}) 37 | abun_df = abun.loc[:, ["mgs_id", "gene_abundance"]]\ 38 | .groupby("mgs_id")\ 39 | .agg({"gene_abundance": 'sum'})\ 40 | .rename(columns={"gene_abundance": sample_id}) 41 | return count_df, abun_df 42 | 43 | 44 | def get_abun_df_jgi(depth_file): 45 | sample_id = os.path.basename(depth_file).split(".")[0] 46 | 47 | try: 48 | if os.path.exists(depth_file): 49 | depth = pd.read_csv(depth_file, sep='\t') 50 | else: 51 | print("%s does not exist" % depth_file) 52 | return None, None 53 | except pd.errors.EmptyDataError: 54 | print("%s is empty" % depth_file) 55 | return None, None 56 | 57 | depth = depth.rename(columns={"contigName": "contig_name"})\ 58 | .merge(INDEX_METADATA__)\ 59 | .groupby("mgs_id")\ 60 | .agg({"totalAvgDepth": "mean"}) 61 | depth[sample_id] = depth["totalAvgDepth"] / sum(depth["totalAvgDepth"]) 62 | depth_df = depth.loc[:, ["totalAvgDepth"]].rename(columns={"totalAvgDepth": sample_id}) 63 | abun_df = depth.loc[:, [sample_id]] 64 | return depth_df, abun_df 65 | 66 | 67 | def get_all_abun_df(abun_files, workers, func): 68 | count_list = [] 69 | abun_list = [] 70 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 71 | for count_df, abun_df in executor.map(func, abun_files): 72 | if (count_df is not None) and (abun_df is not None): 73 | count_list.append(count_df) 74 | abun_list.append(abun_df) 75 | 76 | count_df_ = pd.concat(count_list, axis=1).reset_index() 77 | abun_df_ = pd.concat(abun_list, axis=1).reset_index() 78 | 79 | return count_df_, abun_df_ 80 | 81 | 82 | def main(): 83 | parser = argparse.ArgumentParser('merge per-sample abundance files into one profile') 84 | parser.add_argument( 85 | '-l', 86 | '--abundance_list', 87 | type=str, 88 | help='abundance list') 89 | parser.add_argument( 90 | '--method', 91 | default="hsx", 92 | choices=["hsx", "jgi"], 93 | help='compute method' 94 | ) 95 | parser.add_argument( 96 | '--database', 97 | default=None, 98 | help='contig and genome relationships' 99 | ) 100 | parser.add_argument( 101 | '--threads', 102 | default=8, 103 | type=int, 104 | help='threads' 105 | ) 106 | parser.add_argument( 107 | '--out_count_profile', 108 | type=str, 109 | help='output count profile') 110 | parser.add_argument( 111 | '--out_abundance_profile', 112 | type=str, 113 | help='output abundance profile') 114 | args = parser.parse_args() 115 | 116 | abun_files = pd.read_csv(args.abundance_list, names=["path"])\ 117 | .loc[:, "path"].values 118 | 119 | if args.method == "jgi" and args.database is None: 120 | print("please supply --database when parsing jgi depth files") 121 | sys.exit(1) 122 | 123 | if args.method == "hsx": 124 | count_df, abun_df = get_all_abun_df(abun_files, args.threads, get_abun_df_hsx) 125 | elif 
args.method == "jgi": 126 | global_init(args.database) 127 | count_df, abun_df = get_all_abun_df(abun_files, args.threads, get_abun_df_jgi) 128 | else: 129 | print("unsupport method: %s" % args.method) 130 | 131 | count_df.to_csv(args.out_count_profile, sep='\t', index=False) 132 | abun_df.to_csv(args.out_abundance_profile, sep='\t', index=False) 133 | 134 | 135 | if __name__ == '__main__': 136 | main() 137 | --------------------------------------------------------------------------------