├── setup.cfg ├── tests ├── test_bash │ ├── test.sh │ └── Snakefile ├── test_executor │ ├── test.sh │ ├── test.py │ └── Snakefile └── test_spades │ ├── test.sh │ ├── metapi.yaml │ └── Snakefile ├── docs ├── mag_workflow.png └── metapi.dio ├── metapi ├── profiles │ ├── lsf │ │ ├── lsf_jobscript.sh │ │ ├── config.yaml │ │ ├── CookieCutter.py │ │ ├── lsf_config.py │ │ ├── OSLayer.py │ │ └── memory_units.py │ ├── slurm │ │ ├── slurm-jobscript.sh │ │ ├── settings.json │ │ ├── config.yaml │ │ ├── CookieCutter.py │ │ ├── slurm-status.py │ │ └── slurm-submit.py │ ├── pbs-torque │ │ ├── pbs-jobscript.sh │ │ ├── config.yaml │ │ └── pbs-status.py │ ├── sge │ │ ├── sge-jobscript.sh │ │ ├── config.yaml │ │ └── sge-status.py │ └── generic │ │ ├── config.yaml │ │ ├── lsf_status.py │ │ ├── pbs_status.py │ │ ├── key_mapping.yaml │ │ ├── cluster_config.yaml │ │ ├── slurm_status.py │ │ └── scheduler.py ├── __about__.py ├── envs │ ├── drep.yaml │ ├── kmcp.yaml │ ├── blast.yaml │ ├── cdhit.yaml │ ├── checkv.yaml │ ├── fastqc.yaml │ ├── multiqc.yaml │ ├── kneaddata.yaml │ ├── plass.yaml │ ├── quast.yaml │ ├── virsorter2.yaml │ ├── metabat2.yaml │ ├── taxonkit.yaml │ ├── gtdbtk.yaml │ ├── idba.yaml │ ├── simulate.yaml │ ├── spades.yaml │ ├── dastools.yaml │ ├── galah.yaml │ ├── predict.yaml │ ├── maxbin2.yaml │ ├── megahit.yaml │ ├── report.yaml │ ├── checkm.yaml │ ├── trimming.yaml │ ├── krakenuniq.yaml │ ├── raw.yaml │ ├── kraken2.yaml │ ├── phamb.yaml │ ├── align.yaml │ ├── concoct.yaml │ ├── deepvirfinder.yaml │ ├── vamb.yaml │ └── semibin.yaml ├── wrappers │ ├── concoct_postprocess.py │ ├── maxbin2_postprocess.py │ ├── dastools_postprocess.py │ ├── hmmsearch_wrapper.py │ ├── vamb │ │ ├── write_abundances.py │ │ ├── abundances_mask.py │ │ ├── create_abundances.py │ │ └── concatenate.py │ ├── misc.py │ ├── prokka_wrapper.py │ ├── simulate_reads.py │ ├── prodigal_wrapper.py │ └── gtdbtk_postprocess.py ├── snakefiles │ ├── simulate_wf.smk │ └── gene_wf.smk ├── visualization │ └── dada2_stats_barplot.R ├── tooler.py ├── rules │ ├── qcreport.smk │ ├── simulate.smk │ ├── upload.smk │ └── binning_report.smk ├── predictor.py ├── __init__.py ├── simulator.py └── checkmer.py ├── requirements.txt ├── run_metapi.py ├── environment.yml ├── scripts ├── perl_test.pl ├── cout_seq_by_line.py ├── samples_validator.py ├── job.py ├── find_ATG.pl ├── contigs_filter_by_len.py ├── parse_mgs_profile.py ├── merge_sig_csv.py ├── get_bins_id.py ├── print_reads_length.py ├── batch_prokka.py ├── checkm_link.py ├── merge_fasta_by_len.py ├── filter_pe_fastq_by_len.py ├── fasta_length_tab.py ├── kraken2_reads_merger.py ├── animf_cluster.py ├── find_path.py ├── taxonomy_info_covert.py ├── get_prodigal_gbk_result.py ├── asm_status_wrapper.py ├── cut_up_fasta_concoct.py ├── aggregate_genomecov.py ├── contigs_from_sample.py ├── fastq_contig_size.py ├── split_fx.py ├── megahit_hadoop.sh ├── get_bin_id_by_ccsh.py ├── insert_size_ploter.py ├── megahit_sge.py ├── split_mummer.py ├── assembly_info.r ├── metapi_config_update.py ├── merge_checkm_out.py ├── clstr_szie_tab.py ├── rename_fasta_id.py ├── asub.py ├── kraken2_demultiplex_summary.py ├── qc_report.py ├── clean_statout_to_matrix.py ├── extract_bins_from_mgs_profile.py ├── filter_pe_fastq_by_size.py ├── post_assembly_binning.py ├── estimate_T2T_data_size.py ├── mapping_statistics.py └── t2d_abundance_merger.py ├── MANIFEST.in ├── .gitignore ├── .circleci └── config.yml └── setup.py /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | 
description-file = README.md 3 | -------------------------------------------------------------------------------- /tests/test_bash/test.sh: -------------------------------------------------------------------------------- 1 | snakemake --snakefile Snakefile -c 1 --until all 2 | -------------------------------------------------------------------------------- /tests/test_executor/test.sh: -------------------------------------------------------------------------------- 1 | 2 | snakemake --snakefile Snakefile -c 1 --until all -------------------------------------------------------------------------------- /docs/mag_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ohmeta/metapi/HEAD/docs/mag_workflow.png -------------------------------------------------------------------------------- /metapi/profiles/lsf/lsf_jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # properties = {properties} 3 | {exec_job} -------------------------------------------------------------------------------- /tests/test_spades/test.sh: -------------------------------------------------------------------------------- 1 | snakemake --snakefile Snakefile -c 1 --until all --use-conda 2 | -------------------------------------------------------------------------------- /metapi/profiles/slurm/slurm-jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # properties = {properties} 3 | {exec_job} 4 | -------------------------------------------------------------------------------- /metapi/profiles/pbs-torque/pbs-jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # properties = {properties} 3 | {exec_job} 4 | -------------------------------------------------------------------------------- /metapi/__about__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __version__ = '3.0.0' 4 | __author__ = "Jie Zhu, Fangming Yang, Ye Peng" 5 | -------------------------------------------------------------------------------- /metapi/envs/drep.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - drep=3.5.0 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/kmcp.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - kmcp=0.9.4 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /tests/test_spades/metapi.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - metapi=2.3.0 7 | -------------------------------------------------------------------------------- /metapi/envs/blast.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - blast=2.15.0 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/cdhit.yaml: -------------------------------------------------------------------------------- 1 
| channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - cd-hit=4.8.1 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/checkv.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - checkv=1.0.1 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/fastqc.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - fastqc=0.12.1 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/multiqc.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - multiqc=1.21 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/kneaddata.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - kneaddata=0.12.0 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/envs/plass.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - plass=4.687d7 7 | - pigz 8 | - jq 9 | -------------------------------------------------------------------------------- /metapi/envs/quast.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - quast=5.2.0 7 | - pigz 8 | - jq 9 | -------------------------------------------------------------------------------- /metapi/envs/virsorter2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - virsorter=2.2.4 7 | - pigz 8 | - jq -------------------------------------------------------------------------------- /metapi/profiles/sge/sge-jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # properties = {properties} 3 | 4 | # exit on first error 5 | set -o errexit 6 | 7 | {exec_job} 8 | -------------------------------------------------------------------------------- /metapi/envs/metabat2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - metabat2=2.15 7 | - pigz 8 | - jq 9 | -------------------------------------------------------------------------------- /metapi/envs/taxonkit.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - taxonkit 7 | - csvtk 8 | - pigz 9 | - jq -------------------------------------------------------------------------------- /metapi/envs/gtdbtk.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - gtdbtk=2.3.2 7 | - 
pandas 8 | - pigz 9 | - jq -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | ruamel.yaml 4 | snakemake >=7.0 5 | openpyxl 6 | natsort 7 | biopython >=1.73 8 | seaborn 9 | matplotlib 10 | executor -------------------------------------------------------------------------------- /tests/test_executor/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from executor import execute 5 | 6 | output = str(snakemake.output) 7 | execute(f'''touch {output}''') -------------------------------------------------------------------------------- /metapi/envs/idba.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - idba=1.1.3 7 | - seqtk 8 | - pigz 9 | - jq 10 | -------------------------------------------------------------------------------- /metapi/envs/simulate.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - insilicoseq=2.0.1 7 | - pandas 8 | - biopython 9 | - pigz -------------------------------------------------------------------------------- /metapi/envs/spades.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - spades=3.15.5 7 | - pigz 8 | - fd-find 9 | - jq 10 | -------------------------------------------------------------------------------- /metapi/envs/dastools.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - python 7 | - das_tool=1.1.7 8 | - pigz 9 | - jq 10 | -------------------------------------------------------------------------------- /metapi/profiles/slurm/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "SBATCH_DEFAULTS": "", 3 | "CLUSTER_NAME": "", 4 | "CLUSTER_CONFIG": "cluster.yaml", 5 | "ADVANCED_ARGUMENT_CONVERSION": "no" 6 | } -------------------------------------------------------------------------------- /metapi/envs/galah.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - galah=0.4.0 7 | - dashing 8 | - fastani 9 | - pigz 10 | - jq -------------------------------------------------------------------------------- /metapi/envs/predict.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - prodigal=2.6.3 7 | - prokka=1.14.6 8 | - pigz 9 | - jq 10 | -------------------------------------------------------------------------------- /run_metapi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | 6 | sys.path.insert(0, os.path.dirname(__file__)) 7 | 8 | from metapi.corer import main 9 | main() 10 | -------------------------------------------------------------------------------- /metapi/envs/maxbin2.yaml: -------------------------------------------------------------------------------- 1 
| channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - maxbin2=2.2.7 7 | - fraggenescan=1.31 8 | - pigz 9 | - jq 10 | -------------------------------------------------------------------------------- /metapi/envs/megahit.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - megahit=1.2.9 7 | - gfa1 8 | - pigz 9 | - fd-find 10 | - jq 11 | -------------------------------------------------------------------------------- /metapi/envs/report.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - seqtk=1.4 7 | - seqkit=2.8.0 8 | - bioawk=1.0 9 | - pigz 10 | - jq -------------------------------------------------------------------------------- /metapi/envs/checkm.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - checkm-genome=1.2.2 7 | - prodigal=2.6.3 8 | - pandas=1.5.1 9 | - pigz 10 | - jq -------------------------------------------------------------------------------- /metapi/envs/trimming.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - sickle-trim=1.33 7 | - fastp=0.23.4 8 | - trimmomatic=0.39 9 | - pigz 10 | - jq -------------------------------------------------------------------------------- /metapi/envs/krakenuniq.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - krakenuniq=1.0.4 7 | - bracken=2.9 8 | - jellyfish=1.0.3 9 | - pigz 10 | - jq 11 | -------------------------------------------------------------------------------- /metapi/envs/raw.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - python 7 | - coreutils 8 | - seqkit 9 | - pigz 10 | - jq 11 | - executor 12 | - sra-tools=3.0.3 -------------------------------------------------------------------------------- /metapi/envs/kraken2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - kraken2=2.1.3 7 | - krakentools=1.2 8 | - bracken=2.9 9 | - krona=2.8.1 10 | - pigz 11 | - jq 12 | -------------------------------------------------------------------------------- /metapi/profiles/pbs-torque/config.yaml: -------------------------------------------------------------------------------- 1 | cluster: "pbs-submit.py --depend \"{dependencies}\"" 2 | cluster-status: "pbs-status.py" 3 | jobscript: "pbs-jobscript.sh" 4 | jobs: 5000 5 | immediate-submit: False 6 | verbose: true 7 | notemp: true 8 | -------------------------------------------------------------------------------- /metapi/profiles/sge/config.yaml: -------------------------------------------------------------------------------- 1 | restart-times: 3 2 | jobscript: sge-jobscript.sh 3 | cluster: "sge-submit.py" 4 | cluster-status: "sge-status.py" 5 | max-jobs-per-second: 1 6 | max-status-checks-per-second: 1 7 | latency-wait: 60 8 | local-cores: 1 9 | 
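# Added note (not part of the original profile; invocation is illustrative): a profile directory like this
# is normally selected with Snakemake's --profile option, e.g. `snakemake --profile metapi/profiles/sge ...`,
# which makes Snakemake use the jobscript, submit and status scripts referenced above.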
-------------------------------------------------------------------------------- /metapi/envs/phamb.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - phamb=1.0.1 7 | - hmmer=3.4 8 | - pyhmmer=0.10.9 9 | - joblib 10 | - pandas 11 | - numpy 12 | - biopython 13 | - pigz 14 | - jq -------------------------------------------------------------------------------- /metapi/profiles/slurm/config.yaml: -------------------------------------------------------------------------------- 1 | restart-times: 3 2 | jobscript: "slurm-jobscript.sh" 3 | cluster-config: "cluster.yaml" 4 | cluster: "slurm-submit.py" 5 | cluster-status: "slurm-status.py" 6 | max-jobs-per-second: 10 7 | max-status-checks-per-second: 10 8 | latency-wait: 80 -------------------------------------------------------------------------------- /metapi/envs/align.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - minimap2=2.27 7 | - samtools=1.19.2 8 | - bwa=0.7.17 9 | - bwa-mem2=2.2.1 10 | - bowtie2=2.5.3 11 | - seqtk 12 | - seqkit 13 | - pigz 14 | - jq -------------------------------------------------------------------------------- /metapi/envs/concoct.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - concoct=1.1.0 7 | - libopenblas=*=openmp* 8 | - mkl 9 | - python>=3 10 | - samtools>=1.9 11 | - scikit-learn=1.1.* 12 | - pigz 13 | - jq 14 | variables: 15 | USE_OPENMP: 1 -------------------------------------------------------------------------------- /metapi/envs/deepvirfinder.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - nodefaults 5 | dependencies: 6 | - python=3.6 7 | - numpy 8 | - theano=1.0.3 9 | - keras=2.2.4 10 | - scikit-learn 11 | - Biopython 12 | - h5py=2.10.0 13 | - mkl-service=2.3.0 14 | - pigz 15 | - jq -------------------------------------------------------------------------------- /metapi/envs/vamb.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - pytorch 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - pigz 8 | - pytorch=*=*cuda11.3* 9 | - pysam 10 | - numpy 11 | - pandas 12 | - jq 13 | - pip 14 | - pip: 15 | - git+https://github.com/RasmussenLab/vamb@v4.1.3 16 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/config.yaml: -------------------------------------------------------------------------------- 1 | latency-wait: "5" 2 | jobscript: "lsf_jobscript.sh" 3 | use-conda: "False" 4 | use-singularity: "False" 5 | printshellcmds: "False" 6 | restart-times: "0" 7 | jobs: "500" 8 | cluster: "lsf_submit.py" 9 | cluster-status: "lsf_status.py" 10 | max-jobs-per-second: "10" 11 | max-status-checks-per-second: "10" -------------------------------------------------------------------------------- /metapi/envs/semibin.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - pytorch 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - pytorch=*=*cuda11.3* 8 | - mkl=2023.2.0 9 | - pigz 10 | - jq 11 | - pandas 12 | - numexpr=2.9.0 13 | - mmseqs2 14 | - hmmer 15 | - prodigal 16 
| - bedtools 17 | - samtools 18 | - fraggenescan 19 | - semibin=2.1.0 -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: metapi 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - nodefaults 6 | dependencies: 7 | - setuptools 8 | - pandas 9 | - numpy 10 | - ruamel.yaml 11 | - snakemake >=7.0 12 | - openpyxl 13 | - natsort 14 | - biopython >=1.73 15 | - seaborn 16 | - matplotlib 17 | - seqtk 18 | - seqkit 19 | - pigz 20 | - fd-find 21 | - executor -------------------------------------------------------------------------------- /tests/test_executor/Snakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env snakemake 2 | 3 | 4 | rule touch_1: 5 | output: 6 | "done1" 7 | script: 8 | "test.py" 9 | 10 | 11 | rule touch_2: 12 | output: 13 | "done2" 14 | run: 15 | from executor import execute 16 | execute(f'''touch {output}''') 17 | 18 | 19 | rule all: 20 | input: 21 | "done1", 22 | "done2" -------------------------------------------------------------------------------- /scripts/perl_test.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use warnings; 4 | my $a = "hello/world"; 5 | my $b = join('', $a, "/summary"); 6 | print "$a\n"; 7 | print "$b\n"; 8 | print "$ARGV[0]\n"; 9 | print "$ARGV[1]\n"; 10 | print "$ARGV[2]\n"; 11 | print "$ARGV[3]\n"; 12 | 13 | # e.g.: perl perl_test.pl a b c d 14 | # output: 15 | # hello/world 16 | # hello/world/summary 17 | # a 18 | # b 19 | # c 20 | # d -------------------------------------------------------------------------------- /metapi/profiles/generic/config.yaml: -------------------------------------------------------------------------------- 1 | restart-times: 0 2 | cluster-config: "cluster_config.yaml" #abs path 3 | cluster: "scheduler.py" # 4 | cluster-status: "slurm_status.py" # 5 | max-jobs-per-second: 10 6 | max-status-checks-per-second: 10 7 | cores: 99 # how many jobs you want to submit to your cluster queue 8 | local-cores: 1 9 | rerun-incomplete: true # recommended for cluster submissions 10 | keep-going: false 11 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md *.txt 2 | include LICENSE 3 | 4 | recursive-include metapi/ * 5 | recursive-include metapi/wrappers * 6 | recursive-include metapi/rules * 7 | recursive-include metapi/snakefiles * 8 | recursive-include metapi/envs * 9 | recursive-include metapi/config * 10 | 11 | global-exclude metapi/ *.pyc 12 | global-exclude metapi/__pycache__ *.pyc 13 | global-exclude metapi/wrappers/ *.pyc 14 | global-exclude metapi/wrappers/__pycache__ *.pyc 15 | -------------------------------------------------------------------------------- /scripts/cout_seq_by_line.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | with open(sys.argv[1], 'r') as handle: 6 | num = 0 7 | for line in handle: 8 | num += 1 9 | if num == int(sys.argv[2]): 10 | print(str(num) + ":\t" + line) 11 | break 12 | 13 | # awk 'NR==YOUR_LINE{print}' file_name 14 | # sed -n -e YOUR_LINEp file_name 15 | # perl -wnl -e '$. == YOUR_LINE and print and exit;' 16 | # less -SN +YOUR_LINEg file_name (nice!)
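# usage (inferred from the argv handling above; the file name and line number are illustrative):
#   python cout_seq_by_line.py input.txt 5    # prints "5:<TAB>..." for line 5 of input.txt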
-------------------------------------------------------------------------------- /scripts/samples_validator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import pandas as pd 3 | import sys 4 | import os 5 | 6 | 7 | def main(): 8 | samples = pd.read_csv(sys.argv[1], sep='\t').set_index("id", drop=False) 9 | for i in samples.index: 10 | fq1, fq2 = samples.loc[i, ["fq1", "fq2"]] 11 | if (not os.path.exists(fq1)) or (not os.path.exists(fq2)): 12 | print("error: %s\t%s\t%s" % (i, fq1, fq2)) 13 | 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /scripts/job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | 5 | from snakemake.utils import read_job_properties 6 | 7 | jobscript = sys.argv[1] 8 | job_properties = read_job_properties(jobscript) 9 | 10 | # do something useful with the threads 11 | threads = job_properties["threads"] 12 | 13 | # access property defined in the cluster configuration file (snakemake >=3.6.0) 14 | job_properties["cluster"]["time"] 15 | 16 | os.system("qsub -t {threads} {script}".format(threads=threads, script=jobscript)) 17 | -------------------------------------------------------------------------------- /metapi/wrappers/concoct_postprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import subprocess 6 | 7 | 8 | with os.scandir(sys.argv[1]) as itr: 9 | i = 0 10 | for entry in itr: 11 | bin_id, suffix = os.path.splitext(entry.name) 12 | if suffix == ".fa": 13 | i += 1 14 | subprocess.run('''mv %s %s''' \ 15 | % (os.path.join(sys.argv[1], entry.name), 16 | os.path.join(sys.argv[2] + "." 
+ str(i) + ".fa")), shell=True) -------------------------------------------------------------------------------- /scripts/find_ATG.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use warnings; 4 | # QQ group: perlchina 5 | # question: find longest ATG+ sequences 6 | 7 | my $seq = "ATGATGASFSAGATGATGATGSFAATGATGATGATGDSFS"; 8 | 9 | my @atg = $seq =~ /((ATG)+)/g; 10 | my @atg_len = map { length($_) } @atg; 11 | print "@atg\n"; 12 | print "@atg_len\n\n"; 13 | 14 | print((sort {$b cmp $a} ($seq =~ /(?:ATG)+/g))[0]); 15 | print "\n\n"; 16 | 17 | my @atg_2 = $seq =~ /(?:ATG)+/g; 18 | my @atg_len_2 = map { length($_) } @atg_2; 19 | print "@atg_2\n"; 20 | print "@atg_len_2\n"; -------------------------------------------------------------------------------- /metapi/profiles/generic/lsf_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import os 5 | import sys 6 | import warnings 7 | import subprocess 8 | 9 | 10 | jobid = sys.argv[1] 11 | 12 | out= subprocess.run(['bjobs','-noheader',jobid],stdout=subprocess.PIPE).stdout.decode('utf-8') 13 | 14 | state = out.strip().split()[2] 15 | 16 | 17 | map_state={"PEND":'running', 18 | "RUN":'running', 19 | "PROV":"running", 20 | "WAIT":'running', 21 | "DONE":'success', 22 | "":'success'} 23 | 24 | print(map_state.get(state,'failed')) 25 | -------------------------------------------------------------------------------- /metapi/snakefiles/simulate_wf.smk: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env snakemake 3 | 4 | import sys 5 | from pprint import pprint 6 | import pandas as pd 7 | 8 | from snakemake.utils import min_version 9 | min_version("7.0") 10 | shell.executable("bash") 11 | 12 | import metapi 13 | 14 | METAPI_DIR = metapi.__path__[0] 15 | WRAPPER_DIR = os.path.join(METAPI_DIR, "wrappers") 16 | DATA_DIR = os.path.join(METAPI_DIR, "data") 17 | 18 | 19 | SAMPLES, DATA_TYPE = metapi.parse_samples(config["params"]["samples"]) 20 | 21 | 22 | include: "../rules/simulate.smk" 23 | 24 | 25 | rule all: 26 | input: 27 | rules.simulate_all.input -------------------------------------------------------------------------------- /metapi/wrappers/maxbin2_postprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | with os.scandir(sys.argv[1]) as itr: 8 | for entry in itr: 9 | bin_id, bin_suffix = os.path.splitext(entry.name) 10 | bin_name, cluster_num = bin_id.rsplit(".", maxsplit=1) 11 | bin_id = bin_name + "." 
+ cluster_num.lstrip("0") 12 | if bin_suffix == ".fasta": 13 | subprocess.run('''mv %s %s''' \ 14 | % (os.path.join(sys.argv[1], entry.name), 15 | os.path.join(sys.argv[1], bin_id + ".fa")), shell=True) 16 | 17 | 18 | -------------------------------------------------------------------------------- /metapi/wrappers/dastools_postprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import glob 6 | import subprocess 7 | 8 | 9 | bins_prefix = sys.argv[1] #.replace("dastools.bin", "") 10 | 11 | mags_list = glob.glob(os.path.join(sys.argv[1] + "_DASTool_bins", "*.fa")) 12 | 13 | if len(mags_list) > 0: 14 | for bin_fa in mags_list: 15 | if (os.path.getsize(bin_fa) > 0) and (not "*" in bin_fa): 16 | binner = os.path.basename(bin_fa).split(".")[0] 17 | if (binner != "unbinned") and (binner != "*"): 18 | bin_fa_ = bins_prefix + "." + os.path.basename(bin_fa).replace(binner, f'''{binner}_dastools.bin''') 19 | subprocess.run(f'''mv {bin_fa} {bin_fa_}''', shell=True) -------------------------------------------------------------------------------- /scripts/contigs_filter_by_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from contigs_to_gene import cut_fasta_by_len 3 | import argparse 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser(description="cut fasta by len") 7 | parser.add_argument('-fa', type=str, help='scaffolds or contigs file path') 8 | parser.add_argument('-sclen', type=int, help='scaffold or contigs length cutoff, default: 500', default=500) 9 | parser.add_argument('-outdir', type=str, help='output dir store gene prediction results') 10 | parser.add_argument('-prefix', type=str, help='prefix for file name') 11 | args = parser.parse_args() 12 | cut_fasta_by_len(args.fa, args.sclen, args.outdir, args.prefix, ".fa") 13 | 14 | if __name__ == '__main__': 15 | main() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | .vscode 3 | 4 | # pycharm 5 | .idea 6 | 7 | # kdevelop 8 | .kdev4 9 | metapi.kdev4 10 | 11 | # snakemake 12 | .snakemake 13 | 14 | metapi/__pycache__/ 15 | metapi/*.pyc 16 | 17 | # pipenv 18 | build/ 19 | dist/ 20 | metapi.egg-info/ 21 | release 22 | 23 | conda/*.gz 24 | 25 | notebooks/ 26 | notebook 27 | 28 | # test 29 | test/ 30 | test/simulation_test/metaconfig.yaml 31 | 32 | # examples 33 | example/basic_test/data/01.trimmed 34 | example/basic_test/data/02.assembly 35 | example/basic_test/data/03.alignment 36 | example/basic_test/data/04.binning 37 | example/basic_test/data/05.checkm 38 | example/basic_test/data/logs 39 | examples/simulation_test 40 | examples 41 | 42 | # docs 43 | docs/hello.py 44 | 45 | # others 46 | index.list 47 | data.tar.gz -------------------------------------------------------------------------------- /scripts/parse_mgs_profile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | from pprint import pprint 6 | 7 | 8 | def parse(mgs_profile): 9 | count = 0 10 | with open(mgs_profile, 'r') as ih: 11 | for line in ih: 12 | line_list = re.split(r"\s+|,", line) 13 | cag_id = line_list[0] 14 | seq_count = line_list[1] 15 | seq_id_list = line_list[2:] 16 | count += 1 17 | a = 0 18 | if count == 1: 19 | for i in seq_id_list: 20 | print(i) 21 | a += 1 22 | 
print(a) 23 | print(seq_count) 24 | break 25 | 26 | 27 | def main(): 28 | parse(sys.argv[1]) 29 | 30 | 31 | if __name__ == '__main__': 32 | main() 33 | -------------------------------------------------------------------------------- /metapi/profiles/generic/pbs_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import subprocess 5 | import xml.etree.cElementTree as ET 6 | 7 | jobid = sys.argv[1] 8 | 9 | try: 10 | res = subprocess.run("qstat -f -x {}".format(jobid), check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 11 | 12 | xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot() 13 | job_state = xmldoc.findall('.//job_state')[0].text 14 | 15 | if job_state == "C": 16 | exit_status = xmldoc.findall('.//exit_status')[0].text 17 | if exit_status == '0': 18 | print("success") 19 | else: 20 | print("failed") 21 | else: 22 | print("running") 23 | 24 | except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: 25 | print("failed") 26 | -------------------------------------------------------------------------------- /metapi/profiles/pbs-torque/pbs-status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import subprocess 5 | import xml.etree.cElementTree as ET 6 | 7 | jobid = sys.argv[1] 8 | 9 | try: 10 | res = subprocess.run("qstat -f -x {}".format(jobid), check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 11 | 12 | xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot() 13 | job_state = xmldoc.findall('.//job_state')[0].text 14 | 15 | if job_state == "C": 16 | exit_status = xmldoc.findall('.//exit_status')[0].text 17 | if exit_status == '0': 18 | print("success") 19 | else: 20 | print("failed") 21 | else: 22 | print("running") 23 | 24 | except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: 25 | print("failed") 26 | -------------------------------------------------------------------------------- /scripts/merge_sig_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import csv 4 | import argparse 5 | import pandas as pd 6 | 7 | def merge_csv(csvlist, output): 8 | frame = pd.DataFrame() 9 | list = [] 10 | with open(csvlist, 'r') as csv_l: 11 | for csv_f in csv_l: 12 | df = pd.read_csv(csv_f.strip(), index_col=None, header=0) 13 | list.append(df) 14 | frame = pd.concat(list) 15 | frame.to_csv(output) 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser(description="merge sourmash sigs to a csv file") 19 | parser.add_argument('-csvlist', type=str, help='a file contain sig file path list') 20 | parser.add_argument('-output', type=str, help='output csv file') 21 | args = parser.parse_args() 22 | merge_csv(args.csvlist, args.output) 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /metapi/wrappers/hmmsearch_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import pyhmmer 3 | import sys 4 | 5 | hmm_threads = int(sys.argv[1]) 6 | hmm_evalue = float(sys.argv[2]) 7 | hmm_tbl = sys.argv[3] 8 | hmm_db = sys.argv[4] 9 | hmm_seq = sys.argv[5] 10 | 11 | # reference 12 | # https://github.com/althonos/pyhmmer/issues/22 13 | 14 | alphabet = pyhmmer.easel.Alphabet.amino() 15 | 16 | with 
pyhmmer.easel.SequenceFile(hmm_seq, digital=True, alphabet=alphabet) as seq_file: 17 | sequences = list(seq_file) 18 | 19 | with open(hmm_tbl, "wb") as dst: 20 | with pyhmmer.plan7.HMMFile(hmm_db) as hmm_file: 21 | for i, hits in enumerate(pyhmmer.hmmsearch(hmm_file, sequences, cpus=hmm_threads, E=hmm_evalue)): 22 | hits.write(dst, format="targets", header=i==0) 23 | 24 | # example 25 | # python hmmsearch_wrapper.py 8 0.01 output.tbl virus.hmm test.faa 26 | -------------------------------------------------------------------------------- /scripts/get_bins_id.py: -------------------------------------------------------------------------------- 1 | ##!/usr/bin/env python 2 | import glob 3 | import os 4 | import pprint 5 | import sys 6 | 7 | import pandas 8 | 9 | 10 | def parse_mags(mags_dir): 11 | bin_list = [] 12 | pattern = mags_dir + "/*/*.fa" 13 | for bin in glob.glob(pattern): 14 | bin_dict = {} 15 | bin_fa = os.path.basename(bin) 16 | bin_id = bin_fa.rstrip(".fa") 17 | id = ".".join(bin_fa.split(".")[:-3]) 18 | bin_dict["bin_path"] = bin.strip() 19 | bin_dict["bin_id"] = bin_id 20 | bin_dict["id"] = id 21 | bin_list.append(bin_dict) 22 | pprint.pprint(bin_list) 23 | #bin_df = pandas.DataFrame(bin_list).set_index("bin_id", drop=False) 24 | #pprint.pprint(bin_df) 25 | # a = bin_df.loc["s1.bin.2", ["bin_path"]].dropna()[0] 26 | # print(a) 27 | 28 | 29 | parse_mags(sys.argv[1]) 30 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/CookieCutter.py: -------------------------------------------------------------------------------- 1 | class CookieCutter: 2 | """ 3 | Cookie Cutter wrapper 4 | """ 5 | 6 | @staticmethod 7 | def get_default_threads() -> int: 8 | return int("1") 9 | 10 | @staticmethod 11 | def get_default_mem_mb() -> int: 12 | return int("1024") 13 | 14 | @staticmethod 15 | def get_log_dir() -> str: 16 | return "logs/cluster" 17 | 18 | @staticmethod 19 | def get_default_queue() -> str: 20 | return "" 21 | 22 | @staticmethod 23 | def get_lsf_unit_for_limits() -> str: 24 | return "KB" 25 | 26 | @staticmethod 27 | def get_unknwn_behaviour() -> str: 28 | return "wait" 29 | 30 | @staticmethod 31 | def get_zombi_behaviour() -> str: 32 | return "ignore" 33 | 34 | @staticmethod 35 | def get_latency_wait() -> float: 36 | return float("5") 37 | -------------------------------------------------------------------------------- /metapi/profiles/slurm/CookieCutter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Based on lsf CookieCutter.py 3 | # 4 | import os 5 | import json 6 | 7 | d = os.path.dirname(__file__) 8 | with open(os.path.join(d, "settings.json")) as fh: 9 | settings = json.load(fh) 10 | 11 | 12 | class CookieCutter: 13 | 14 | SBATCH_DEFAULTS = settings['SBATCH_DEFAULTS'] 15 | CLUSTER_NAME = settings['CLUSTER_NAME'] 16 | CLUSTER_CONFIG = settings['CLUSTER_CONFIG'] 17 | ADVANCED_ARGUMENT_CONVERSION = settings['ADVANCED_ARGUMENT_CONVERSION'] 18 | 19 | @staticmethod 20 | def get_cluster_option() -> str: 21 | cluster = CookieCutter.CLUSTER_NAME 22 | if cluster != "": 23 | return f"--cluster={cluster}" 24 | return "" 25 | 26 | @staticmethod 27 | def get_advanced_argument_conversion() -> bool: 28 | val = {"yes": True, "no": False}[ 29 | CookieCutter.ADVANCED_ARGUMENT_CONVERSION 30 | ] 31 | return val 32 | -------------------------------------------------------------------------------- /scripts/print_reads_length.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | "get each reads length form fasta/fastq file" 3 | import argparse 4 | import gzip 5 | from Bio import SeqIO 6 | 7 | def print_len(infile, seqtype): 8 | '''print_len function''' 9 | if infile.endswith(".gz"): 10 | handle = gzip.open(infile, 'rt') 11 | else: 12 | handle = open(infile, 'rt') 13 | for reads in SeqIO.parse(handle, seqtype): 14 | print(reads.id, "\t", len(reads)) 15 | handle.close() 16 | 17 | 18 | def main(): 19 | '''main function''' 20 | parser = argparse.ArgumentParser(description='print each reads id and length info form fasta/fastq file') 21 | parser.add_argument('--infile', action='store', dest='infile', help='input fasta/fastq file') 22 | parser.add_argument('--seqtype', action='store', dest='seqtype', help='input file seq type, fasta or fastq') 23 | args = parser.parse_args() 24 | print_len(args.infile, args.seqtype) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /tests/test_bash/Snakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env snakemake 2 | 3 | rule download: 4 | output: 5 | r1 = "ecoli_1K.1.fq.gz", 6 | r2 = "ecoli_1K.2.fq.gz" 7 | threads: 8 | 1 9 | shell: 10 | ''' 11 | curl -o ecoli_1K.1.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_1.fq.gz 12 | curl -o ecoli_1K.2.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_2.fq.gz 13 | ''' 14 | 15 | 16 | rule decompress: 17 | input: 18 | r1 = "ecoli_1K.1.fq.gz", 19 | r2 = "ecoli_1K.2.fq.gz" 20 | output: 21 | r1 = "ecoli_1K.1.fq", 22 | r2 = "ecoli_1K.2.fq" 23 | shell: 24 | ''' 25 | R1={input.r1} 26 | R2={input.r2} 27 | pigz -dc {input.r1} > ${{R1%.gz}} 28 | pigz -dc {input.r2} > ${{R2%.gz}} 29 | ''' 30 | 31 | 32 | rule all: 33 | input: 34 | "ecoli_1K.1.fq", 35 | "ecoli_1K.2.fq" 36 | -------------------------------------------------------------------------------- /scripts/batch_prokka.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | import pprint 6 | import sys 7 | 8 | 9 | def run(dir_list, outdir, logdir): 10 | cmd_list = [] 11 | count = 1 12 | with open(dir_list) as f: 13 | for dir in f: 14 | count += 1 15 | bin_list = glob.glob(dir.strip() + "/*.fa") 16 | for bin in bin_list: 17 | bin_id = os.path.basename(bin).rstrip(".fa") 18 | prokka_dir = os.path.join(outdir, 19 | os.path.basename(dir.strip())) 20 | log = os.path.join(logdir, bin_id + ".prokka.log") 21 | cmd = "prokka %s --outdir %s --prefix %s --kingdom Bacteria --cpus 8 2> %s" % ( 22 | bin.strip(), prokka_dir, bin_id, log) 23 | cmd_list.append(cmd) 24 | if count == 2: 25 | break 26 | return cmd_list 27 | 28 | 29 | cmd_list = run(sys.argv[1], sys.argv[2], sys.argv[3]) 30 | pprint.pprint(cmd_list) 31 | -------------------------------------------------------------------------------- /metapi/profiles/generic/key_mapping.yaml: -------------------------------------------------------------------------------- 1 | # only parameters defined in key_mapping (see below) are passed to the command in the order specified. 
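# Illustrative example (added): with the slurm entry below, a job with name=assembly, threads=8 and mem=16
# would be submitted roughly as: sbatch --parsable --job-name=assembly -n 8 --mem=16g <jobscript>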
2 | system: "slurm" #check if system is defined below 3 | 4 | slurm: 5 | command: "sbatch --parsable" 6 | key_mapping: 7 | name: "--job-name={}" 8 | threads: "-n {}" 9 | mem: "--mem={}g" 10 | account: "--account={}" 11 | queue: "--partition={}" 12 | time: "--time={}" 13 | nodes: "-N {}" 14 | pbs: 15 | command: "qsub" 16 | key_mapping: 17 | name: "-N {}" 18 | account: "-A {}" 19 | queue: "-q {}" 20 | threads: "-l nodes=1:ppn={}" # always use 1 node 21 | mem: "-l mem={}gb" 22 | time: "-l walltime={}00" #min= seconds x 100 23 | lsf: 24 | command: "bsub -e lsf_%J.log -o lsf_%J.log" 25 | key_mapping: 26 | queue: "-q {}" 27 | name: "-J {}" 28 | threads: "-n {}" 29 | mem: '-R "rusage[mem={}000]"' 30 | account: "-P {}" 31 | nodes: "-C {}" 32 | 33 | 34 | 35 | # for other cluster systems see: https://slurm.schedmd.com/rosetta.pdf 36 | -------------------------------------------------------------------------------- /metapi/profiles/generic/cluster_config.yaml: -------------------------------------------------------------------------------- 1 | ## This is a yaml file, defining options for specific rules or by default. 2 | ## The '#' defines a comment. 3 | ## the two spaces at the beginning of rows below rulenames are important. 4 | ## For more information see https://snakemake.readthedocs.io/en/stable/executing/cluster-cloud.html#cluster-execution 5 | 6 | # default parameter for all rules 7 | __default__: 8 | #queue: normal 9 | nodes: 1 10 | 11 | 12 | # The following rules in atlas need need more time/memory. 13 | # If you need to submit them to different queues you can configure this as outlined. 14 | 15 | # run_megahit: 16 | # queue: bigmem 17 | # run_spades: 18 | # queue: bigmem 19 | 20 | #gtdb-tk classify uses 'large_mem' and log time 21 | # classify: 22 | # queue: bigmem-long 23 | 24 | # run_checkm_lineage_wf: 25 | # queue: long 26 | 27 | # run_all_checkm_lineage_wf: 28 | # queue: long 29 | 30 | # You can overwrite values for specific rules 31 | # rulename: 32 | # queue: long 33 | # account: "" 34 | # time: # h 35 | # threads: 36 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Check https://circleci.com/docs/2.0/language-python/ for more details 2 | 3 | version: 2.1 4 | 5 | orbs: 6 | python: circleci/python@0.2.1 7 | 8 | jobs: 9 | build-and-test: 10 | executor: python/default 11 | steps: 12 | - run: 13 | name: conda create 14 | command: | 15 | ls $HOME 16 | if [ ! -d "/home/circleci/conda" ]; then 17 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 18 | /bin/bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/conda 19 | else 20 | echo "Miniconda is already installed, continuing to build." 
21 | fi 22 | - save_cache: 23 | paths: 24 | - /home/circleci/conda 25 | key: v2-dependencies 26 | 27 | - run: 28 | name: conda build 29 | command: | 30 | cd ~/metapi 31 | /bin/bash ~/metapi/conda 32 | conda build ./ 33 | - store_artifacts: 34 | path: ~/repo/build 35 | destination: singularity-containers 36 | 37 | workflows: 38 | main: 39 | jobs: 40 | - build-and-test 41 | -------------------------------------------------------------------------------- /metapi/wrappers/vamb/write_abundances.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import vamb 4 | from pathlib import Path 5 | 6 | 7 | def write_abundances( 8 | mask_refhash: Path, bampath: Path, min_identity: float, outfile: Path 9 | ): 10 | """For every sample, compute the abundances given the mask and refhashes""" 11 | loadnpz = np.load(mask_refhash) 12 | refhash = loadnpz["refhash"] 13 | mask = loadnpz["mask"] 14 | refhash = refhash.reshape(1)[0] 15 | (abundance, _) = vamb.parsebam.Abundance.run_pycoverm( 16 | paths=[bampath], 17 | minid=min_identity, 18 | target_refhash=refhash, 19 | target_identifiers=None, 20 | mask=mask, 21 | ) 22 | vamb.vambtools.write_npz(outfile, abundance.ravel()) 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("--msk", type=Path, help="mask refhash") 28 | parser.add_argument("--b", type=Path, help=" bam path") 29 | parser.add_argument("--min_id", type=float, help="min identity for alignment") 30 | parser.add_argument("--out", type=Path, help="abundances outfile") 31 | 32 | opt = parser.parse_args() 33 | 34 | write_abundances(opt.msk, opt.b, opt.min_id, opt.out) 35 | -------------------------------------------------------------------------------- /docs/metapi.dio: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /metapi/wrappers/vamb/abundances_mask.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | from vamb.vambtools import RefHasher 4 | from pathlib import Path 5 | 6 | 7 | def abundances_mask(headers: Path, mask_refhash: Path, min_contig_size: int): 8 | """# Using the headers above, compute the mask and the refhash""" 9 | 10 | mask = [] 11 | identifiers = [] 12 | 13 | with open(headers) as file: 14 | for line in file: 15 | # SN:S27C112075 LN:2239 16 | (sn, ln) = line.split("\t") 17 | if sn[:3] != "SN:" or ln[:3] != "LN:": 18 | raise ValueError("Unknown format") 19 | passed = int(ln[3:]) >= min_contig_size 20 | mask.append(passed) 21 | if passed: 22 | identifiers.append(sn[3:]) 23 | 24 | np.savez_compressed( 25 | mask_refhash, 26 | mask=np.array(mask, dtype=bool), 27 | refhash=RefHasher.hash_refnames(identifiers), 28 | ) 29 | 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("--h", type=Path, help=" Headers file") 34 | parser.add_argument("--msk", type=Path, help="mask refhash") 35 | 36 | parser.add_argument("--minsize", type=int, help="min contig size") 37 | 38 | opt = parser.parse_args() 39 | 40 | abundances_mask(opt.h, opt.msk, opt.minsize) 41 | -------------------------------------------------------------------------------- /scripts/checkm_link.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | 6 | 7 | def 
link(link_dir, batch_num, bin_list): 8 | bins = [] 9 | with open(bin_list, "r") as ih: 10 | for line in ih: 11 | bins.append(os.path.abspath(line.strip())) 12 | 13 | os.makedirs(link_dir, exist_ok=True) 14 | 15 | if len(bins) > 0: 16 | for batch_id in range(0, len(bins), batch_num): 17 | batch_dir = os.path.join(link_dir, "bins_%d" % batch_id) 18 | os.makedirs(batch_dir, exist_ok=True) 19 | 20 | for bin_fa in bins[batch_id : batch_id + batch_num]: 21 | os.symlink(bin_fa, os.path.join(batch_dir, os.path.basename(bin_fa))) 22 | else: 23 | os.makedirs(os.path.join(link_dir, "bins_0"), exist_ok=True) 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser("checkm link") 28 | parser.add_argument("--link_dir", help="a dir contains checkm input link") 29 | parser.add_argument( 30 | "--batch_num", 31 | type=int, 32 | default=500, 33 | help="how many bins each cehckm run, default: 500", 34 | ) 35 | parser.add_argument("--bin_list", help="a file contains all bin path") 36 | args = parser.parse_args() 37 | 38 | link(args.link_dir, args.batch_num, args.bin_list) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /metapi/wrappers/vamb/create_abundances.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import vamb 4 | from pathlib import Path 5 | 6 | 7 | def create_abundances( 8 | abundances: list[Path], mask_refhash: Path, min_id: float, outfile: Path 9 | ): 10 | """Merge the abundances to a single Abundance object and save it""" 11 | refhash = np.load(mask_refhash)["refhash"] 12 | 13 | n_samples = len(abundances) 14 | first = vamb.vambtools.read_npz(abundances[0]) 15 | print(len(first), n_samples) 16 | print(first.shape) 17 | matrix = np.empty((len(first), n_samples), dtype=np.float32) 18 | matrix[:, 0] = first 19 | for i, path in enumerate(abundances[1:]): 20 | matrix[:, i + 1] = vamb.vambtools.read_npz(path) 21 | abundance = vamb.parsebam.Abundance( 22 | matrix, [str(i) for i in abundances], min_id, refhash 23 | ) 24 | abundance.save(outfile) 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--msk", type=Path, help="mask refhash") 30 | parser.add_argument("--ab", type=Path, nargs="+", help=" abundancaes list of files") 31 | parser.add_argument("--min_id", type=float, help="min identity for alignment") 32 | parser.add_argument("--out", type=Path, help="abundances outfile") 33 | 34 | opt = parser.parse_args() 35 | 36 | create_abundances(opt.ab, opt.msk, opt.min_id, opt.out) 37 | -------------------------------------------------------------------------------- /scripts/merge_fasta_by_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import gzip 4 | import os 5 | 6 | from Bio import SeqIO 7 | 8 | 9 | def merge_fa_by_len(falist, minlen, maxlen, outfa): 10 | with open(falist, 'r') as falist_h, open(outfa, 'w') as out_h: 11 | for fa_file in falist_h: 12 | fa_file = fa_file.rstrip() 13 | if fa_file.endswith(".gz"): 14 | fa_h = gzip.open(fa_file, 'rt') 15 | else: 16 | fa_h = open(fa_file, 'r') 17 | for record in SeqIO.parse(fa_h, 'fasta'): 18 | if (len(record.seq) >= minlen) and (len(record.seq) <= maxlen): 19 | SeqIO.write(record, out_h, 'fasta') 20 | fa_h.close() 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser(description="merge many fasta file to a fasta file by length 
cutoff") 25 | parser.add_argument('--falist', type=str, help='input file contain fasta path list') 26 | parser.add_argument('--minlen', type=int, help='sequences min length cutoff', default=1) 27 | parser.add_argument('--maxlen', type=int, help='sequences max length cutoff', default=10000000000) 28 | parser.add_argument('--outfa', type=str, help='output fasta contain sequences which length between [minlen, maxlen]') 29 | args = parser.parse_args() 30 | 31 | merge_fa_by_len(args.falist, args.minlen, args.maxlen, args.outfa) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /metapi/wrappers/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | import subprocess 6 | import sys 7 | 8 | 9 | def link_or_cat(args): 10 | fq_gz = os.path.join(args.output_dir, args.basename + ".fq.gz") 11 | 12 | if (not os.path.exists(fq_gz)) or (os.path.getsize(fq_gz) == 0): 13 | subprocess.call( 14 | f'''rm -rf {fq_gz}''', 15 | shell=True, stdout=sys.stdout, stderr=sys.stderr) 16 | 17 | if len(args.input_file) == 1: 18 | reads = os.path.realpath(args.input_file[0]) 19 | subprocess.call( 20 | f''' 21 | pushd {args.output_dir} && \ 22 | ln -s {reads} {args.basename}.fq.gz && \ 23 | popd 24 | ''', shell=True, stdout=sys.stdout, stderr=sys.stderr) 25 | else: 26 | reads = " ".join(args.input_file) 27 | subprocess.call( 28 | f''' 29 | cat {reads} > {args.output_dir}/{args.basename}.fq.gz 30 | ''', shell=True, stdout=sys.stdout, stderr=sys.stderr) 31 | 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser("metapi misc") 35 | parser.add_argument("--basename", dest="basename") 36 | parser.add_argument("--input-file", dest="input_file", nargs="+") 37 | parser.add_argument("--output-dir", dest="output_dir") 38 | 39 | args = parser.parse_args() 40 | 41 | link_or_cat(args) 42 | 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /scripts/filter_pe_fastq_by_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import gzip 4 | from Bio import SeqIO, bgzf 5 | 6 | def filter_pe_fasq_by_len(fq_1, fq_2, minlen, prefix): 7 | '''filter pe reads by min length''' 8 | fq_1_ = prefix + ".gt" + str(minlen) + ".1.fq.gz" 9 | fq_2_ = prefix + ".gt" + str(minlen) + ".2.fq.gz" 10 | with bgzf.BgzfWriter(fq_1_, 'wb') as out_1, bgzf.BgzfWriter(fq_2_, 'wb') as out_2: 11 | with gzip.open(fq_1, 'rt') as in_1, gzip.open(fq_2, 'rt') as in_2: 12 | for rec_a, rec_b in zip(SeqIO.parse(in_1, 'fastq'), SeqIO.parse(in_2, 'fastq')): 13 | if (len(rec_a.seq) > minlen) and (len(rec_b.seq) > minlen): 14 | SeqIO.write(rec_a, out_1, 'fastq') 15 | SeqIO.write(rec_b, out_2, 'fastq') 16 | 17 | def main(): 18 | '''main function''' 19 | parser = argparse.ArgumentParser( 20 | description='filter fastq file by reads length') 21 | parser.add_argument('-1', '--read1', help='paired-end fastq file one') 22 | parser.add_argument('-2', '--read2', help='paired-end fastq file two') 23 | parser.add_argument('-l', '--minlen', type=int, default=80, 24 | help='remove reads if length < min-len') 25 | parser.add_argument('-p', '--prefix', 26 | help='output prefix') 27 | args = parser.parse_args() 28 | 29 | filter_pe_fasq_by_len(args.read1, args.read2, args.minlen, args.prefix) 30 | 31 | if __name__ == '__main__': 32 | 
main() 33 | -------------------------------------------------------------------------------- /metapi/visualization/dada2_stats_barplot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(ggplot2) 4 | 5 | 6 | dada2_stats_barplot <- function(df, stack=FALSE, pretty=FALSE) 7 | { 8 | df <- df %>% dplyr::arrange(`non-chimeric`) 9 | df_l <- df %>% 10 | dplyr::select("sample-id", "input", "filtered", "denoised", "non-chimeric") %>% 11 | tidyr::pivot_longer(!"sample-id", names_to="step", values_to="count") %>% 12 | dplyr::mutate(step=factor(step, 13 | levels=c("input", "filtered", "denoised", "non-chimeric")), 14 | `sample-id`=factor(`sample-id`, 15 | levels=df$`sample-id`)) 16 | 17 | position = position_dodge(0.8) 18 | if (stack) { position = "stack" } 19 | 20 | if (pretty) { 21 | p <- 22 | ggpubr::ggbarplot(df_l, x="sample-id", y="count", 23 | fill="step", color="step", x.text.angle=90, 24 | stat="identity", position=position) 25 | } else { 26 | p <- 27 | ggplot(df_l, aes(x=`sample-id`, y=count)) + 28 | geom_bar(aes(color=step, fill=step), 29 | stat="identity", position=position, width=0.7) + 30 | theme_classic() + 31 | theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5, 32 | size=12, color="black"), 33 | axis.text.y=element_text(size=12, color="black")) 34 | } 35 | 36 | print(p) 37 | return(p) 38 | } -------------------------------------------------------------------------------- /scripts/fasta_length_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from Bio.SeqIO.FastaIO import SimpleFastaParser 3 | import argparse 4 | """get each sequence length from a fasta file and pring it to a file, then plot""" 5 | 6 | def gen_fa_len_tab(fa_file, len_out): 7 | with open(len_out, 'w') as out_handle: 8 | with open(fa_file, 'r') as in_handle: 9 | for title, seq in SimpleFastaParser(in_handle): 10 | #out_handle.write(title + "\t" + str(len(seq))) 11 | # just print id and seq length 12 | out_handle.write(title.split(' ')[0] + "\t" + str(len(seq)) + "\n") 13 | 14 | # megahit contigs header contains contigs length info 15 | def gen_fa_len_tab_megahit(fa_file, len_out): 16 | with open(len_out, 'w') as out_handle: 17 | with open(fa_file, 'r') as in_handle: 18 | for title, seq in SimpleFastaParser(in_handle): 19 | # maybe wrong 20 | len = title.split(' ')[-1].split('=')[-1] 21 | out_handle.write(title + "\t" + len + "\n") 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser(description='get fasta length info') 25 | parser.add_argument('--fasta', type=str, help='fasta file') 26 | parser.add_argument('--out', type=str, help='fasta length output file') 27 | args = parser.parse_args() 28 | 29 | gen_fa_len_tab(args.fasta, args.out) 30 | 31 | # fasta input must contigs file which was assemblyed by megahit 32 | # gen_fa_len_tab(args.fasta, args.out) 33 | 34 | if __name__ == '__main__': 35 | main() -------------------------------------------------------------------------------- /scripts/kraken2_reads_merger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from glob import glob 7 | 8 | 9 | def merger_reads(inputdir, outputdir): 10 | merger = {} 11 | for i in glob(inputdir.rstrip("/") + "/*/*.1.fq.gz"): 12 | taxid = int(os.path.basename(i).split(".")[1]) 13 | if taxid in merger: 14 | merger[taxid].append(i) 15 | else: 16 | merger[taxid] = [i] 17 | 18 
| for taxid in merger: 19 | r1_str = " ".join(merger[taxid]) 20 | r2_str = r1_str.replace("1.fq.gz", "2.fq.gz") 21 | r1 = os.path.join(outputdir, "%d.1.fq.gz" % taxid) 22 | r2 = os.path.join(outputdir, "%d.2.fq.gz" % taxid) 23 | if len(merger[taxid]) > 1: 24 | cmd = 'cat %s > %s && rm -rf %s && cat %s > %s && rm -rf %s' % (r1_str, r1, r1_str, r2_str, r2, r2_str) 25 | print(cmd) 26 | else: 27 | cmd = 'mv %s %s && mv %s %s' % (r1_str, r1, r2_str, r2) 28 | print(cmd) 29 | 30 | 31 | def main(args_): 32 | parser = argparse.ArgumentParser("merge kraken2 partition reads of many samples") 33 | parser.add_argument( 34 | '-i', 35 | '--input_dir', 36 | help='a directory containing many sample-specific directories' 37 | ) 38 | parser.add_argument( 39 | '-o', 40 | '--output_dir', 41 | help='output directory' 42 | ) 43 | 44 | args = parser.parse_args(args_) 45 | merger_reads(args.input_dir, args.output_dir) 46 | 47 | 48 | if __name__ == '__main__': 49 | main(sys.argv[1:]) 50 | -------------------------------------------------------------------------------- /scripts/animf_cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import pandas as pd 5 | import drep 6 | 7 | def check_drep_exists(): 8 | try: 9 | from drep import argumentParser 10 | print("drep version: %s" % argumentParser.version()) 11 | except ImportError: 12 | print("drep is not installed") 13 | 14 | 15 | def cluster(Bdb, Cdb, work_dir): 16 | Ndb = pd.DataFrame() 17 | for bdb, name in drep.d_cluster.iteratre_clusters(Bdb, Cdb): 18 | genome_list = bdb["location"].tolist() 19 | anin_folder = os.path.join(work_dir, "ANImf_files") 20 | 21 | org_lengths = {} 22 | files = [] 23 | deltafiles = [] 24 | 25 | # genome1_vs_genome2.delta 26 | # genome1_vs_genome2.filtered.delta 27 | for g1 in genome_list: 28 | cur_folder = os.path.join(anin_folder, os.path.basename(g1)) 29 | org_lengths[os.path.basename(g1)] = \ 30 | drep.d_filter.calc_fasta_length(g1) 31 | for g2 in genome_list: 32 | file_name = "{0}/{1}_vs_{2}".format( 33 | cur_folder, 34 | os.path.basename(g1), 35 | os.path.basename(g2) 36 | ) 37 | deltafiles.append(file_name + ".filtered.delta") 38 | df = drep.d_cluster.process_deltafiles(deltafiles, 39 | org_lengths, 40 | coverage_method="larger") 41 | 42 | 43 | 44 | 45 | def main(): 46 | pass 47 | 48 | 49 | if __name__ == '__main__': 50 | main() -------------------------------------------------------------------------------- /scripts/find_path.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | def find_path(dir, suffix): 5 | path = {} 6 | for f in os.listdir(dir): 7 | if f.endswith(suffix): 8 | key = f.rstrip(suffix) 9 | path[key] = os.path.join(dir, f) 10 | return path 11 | 12 | def find_path_tag(dir, tag): 13 | if tag == "raw": 14 | r1 = {} 15 | r2 = {} 16 | for f in os.listdir(dir): 17 | if f.endswith("1.fq.gz"): 18 | key = f.rstrip(".|-|_" + "1.fq.gz") 19 | r1[key] = os.path.join(dir, f) 20 | if f.endswith("2.fq.gz"): 21 | key = f.rstrip(".|-|_" + "2.fq.gz") 22 | r2[key] = os.path.join(dir, f) 23 | return (r1, r2) 24 | elif tag == "clean" or tag == "rmhost": 25 | r1 = {} 26 | r2 = {} 27 | rs = {} 28 | rt = {} 29 | for f in os.listdir(dir): 30 | if f.endswith(tag + ".1.fq.gz"): 31 | key = f.rstrip(".|-|_" + tag + ".1.fq.gz") 32 | r1[key] = os.path.join(dir, f) 33 | if f.endswith(tag + ".2.fq.gz"): 34 | key = f.rstrip(".|-|_" + tag + ".2.fq.gz") 35 | r2[key] = os.path.join(dir, f) 36 
| if f.endswith(tag + ".single.fq.gz"): 37 | key = f.rstrip(".|-|_" + tag + ".single.fq.gz") 38 | rs[key] = os.path.join(dir, f) 39 | if f.endswith(tag + ".stat_out"): 40 | key = f.rstrip(".|-|_" + tag + ".stat_out") 41 | rt[key] = os.path.join(dir, f) 42 | return (r1, r2, rs, rt) 43 | -------------------------------------------------------------------------------- /metapi/wrappers/prokka_wrapper.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import time 4 | import subprocess 5 | 6 | 7 | PROKKA_SUFFIX = ["err", "log", "faa", "ffn", "fna", "fsa", 8 | "gbk", "gff", "sqn", "tbl", "tsv", "txt"] 9 | 10 | bin_list = glob.glob(snakemake.input["mags_dir"] + "/*.fa.gz") 11 | gff_count = 0 12 | 13 | for bin_fa in bin_list: 14 | bin_id = os.path.basename(os.path.splitext(os.path.splitext(bin_fa)[0])[0]) 15 | output_dir = os.path.join(snakemake.params["output_dir"], bin_id) 16 | gff_file = os.path.join(output_dir, bin_id + ".gff") 17 | 18 | subprocess.run(f'''echo "\nProcessing {bin_fa}\n" >> {snakemake.log}''', shell=True) 19 | 20 | # https://github.com/tseemann/prokka/pull/130 21 | # Uncompressing 1000's of gzip'ed fasta files just to run them through prokka can be a bit of pain. 22 | subprocess.run( 23 | f''' 24 | prokka <(zcat {bin_fa}) \ 25 | --force \ 26 | --centre X \ 27 | --compliant \ 28 | --cpus {snakemake.threads} \ 29 | --outdir {output_dir} \ 30 | --locustag {bin_id} \ 31 | --prefix {bin_id} \ 32 | --kingdom {snakemake.params["kingdom"]} \ 33 | 2>> {snakemake.log} 34 | ''', shell=True, executable="/bin/bash") 35 | 36 | if os.path.exists(gff_file): 37 | gff_count += 1 38 | 39 | if gff_count == len(bin_list): 40 | subprocess.run(f'''touch {snakemake.output["done"]}''', shell=True) 41 | 42 | for suffix in PROKKA_SUFFIX: 43 | prokka_f = os.path.join(output_dir, f'''{bin_id}.{suffix}''') 44 | if os.path.exists(prokka_f): 45 | subprocess.run(f'''pigz -f {prokka_f}''', shell=True) -------------------------------------------------------------------------------- /scripts/taxonomy_info_covert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import csv 4 | import os 5 | 6 | def parse_lca_classify(taxonomy_csv, output): 7 | # taxonomy = ['superkingdom', 'phylum', 'order', 'class', 'family', 'genus', 'species'] 8 | headers = ["ID", "status", "lineage"] 9 | rows = [] 10 | with open(taxonomy_csv, 'r') as csv_h: 11 | f_csv = csv.DictReader(csv_h) 12 | # print(type(f_csv)) 13 | for row in f_csv: 14 | row_dict = {} 15 | row_dict["ID"] = os.path.basename(row["ID"]) 16 | row_dict["status"] = row["status"] 17 | row_dict["lineage"] = row["superkingdom"] + ";" + \ 18 | row["phylum"] + ";" + \ 19 | row["order"] + ";" + \ 20 | row["class"] + ";" + \ 21 | row["family"] + ";" + \ 22 | row["genus"] + ";" + \ 23 | row["species"] 24 | rows.append(row_dict) 25 | 26 | with open(output, 'w') as csv_out: 27 | csv_f = csv.DictWriter(csv_out, headers) 28 | csv_f.writeheader() 29 | csv_f.writerows(rows) 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description="convert sourmash lca classify results to metacoder input") 33 | parser.add_argument('-csv', type=str, help="sourmash lca classify results csv file") 34 | parser.add_argument('-out', type=str, help='converted csv file') 35 | args = parser.parse_args() 36 | parse_lca_classify(args.csv, args.out) 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- 
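The converter above collapses the per-rank columns of a sourmash lca classify table into one semicolon-joined lineage string. A minimal, self-contained sketch of that idea follows; the rank order is taken from the comment in the script, while the function name and the example row are fabricated for illustration only and are not part of this repository.

import csv

# Ranks as listed in the comment of scripts/taxonomy_info_covert.py (assumed order).
RANKS = ["superkingdom", "phylum", "order", "class", "family", "genus", "species"]

def flatten_lineage(row, ranks=RANKS, sep=";"):
    """Join rank columns of one csv.DictReader row into a single lineage string."""
    # Missing ranks become empty fields, so the output always has len(ranks) parts.
    return sep.join(row.get(rank, "") for rank in ranks)

if __name__ == "__main__":
    # Hypothetical classification row, only for demonstration.
    example = {
        "ID": "bin.1.fa", "status": "found",
        "superkingdom": "Bacteria", "phylum": "Bacteroidetes",
        "order": "Bacteroidales", "class": "Bacteroidia",
        "family": "Bacteroidaceae", "genus": "Bacteroides",
        "species": "Bacteroides fragilis",
    }
    print(flatten_lineage(example))
    # Bacteria;Bacteroidetes;Bacteroidales;Bacteroidia;Bacteroidaceae;Bacteroides;Bacteroides fragilis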
/metapi/tooler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import concurrent.futures 5 | import pandas as pd 6 | 7 | 8 | def parse(stats_file): 9 | if os.path.exists(stats_file): 10 | try: 11 | df = pd.read_csv(stats_file, sep="\t") 12 | except pd.errors.EmptyDataError: 13 | print("%s is empty, please check" % stats_file) 14 | return None 15 | 16 | if not df.empty: 17 | return df 18 | else: 19 | return None 20 | else: 21 | print("%s does not exist" % stats_file) 22 | return None 23 | 24 | 25 | def merge(input_list, func, workers, **kwargs): 26 | df_list = [] 27 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 28 | for df in executor.map(func, input_list): 29 | if df is not None: 30 | df_list.append(df) 31 | 32 | df_ = pd.concat(df_list) 33 | 34 | if "output" in kwargs: 35 | df_.to_csv(kwargs["output"], sep="\t", index=False) 36 | return df_ 37 | 38 | 39 | def merge2(input_list, func, workers, **kwargs): 40 | df1_list = [] 41 | df2_list = [] 42 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 43 | for df1, df2 in executor.map(func, input_list): 44 | if df1 is not None: 45 | df1_list.append(df1) 46 | if df2 is not None: 47 | df2_list.append(df2) 48 | 49 | df_1 = pd.concat(df1_list) 50 | df_2 = pd.concat(df2_list) 51 | 52 | if "output_1" in kwargs: 53 | df_1.to_csv(kwargs["output_1"], sep="\t", index=False) 54 | if "output_2" in kwargs: 55 | df_2.to_csv(kwargs["output_2"], sep="\t", index=False) 56 | 57 | return df_1, df_2 58 | -------------------------------------------------------------------------------- /metapi/snakefiles/gene_wf.smk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env snakemake 2 | 3 | import sys 4 | import metapi 5 | import pandas as pd 6 | from snakemake.utils import min_version 7 | 8 | min_version("7.0") 9 | 10 | shell.executable("bash") 11 | 12 | METAPI_DIR = metapi.__path__[0] 13 | WRAPPER_DIR = os.path.join(METAPI_DIR, "wrappers") 14 | 15 | 16 | RMHOST_DO = any([ 17 | config["params"]["rmhost"]["bwa"]["do"], 18 | config["params"]["rmhost"]["bowtie2"]["do"]]) 19 | 20 | 21 | TRIMMING_DO = any([ 22 | config["params"]["trimming"]["sickle"]["do"], 23 | config["params"]["trimming"]["fastp"]["do"], 24 | config["params"]["trimming"]["trimmomatic"]["do"]]) 25 | 26 | 27 | ASSEMBLERS = [] 28 | if config["params"]["assembly"]["megahit"]["do"]: 29 | ASSEMBLERS += ["megahit"] 30 | if config["params"]["assembly"]["idba_ud"]["do"]: 31 | ASSEMBLERS += ["idba_ud"] 32 | if config["params"]["assembly"]["metaspades"]["do"]: 33 | ASSEMBLERS += ["metaspades"] 34 | if config["params"]["assembly"]["spades"]["do"]: 35 | ASSEMBLERS += ["spades"] 36 | 37 | 38 | SAMPLES, DATA_TYPE = metapi.parse_samples(config["params"]["samples"]) 39 | 40 | 41 | include: "../rules/raw.smk" 42 | include: "../rules/trimming.smk" 43 | include: "../rules/rmhost.smk" 44 | include: "../rules/qcreport.smk" 45 | include: "../rules/assembly.smk" 46 | include: "../rules/predict_scaftigs.smk" 47 | include: "../rules/dereplicate_cds.smk" 48 | include: "../rules/upload.smk" 49 | 50 | 51 | rule all: 52 | input: 53 | rules.raw_all.input, 54 | rules.trimming_all.input, 55 | rules.rmhost_all.input, 56 | rules.qcreport_all.input, 57 | rules.assembly_all.input, 58 | rules.predict_scaftigs_gene_all.input, 59 | rules.dereplicate_gene_all.input, 60 | rules.upload_all.input 61 | 
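gene_wf.smk above assembles its ASSEMBLERS list from per-assembler boolean "do" flags under config["params"]["assembly"]. The sketch below mirrors that selection logic outside Snakemake so it can be run and tested in isolation; the config dict and the function name are made-up examples, not part of a real metapi configuration.

# Illustrative sketch of the assembler-selection logic used in gene_wf.smk.
# The config dict here is a fabricated example for demonstration only.
config = {
    "params": {
        "assembly": {
            "megahit": {"do": True},
            "idba_ud": {"do": False},
            "metaspades": {"do": True},
            "spades": {"do": False},
        }
    }
}

def select_assemblers(config):
    """Return the assemblers whose 'do' flag is set, in a fixed, stable order."""
    order = ["megahit", "idba_ud", "metaspades", "spades"]
    assembly = config["params"]["assembly"]
    return [name for name in order if assembly.get(name, {}).get("do", False)]

if __name__ == "__main__":
    print(select_assemblers(config))   # ['megahit', 'metaspades']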
-------------------------------------------------------------------------------- /tests/test_spades/Snakefile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env snakemake 2 | 3 | rule download_reads: 4 | output: 5 | r1 = "test/reads/ecoli_1K.1.fq.gz", 6 | r2 = "test/reads/ecoli_1K.2.fq.gz" 7 | threads: 8 | 1 9 | shell: 10 | ''' 11 | curl -o test/reads/ecoli_1K.1.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_1.fq.gz 12 | curl -o test/reads/ecoli_1K.2.fq.gz https://raw.githubusercontent.com/ablab/spades/spades_3.15.5/assembler/test_dataset/ecoli_1K_2.fq.gz 13 | ''' 14 | 15 | 16 | rule prepare_samples_tsv: 17 | input: 18 | r1 = "test/reads/ecoli_1K.1.fq.gz", 19 | r2 = "test/reads/ecoli_1K.2.fq.gz" 20 | output: 21 | "test/samples.tsv" 22 | threads: 23 | 1 24 | shell: 25 | ''' 26 | fd -t f fq.gz $(pwd)/test/reads | \ 27 | sort | uniq | paste - - | \ 28 | awk 'BEGIN{{print "sample_id\tassembly_group\tbinning_group\tfq1\tfq2"}};{{print "ecoli_1K\tecoli_1K\tecoli_1K\t" $0}}' \ 29 | > {output} 30 | ''' 31 | 32 | 33 | rule metapi_init: 34 | input: 35 | "test/samples.tsv" 36 | output: 37 | "test/config.yaml" 38 | conda: 39 | "metapi.yaml" 40 | shell: 41 | ''' 42 | pushd test 43 | metapi init -d . -s $(basename {input}) -b assembly --assembler spades 44 | popd 45 | ''' 46 | 47 | 48 | rule metapi_run_assembly: 49 | input: 50 | "test/config.yaml" 51 | output: 52 | "test/results/04.assembly/report/assembly_stats_spades.tsv" 53 | conda: 54 | "metapi.yaml" 55 | shell: 56 | ''' 57 | pushd test 58 | metapi mag_wf assembly_all --run-local --use-conda 59 | popd 60 | ''' 61 | 62 | 63 | rule all: 64 | input: 65 | "test/results/04.assembly/report/assembly_stats_spades.tsv" 66 | -------------------------------------------------------------------------------- /metapi/rules/qcreport.smk: -------------------------------------------------------------------------------- 1 | STEPS = ["raw"] 2 | if TRIMMING_DO: 3 | STEPS += ["trimming"] 4 | if RMHOST_DO: 5 | STEPS += ["rmhost"] 6 | 7 | SAMPLESDIR = os.path.join(config["output"][STEPS[-1]]) 8 | 9 | if config["params"]["qcreport"]["do"]: 10 | rule qcreport_summary: 11 | input: 12 | expand(os.path.join(config["output"]["qcreport"], "{step}_stats.tsv"), 13 | step=STEPS) 14 | output: 15 | summary_l = os.path.join(config["output"]["qcreport"], "qc_stats_l.tsv"), 16 | summary_w = os.path.join(config["output"]["qcreport"], "qc_stats_w.tsv") 17 | priority: 18 | 30 19 | threads: 20 | config["params"]["qcreport"]["seqkit"]["threads"] 21 | run: 22 | df = metapi.merge(input, metapi.parse, threads) 23 | df = metapi.compute_host_rate(df, STEPS, SAMPLES_ID_LIST, allow_miss_samples=True, output=output.summary_l) 24 | metapi.qc_summary_merge(df, output=output.summary_w) 25 | 26 | 27 | rule qcreport_plot: 28 | input: 29 | rules.qcreport_summary.output 30 | output: 31 | os.path.join(config["output"]["qcreport"], "qc_reads_num_barplot.pdf") 32 | priority: 33 | 30 34 | run: 35 | df = metapi.parse(input[0]) 36 | metapi.qc_bar_plot(df, "seaborn", output=output[0]) 37 | 38 | 39 | rule qcreport_all: 40 | input: 41 | os.path.join(config["output"]["qcreport"], "qc_stats_l.tsv"), 42 | os.path.join(config["output"]["qcreport"], "qc_stats_w.tsv"), 43 | os.path.join(config["output"]["qcreport"], "qc_reads_num_barplot.pdf") 44 | 45 | else: 46 | rule qcreport_summary: 47 | input: 48 | 49 | 50 | rule qcreport_plot: 51 | input: 52 | 53 | 54 | rule qcreport_all: 55 | input: 56 | 57 | 58 | localrules: 59 | 
qcreport_summary, 60 | qcreport_plot, 61 | qcreport_all -------------------------------------------------------------------------------- /metapi/profiles/generic/slurm_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import subprocess as sp 4 | import shlex 5 | import sys 6 | import time 7 | import logging 8 | logger = logging.getLogger("__name__") 9 | 10 | STATUS_ATTEMPTS = 20 11 | 12 | jobid = sys.argv[1] 13 | 14 | for i in range(STATUS_ATTEMPTS): 15 | try: 16 | sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid))) 17 | res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} 18 | break 19 | except sp.CalledProcessError as e: 20 | logger.error("sacct process error") 21 | logger.error(e) 22 | except IndexError as e: 23 | pass 24 | # Try getting job with scontrol instead in case sacct is misconfigured 25 | try: 26 | sctrl_res = sp.check_output(shlex.split("scontrol -o show job {}".format(jobid))) 27 | m = re.search("JobState=(\w+)", sctrl_res.decode()) 28 | res = {jobid: m.group(1)} 29 | break 30 | except sp.CalledProcessError as e: 31 | logger.error("scontrol process error") 32 | logger.error(e) 33 | if i >= STATUS_ATTEMPTS - 1: 34 | print("failed") 35 | exit(0) 36 | else: 37 | time.sleep(1) 38 | 39 | status = res[jobid] 40 | 41 | if (status == "BOOT_FAIL"): 42 | print("failed") 43 | elif (status == "OUT_OF_MEMORY"): 44 | print("failed") 45 | elif (status.startswith("CANCELLED")): 46 | print("failed") 47 | elif (status == "COMPLETED"): 48 | print("success") 49 | elif (status == "DEADLINE"): 50 | print("failed") 51 | elif (status == "FAILED"): 52 | print("failed") 53 | elif (status == "NODE_FAIL"): 54 | print("failed") 55 | elif (status == "PREEMPTED"): 56 | print("failed") 57 | elif (status == "TIMEOUT"): 58 | print("failed") 59 | # Unclear whether SUSPENDED should be treated as running or failed 60 | elif (status == "SUSPENDED"): 61 | print("failed") 62 | else: 63 | print("running") 64 | -------------------------------------------------------------------------------- /scripts/get_prodigal_gbk_result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # email: zhujie@genomics.cn 3 | # license: GPL V3 4 | import re 5 | 6 | gbklist = "./gene.coordinate.gbk.pathlist.new" 7 | out = open("./gene.coordinate.stat.out.new", 'w') 8 | out.write("ID\tpartial=00\tpartial=01\tpartial=10\tpartial=11\ttotal_len\ttotal_num\tavg_length\n") 9 | 10 | with open(gbklist, 'r') as path_handler: 11 | for gbkpath in path_handler: 12 | genenum = {} 13 | gene_total_len = 0 14 | gene_total_num = 0 15 | gene_avg_len = 0 16 | partial = ['partial=00', 'partial=01', 'partial=10', 'partial=11'] 17 | genenum['partial=00'] = 0 18 | genenum['partial=01'] = 0 19 | genenum['partial=10'] = 0 20 | genenum['partial=11'] = 0 21 | 22 | with open(gbkpath.strip(), 'r') as gbk_handler: 23 | first = next(gbk_handler) 24 | id = re.search(r'(.*?)seqhdr="(CL\d+_L\d+_\d+)_scaffold(.*)', first).group(2) 25 | genenum['id'] = id 26 | for line in gbk_handler: 27 | for tag in partial: 28 | if re.search(tag, line): 29 | genenum[tag] += 1 30 | gene_total_num += 1 31 | if re.search("CDS\s+(complement\()?(<)?(\d+)\.\.(>)?(\d+)(\))", line): 32 | len = re.search("CDS\s+(complement\()?(<)?(\d+)\.\.(>)?(\d+)(\))", line) 33 | gene_total_len += int(len.group(5)) - int(len.group(3)) + 1 34 | 35 | gene_avg_len = round(float(gene_total_len) / 
float(gene_total_num), 6) 36 | 37 | out.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%f\n" % ( 38 | genenum['id'], 39 | genenum['partial=00'], 40 | genenum['partial=01'], 41 | genenum['partial=10'], 42 | genenum['partial=11'], 43 | gene_total_len, 44 | gene_total_num, 45 | gene_avg_len)) 46 | 47 | out.close() 48 | -------------------------------------------------------------------------------- /metapi/wrappers/vamb/concatenate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import argparse 6 | import gzip 7 | import vamb 8 | 9 | parser = argparse.ArgumentParser( 10 | description="""Creates the input FASTA file for Vamb. 11 | Input should be one or more FASTA files, each from a sample-specific assembly. 12 | If keepnames is False, resulting FASTA can be binsplit with separator 'C'.""", 13 | formatter_class=argparse.RawDescriptionHelpFormatter, 14 | add_help=False, 15 | ) 16 | 17 | parser.add_argument("outpath", help="Path to output FASTA file") 18 | parser.add_argument("inpaths", help="Paths to input FASTA file(s)", nargs="+") 19 | parser.add_argument( 20 | "-m", 21 | dest="minlength", 22 | metavar="", 23 | type=int, 24 | default=2000, 25 | help="Discard sequences below this length [2000]", 26 | ) 27 | parser.add_argument( 28 | "--keepnames", action="store_true", help="Do not rename sequences [False]" 29 | ) 30 | parser.add_argument("--nozip", action="store_true", help="Do not gzip output [False]") 31 | 32 | if len(sys.argv) == 1 or sys.argv[1] in ("-h", "--help"): 33 | parser.print_help() 34 | sys.exit() 35 | 36 | args = parser.parse_args() 37 | 38 | # Check inputs 39 | for path in args.inpaths: 40 | if not os.path.isfile(path): 41 | raise FileNotFoundError(path) 42 | 43 | if os.path.exists(args.outpath): 44 | raise FileExistsError(args.outpath) 45 | 46 | parent = os.path.dirname(args.outpath) 47 | if parent != "" and not os.path.isdir(parent): 48 | raise NotADirectoryError( 49 | f'Output file cannot be created: Parent directory "{parent}" is not an existing directory' 50 | ) 51 | 52 | # Run the code. 
Compressing DNA is easy, this is not much bigger than level 9, but 53 | # many times faster 54 | filehandle = ( 55 | open(args.outpath, "w") 56 | if args.nozip 57 | else gzip.open(args.outpath, "wt", compresslevel=1) 58 | ) 59 | vamb.vambtools.concatenate_fasta( 60 | filehandle, args.inpaths, minlength=args.minlength, rename=(not args.keepnames) 61 | ) 62 | filehandle.close() 63 | -------------------------------------------------------------------------------- /metapi/rules/simulate.smk: -------------------------------------------------------------------------------- 1 | if config["params"]["simulate"]["do"]: 2 | rule simulate_short_reads: 3 | input: 4 | genomes = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "genome") 5 | output: 6 | r1 = os.path.join(config["output"]["simulate"], 7 | "short_reads/{sample}.simulate.1.fq.gz"), 8 | r2 = os.path.join(config["output"]["simulate"], 9 | "short_reads/{sample}.simulate.2.fq.gz"), 10 | abunf = os.path.join(config["output"]["simulate"], 11 | "abundance/{sample}.simulate.abundance.txt") 12 | log: 13 | os.path.join(config["output"]["simulate"], "logs/{sample}.iss.log") 14 | benchmark: 15 | os.path.join(config["output"]["simulate"], "benchmark/iss/{sample}.iss.benchmark.txt") 16 | conda: 17 | config["envs"]["simulate"] 18 | params: 19 | output_prefix = os.path.join(config["output"]["simulate"], 20 | "short_reads/{sample}"), 21 | model = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "model")[0], 22 | reads_num = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "reads_num")[0], 23 | abundance = lambda wildcards: metapi.get_simulate_info(SAMPLES, wildcards, "abundance") 24 | threads: 25 | config["params"]["simulate"]["threads"] 26 | script: 27 | "../wrappers/simulate_reads.py" 28 | 29 | 30 | rule simulate_all: 31 | input: 32 | expand([ 33 | os.path.join(config["output"]["simulate"], 34 | "short_reads/{sample}.simulate.{read}.fq.gz"), 35 | os.path.join(config["output"]["simulate"], 36 | "abundance/{sample}.simulate.abundance.txt")], 37 | read=["1", "2"], 38 | sample=SAMPLES.index.unique()) 39 | 40 | else: 41 | rule simulate_all: 42 | input: 43 | 44 | 45 | localrules: 46 | simulate_all -------------------------------------------------------------------------------- /metapi/profiles/slurm/slurm-status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import subprocess as sp 4 | import shlex 5 | import sys 6 | import time 7 | import logging 8 | from CookieCutter import CookieCutter 9 | 10 | logger = logging.getLogger("__name__") 11 | 12 | STATUS_ATTEMPTS = 20 13 | 14 | jobid = sys.argv[1] 15 | 16 | cluster = CookieCutter.get_cluster_option() 17 | 18 | for i in range(STATUS_ATTEMPTS): 19 | try: 20 | sacct_res = sp.check_output(shlex.split(f"sacct {cluster} -P -b -j {jobid} -n")) 21 | res = { 22 | x.split("|")[0]: x.split("|")[1] 23 | for x in sacct_res.decode().strip().split("\n") 24 | } 25 | break 26 | except sp.CalledProcessError as e: 27 | logger.error("sacct process error") 28 | logger.error(e) 29 | except IndexError as e: 30 | logger.error(e) 31 | pass 32 | # Try getting job with scontrol instead in case sacct is misconfigured 33 | try: 34 | sctrl_res = sp.check_output( 35 | shlex.split(f"scontrol {cluster} -o show job {jobid}") 36 | ) 37 | m = re.search(r"JobState=(\w+)", sctrl_res.decode()) 38 | res = {jobid: m.group(1)} 39 | break 40 | except sp.CalledProcessError as e: 41 | logger.error("scontrol process error") 42 | 
logger.error(e) 43 | if i >= STATUS_ATTEMPTS - 1: 44 | print("failed") 45 | exit(0) 46 | else: 47 | time.sleep(1) 48 | 49 | status = res[jobid] 50 | 51 | if status == "BOOT_FAIL": 52 | print("failed") 53 | elif status == "OUT_OF_MEMORY": 54 | print("failed") 55 | elif status.startswith("CANCELLED"): 56 | print("failed") 57 | elif status == "COMPLETED": 58 | print("success") 59 | elif status == "DEADLINE": 60 | print("failed") 61 | elif status == "FAILED": 62 | print("failed") 63 | elif status == "NODE_FAIL": 64 | print("failed") 65 | elif status == "PREEMPTED": 66 | print("failed") 67 | elif status == "TIMEOUT": 68 | print("failed") 69 | elif status == "SUSPENDED": 70 | print("running") 71 | else: 72 | print("running") 73 | -------------------------------------------------------------------------------- /scripts/asm_status_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import shutil 5 | import argparse 6 | 7 | STATSWRAPPER_TEMPLATE = '''{stats} \ 8 | in={input_list} \ 9 | minscaf={minscaf} > {output}''' 10 | 11 | 12 | class statswrapper: 13 | def __init__(self, input_list, minscaf, output): 14 | self.stats = shutil.which("statswrapper.sh") 15 | self.input_list = ",".join(input_list) 16 | self.minscaf = minscaf 17 | self.output = output 18 | 19 | 20 | def gen_shell(ilist, mlen, split, prefix, output): 21 | files = open(ilist, 'r').readlines() 22 | total = len(files) 23 | assert total >= split, "can't split" 24 | step = total // split 25 | m = total % split 26 | count = 0 27 | sub_files = [] 28 | cmds = [] 29 | for i in range(0, total, step): 30 | count += 1 31 | if count <= split: 32 | sub_files = [f.strip() for f in files[i:(i + step)]] 33 | output_ = "%s.%d.tsv" % (prefix, count) 34 | cmd = STATSWRAPPER_TEMPLATE.format_map( 35 | vars(statswrapper(sub_files, mlen, output_))) 36 | cmds.append(cmd) 37 | 38 | if (count > split) and (m > 0): 39 | sub_files += [f.strip() for f in files[(total - m):total]] 40 | output_ = "%s.%d.tsv" % (prefix, split) 41 | cmd = STATSWRAPPER_TEMPLATE.format_map( 42 | vars(statswrapper(sub_files, mlen, output_))) 43 | cmds[split - 1] = cmd 44 | 45 | with open(output, 'w') as oh: 46 | for i in cmds: 47 | oh.write(i + "\n") 48 | 49 | 50 | def main(): 51 | parser = argparse.ArgumentParser("assembler status wrapper") 52 | parser.add_argument('-l', '--list', type=str, help='input assembly file list') 53 | parser.add_argument('-m', '--min_len', type=int, default=0, help='minimal contig/scaffold length') 54 | parser.add_argument('-s', '--split', type=int, default=1, help='split input file') 55 | parser.add_argument('-p', '--prefix', type=str, default="asm_stats", help="assembly status output prefix") 56 | parser.add_argument('-o', '--output', type=str, default=sys.stdout, help='write cmd to file, default: stdout') 57 | args = parser.parse_args() 58 | 59 | gen_shell(args.list, args.min_len, args.split, args.prefix, args.output) 60 | 61 | 62 | if __name__ == '__main__': 63 | main() 64 | -------------------------------------------------------------------------------- /scripts/cut_up_fasta_concoct.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This scipt comes from CONCOCT 5 | Let it support Python 3 6 | Cut up fasta file in non-overlapping or overlapping parts of equal length. 
7 | """ 8 | import argparse 9 | from Bio import SeqIO 10 | import gzip 11 | 12 | def cut_up_fasta(fastfiles, chunk_size, overlap, merge_last): 13 | for ff in fastfiles: 14 | if ff.strip().endswith(".gz"): 15 | fa_handle = gzip.open(ff.strip(), 'rt') 16 | else: 17 | fa_handle = open(ff.strip(), 'r') 18 | for record in SeqIO.parse(fa_handle, "fasta"): 19 | if (not merge_last and len(record.seq) > chunk_size) or (merge_last and len(record.seq) >= 2 * chunk_size): 20 | i = 0 21 | for split_seq in chunks(record.seq, chunk_size, overlap, merge_last): 22 | print(">%s.%i\n%s" % (record.id, i, split_seq)) 23 | i = i + 1 24 | else: 25 | print(">%s\n%s" % (record.id, record.seq)) 26 | 27 | 28 | def chunks(l, n, o, merge_last): 29 | """ Yield successive n-sized chunks from l with given overlap o between the 30 | chunks. 31 | """ 32 | assert n > o 33 | 34 | if not merge_last: 35 | for i in range(0, len(l), n - o): 36 | yield l[i:i + n] 37 | else: 38 | for i in range(0, len(l) - n + 1, n - o): 39 | yield l[i:i + n] if i + n + n - o <= len(l) else l[i:] 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser(description=__doc__, 44 | formatter_class=argparse.RawDescriptionHelpFormatter) 45 | parser.add_argument( 46 | "contigs", nargs="+", help="Fasta files with contigs\n") 47 | parser.add_argument("-c", "--chunk_size", default=1999, type=int, help="Chunk size\n") 48 | parser.add_argument("-o", "--overlap_size", default=1900, type=int, help="Overlap size\n") 49 | parser.add_argument("-m", "--merge_last", default=False, action="store_true", help="Concatenate final part to last contig\n") 50 | args = parser.parse_args() 51 | cut_up_fasta(args.contigs, args.chunk_size, args.overlap_size, args.merge_last) 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | from setuptools import setup 5 | 6 | exec(open("metapi/__about__.py").read()) 7 | 8 | if sys.argv[-1] == "publish": 9 | os.system("python setup.py sdist upload") 10 | sys.exit() 11 | 12 | with open("README.md") as f: 13 | long_description = f.read() 14 | 15 | packages = ["metapi"] 16 | package_data = { 17 | "metapi": [ 18 | "metapi/config/*.yaml", 19 | "metapi/envs/*.yaml", 20 | "metapi/snakefiles/*.smk", 21 | "metapi/rules/*.smk", 22 | "metapi/wrappers/*.py", 23 | "metapi/data/*", 24 | "metapi/*.py", 25 | ] 26 | } 27 | data_files = [(".", ["LICENSE", "README.md"])] 28 | 29 | entry_points = {"console_scripts": ["metapi=metapi.corer:main"]} 30 | 31 | requires = [ 32 | req.strip() 33 | for req in open("requirements.txt", "r").readlines() 34 | if not req.startswith("#") 35 | ] 36 | 37 | classifiers = [ 38 | "Development Status :: 3 - Alpha", 39 | "Environment :: Console", 40 | "Intended Audience :: Developers", 41 | "Intended Audience :: Science/Research", 42 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 43 | "Natural Language :: English", 44 | "Operating System :: OS Independent", 45 | "Programming Language :: Python :: 3.7", 46 | "Programming Language :: Python :: 3.8", 47 | "Programming Language :: Python :: 3.9", 48 | "Programming Language :: Python :: 3.10", 49 | "Topic :: Scientific/Engineering :: Bio-Informatics", 50 | ] 51 | 52 | setup( 53 | name="metapi", 54 | version=__version__, 55 | author=__author__, 56 | author_email="alienchuj@gmail.com", 57 | url="https://github.com/ohmeta/metapi", 58 | description="a pipeline to 
construct a genome catalogue from metagenomics data", 59 | long_description_content_type="text/markdown", 60 | long_description=long_description, 61 | entry_points=entry_points, 62 | packages=packages, 63 | package_data=package_data, 64 | data_files=data_files, 65 | include_package_data=True, 66 | install_requires=requires, 67 | license="GPLv3+", 68 | classifiers=classifiers, 69 | ) 70 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/lsf_config.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from collections import OrderedDict 3 | from itertools import chain 4 | from typing import TextIO, Union, List, Any, Dict 5 | 6 | import yaml 7 | 8 | 9 | class Config: 10 | def __init__(self, data: Union[dict, None] = None): 11 | self._data = dict() 12 | if data is not None: 13 | for key, value in data.items(): 14 | self._data[key] = self.concatenate_params(value) 15 | 16 | def __bool__(self) -> bool: 17 | return bool(self._data) 18 | 19 | def __contains__(self, item) -> bool: 20 | return item in self._data 21 | 22 | def get(self, key: str, default: Any = None) -> Any: 23 | return self._data.get(key, default) 24 | 25 | @staticmethod 26 | def args_to_dict(args: str) -> Dict[str, str]: 27 | """Converts a string into a dictionary where key/value pairs are consecutive 28 | elements of the string. 29 | Eg '-J "2" -q 3' --> {'-J': '2', '-q': '3'} 30 | """ 31 | args_iter = shlex.shlex(args, posix=True) 32 | args_iter.whitespace_split = True 33 | return OrderedDict(zip(args_iter, args_iter)) 34 | 35 | @staticmethod 36 | def concatenate_params(params: Union[List[str], str]) -> str: 37 | if isinstance(params, str): 38 | return params 39 | return " ".join(filter(None, params)) 40 | 41 | def default_params(self) -> str: 42 | return self.get("__default__", "") 43 | 44 | def params_for_rule(self, rulename: str) -> str: 45 | """Loads default + rule-specific arguments. 46 | Arguments specified for a rule override default-specified arguments. 47 | Shlex-joining is required to properly pass quoted escapes in yaml 48 | to the shell. 49 | """ 50 | default_params = self.args_to_dict(self.default_params()) 51 | rule_params = self.args_to_dict(self.get(rulename, "")) 52 | default_params.update(rule_params) 53 | return " ".join(map(shlex.quote, chain.from_iterable(default_params.items()))) 54 | 55 | @staticmethod 56 | def from_stream(stream: TextIO) -> "Config": 57 | data = yaml.safe_load(stream) 58 | return Config(data) 59 | -------------------------------------------------------------------------------- /metapi/profiles/slurm/slurm-submit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Snakemake SLURM submit script. 
4 | """ 5 | from snakemake.utils import read_job_properties 6 | 7 | import slurm_utils 8 | from CookieCutter import CookieCutter 9 | 10 | # cookiecutter arguments 11 | SBATCH_DEFAULTS = CookieCutter.SBATCH_DEFAULTS 12 | CLUSTER = CookieCutter.get_cluster_option() 13 | CLUSTER_CONFIG = CookieCutter.CLUSTER_CONFIG 14 | ADVANCED_ARGUMENT_CONVERSION = CookieCutter.get_advanced_argument_conversion() 15 | 16 | RESOURCE_MAPPING = { 17 | "time": ("time", "runtime", "walltime"), 18 | "mem": ("mem", "mem_mb", "ram", "memory"), 19 | "mem-per-cpu": ("mem-per-cpu", "mem_per_cpu", "mem_per_thread"), 20 | "nodes": ("nodes", "nnodes"), 21 | } 22 | 23 | # parse job 24 | jobscript = slurm_utils.parse_jobscript() 25 | job_properties = read_job_properties(jobscript) 26 | 27 | sbatch_options = {} 28 | cluster_config = slurm_utils.load_cluster_config(CLUSTER_CONFIG) 29 | 30 | # 1) sbatch default arguments and cluster 31 | sbatch_options.update(slurm_utils.parse_sbatch_defaults(SBATCH_DEFAULTS)) 32 | sbatch_options.update(slurm_utils.parse_sbatch_defaults(CLUSTER)) 33 | 34 | # 2) cluster_config defaults 35 | sbatch_options.update(cluster_config["__default__"]) 36 | 37 | # 3) Convert resources (no unit conversion!) and threads 38 | sbatch_options.update( 39 | slurm_utils.convert_job_properties(job_properties, RESOURCE_MAPPING) 40 | ) 41 | 42 | # 4) cluster_config for particular rule 43 | sbatch_options.update(cluster_config.get(job_properties.get("rule"), {})) 44 | 45 | # 5) cluster_config options 46 | sbatch_options.update(job_properties.get("cluster", {})) 47 | 48 | # 6) Advanced conversion of parameters 49 | if ADVANCED_ARGUMENT_CONVERSION: 50 | sbatch_options = slurm_utils.advanced_argument_conversion(sbatch_options) 51 | 52 | # 7) Format pattern in snakemake style 53 | sbatch_options = slurm_utils.format_values(sbatch_options, job_properties) 54 | 55 | # ensure sbatch output dirs exist 56 | for o in ("output", "error"): 57 | slurm_utils.ensure_dirs_exist(sbatch_options[o]) if o in sbatch_options else None 58 | 59 | # submit job and echo id back to Snakemake (must be the only stdout) 60 | print(slurm_utils.submit_job(jobscript, **sbatch_options)) 61 | -------------------------------------------------------------------------------- /scripts/aggregate_genomecov.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | 4 | def aggregate(cov): 5 | ''' 6 | bedtools genomecov -ibam sample.mapped.sorted.bam -g contigs_c10K.len > sample_cov.txt 7 | produce a histogram of coverage of the exons throughout the genome 8 | 9 | output format explain: 10 | 1. chromosome(or entire genome) 11 | 2. depth of coverage from features in input file 12 | 3. number of bases on chromosome(or genome) with depth equal to column 2 13 | 4. size of chromosome(or entire genome) in base pairs 14 | 5. 
fraction of baes on chromosome(or entire genome) with depth equal to column 2 15 | so column_5 = column_3 / column_4 16 | all sum(column_3{column_1}) = column_4{column_1} 17 | all sum(column_5{column_1}) = 1 18 | 19 | k119_2 1 30 399 0.075188 20 | k119_2 2 27 399 0.0676692 21 | k119_2 3 151 399 0.378446 22 | k119_2 4 79 399 0.197995 23 | k119_2 5 54 399 0.135338 24 | k119_2 6 39 399 0.0977444 25 | k119_2 7 19 399 0.047619 26 | k119_3 0 387 473 0.818182 27 | k119_3 1 86 473 0.181818 28 | k119_4 4 1 340 0.00294118 29 | ''' 30 | with open(cov, 'r') as in_handle: 31 | cov_num = {} 32 | chr_len = {} 33 | chr_list = [] 34 | for line in in_handle: 35 | chr, depth, num, len, frac = line.strip().split('\t') 36 | if chr not in chr_len: 37 | chr_len[chr] = int(len) 38 | cov_num[chr] = int(depth) * int(num) 39 | chr_list.append(chr) 40 | else: 41 | cov_num[chr] += int(depth) * int(num) 42 | for chr_name in chr_list: 43 | print("%s,%f" % (chr_name, cov_num[chr_name] / chr_len[chr_name])) 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser(description='aggregate the output of bedtools') 47 | parser.add_argument('-cov', type=str, help='input coverage file') 48 | args = parser.parse_args() 49 | 50 | aggregate(args.cov) 51 | 52 | if __name__ == '__main__': 53 | main() 54 | 55 | # awk -F'\t' '{l[$1]=l[$1]+($2*$3);r[$1]=$4} END {for (i in l){print i","(l[i]/r[i])}}' -------------------------------------------------------------------------------- /metapi/profiles/sge/sge-status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import subprocess as sp 4 | import shlex 5 | import sys 6 | import time 7 | import logging 8 | 9 | logger = logging.getLogger("__name__") 10 | logger.setLevel(40) 11 | 12 | STATUS_ATTEMPTS = 20 13 | 14 | jobid = int(sys.argv[1]) 15 | job_status = "running" 16 | 17 | # WARNING this currently has no support for task array jobs 18 | 19 | for i in range(STATUS_ATTEMPTS): 20 | # first try qstat to see if job is running 21 | # we can use `qstat -s pr -u "*"` to check for all running and pending jobs 22 | try: 23 | qstat_res = sp.check_output(shlex.split(f"qstat -s pr")).decode().strip() 24 | 25 | # skip the header using [2:] 26 | res = { 27 | int(x.split()[0]) : x.split()[4] for x in qstat_res.splitlines()[2:] 28 | } 29 | 30 | # job is in an unspecified error state 31 | if "E" in res[jobid]: 32 | job_status = "failed" 33 | break 34 | 35 | job_status = "running" 36 | break 37 | 38 | except sp.CalledProcessError as e: 39 | logger.error("qstat process error") 40 | logger.error(e) 41 | except KeyError as e: 42 | # if the job has finished it won't appear in qstat and we should check qacct 43 | # this will also provide the exit status (0 on success, 128 + exit_status on fail) 44 | # Try getting job with scontrol instead in case sacct is misconfigured 45 | try: 46 | qacct_res = sp.check_output(shlex.split(f"qacct -j {jobid}")) 47 | 48 | exit_code = int(re.search("exit_status ([0-9]+)", qacct_res.decode()).group(1)) 49 | 50 | if exit_code == 0: 51 | job_status = "success" 52 | break 53 | 54 | if exit_code != 0: 55 | job_status = "failed" 56 | break 57 | 58 | except sp.CalledProcessError as e: 59 | logger.warning("qacct process error") 60 | logger.warning(e) 61 | if i >= STATUS_ATTEMPTS - 1: 62 | job_status = "failed" 63 | break 64 | else: 65 | # qacct can be quite slow to update on large servers 66 | time.sleep(5) 67 | pass 68 | 69 | print(job_status) 70 | 
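Like the SLURM and generic profiles, sge-status.py implements Snakemake's cluster-status contract: the script is called with a job id and must print exactly one of running, success, or failed on stdout. The sketch below isolates the qacct exit-status mapping so the decision logic can be exercised without a scheduler; the function name and the report text are fabricated for illustration and are not part of this repository.

import re

def status_from_qacct(qacct_output: str) -> str:
    """Map a qacct report to the status words Snakemake expects on stdout."""
    # qacct reports 0 on success and 128 + exit_status on failure; if the field
    # is absent the job is assumed to be still running (or qacct not yet updated).
    match = re.search(r"exit_status\s+([0-9]+)", qacct_output)
    if match is None:
        return "running"
    return "success" if int(match.group(1)) == 0 else "failed"

if __name__ == "__main__":
    # Hypothetical qacct output, only for demonstration.
    fake_report = "jobname      test_job\nexit_status  0\n"
    print(status_from_qacct(fake_report))   # success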
-------------------------------------------------------------------------------- /scripts/contigs_from_sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | from Bio.SeqIO.FastaIO import SimpleFastaParser 5 | 6 | def contigs_from_sample(contigs_len, sc_out): 7 | info = {} 8 | #count = 0 9 | with open(contigs_len, 'r') as handle: 10 | for line in handle: 11 | key = '_'.join(line.split("_")[:3]) 12 | len = int(line.split("\t")[-1]) 13 | if key not in info: 14 | info[key] = {} 15 | info[key]["num"] = 1 16 | info[key]["len"] = len 17 | else: 18 | info[key]["num"] += 1 19 | info[key]["len"] += len 20 | #count += 1 21 | #if count == 10000: 22 | # break 23 | with open(sc_out, 'w') as out: 24 | out.write("sample_name\ttotal_contigs_num\ttotal_contigs_len\n") 25 | for key in info: 26 | out.write(key + "\t" + str(info[key]["num"]) + "\t" + 27 | str(info[key]["len"]) + "\n") 28 | 29 | def contigs_from_sample_list(contigs_list, sc_out): 30 | info = {} 31 | with open(contigs_list, 'r') as contigs_handle: 32 | for contigs_path in contigs_handle: 33 | key = os.path.basename(contigs_path.strip()).split(".")[0] 34 | if key not in info: 35 | info[key] = {} 36 | info[key]["num"] = 0 37 | info[key]["num_gt2kb"] = 0 38 | info[key]["len"] = 0 39 | info[key]["len_gt2kb"] = 0 40 | with open(contigs_path.strip(), 'r') as contigs_fa: 41 | for title, seq in SimpleFastaParser(contigs_fa): 42 | info[key]["num"] += 1 43 | info[key]["len"] += len(seq) 44 | if len(seq) >= 2000: 45 | info[key]["num_gt2kb"] += 1 46 | info[key]["len_gt2kb"] += len(seq) 47 | with open(sc_out, 'w') as out: 48 | out.write("sample_name\ttotal_contigs_num\ttotal_contigs_num_gt2kb\ttotal_contigs_len\ttotal_contigs_len_gt2kb\n") 49 | for key in info: 50 | out.write("%s\t%d\t%d\t%d\t%d\n" % (key, info[key]["num"], info[key]["num_gt2kb"], info[key]["len"], info[key]["len_gt2kb"])) 51 | 52 | def main(): 53 | #contigs_from_sample(sys.argv[1], sys.argv[2]) 54 | contigs_from_sample_list(sys.argv[1], sys.argv[2]) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/OSLayer.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import time 4 | import uuid 5 | from pathlib import Path 6 | from typing import Tuple, List 7 | 8 | if not __name__.startswith("tests.src."): 9 | sys.path.append(str(Path(__file__).parent.absolute())) 10 | from CookieCutter import CookieCutter 11 | else: 12 | from .CookieCutter import CookieCutter 13 | 14 | stdout = str 15 | stderr = str 16 | 17 | 18 | class TailError(Exception): 19 | pass 20 | 21 | 22 | class OSLayer: 23 | """ 24 | This class provides an abstract layer to communicating with the OS. 25 | Its main purpose is to enable OS operations mocking, so we don't actually need to 26 | make file operations or create processes. 
27 | """ 28 | 29 | @staticmethod 30 | def mkdir(directory: Path): 31 | directory.mkdir(parents=True, exist_ok=True) 32 | 33 | @staticmethod 34 | def remove_file(file: Path): 35 | if file.is_file(): 36 | file.unlink() 37 | 38 | @staticmethod 39 | def run_process(cmd: str) -> Tuple[stdout, stderr]: 40 | completed_process = subprocess.run( 41 | cmd, check=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE 42 | ) 43 | return ( 44 | completed_process.stdout.decode().strip(), 45 | completed_process.stderr.decode().strip(), 46 | ) 47 | 48 | @staticmethod 49 | def print(string: str): 50 | print(string) 51 | 52 | @staticmethod 53 | def get_uuid4_string() -> str: 54 | return str(uuid.uuid4()) 55 | 56 | @staticmethod 57 | def tail(path: str, num_lines: int = 10) -> List[bytes]: 58 | if not Path(path).exists(): 59 | # allow for filesystem latency 60 | time.sleep(CookieCutter.get_latency_wait()) 61 | if not Path(path).exists(): 62 | raise FileNotFoundError("{} does not exist.".format(path)) 63 | 64 | process = subprocess.Popen( 65 | ["tail", "-n", str(num_lines), path], 66 | stdout=subprocess.PIPE, 67 | stderr=subprocess.PIPE, 68 | ) 69 | exit_code = process.wait() 70 | if exit_code != 0: 71 | raise TailError( 72 | "Failed to execute the tail command on the file {} due to the " 73 | "following error:\n{}".format(path, process.stderr.read().decode()) 74 | ) 75 | return process.stdout.readlines() 76 | -------------------------------------------------------------------------------- /metapi/wrappers/simulate_reads.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import gzip 6 | import subprocess 7 | from Bio import SeqIO 8 | 9 | 10 | def simulate_short_reads( 11 | genomes, output_prefix, r1, r2, abunf, model, reads_num, abundance, threads, logf, 12 | ): 13 | if len(abundance) != 0: 14 | with open(abunf, "w") as outh: 15 | for (g, a) in zip(genomes, abundance): 16 | inh = gzip.open(g, "rt") if g.endswith(".gz") else open(g, "r") 17 | genome = [] 18 | total_len = 0 19 | for record in SeqIO.parse(inh, "fasta"): 20 | total_len += len(record.seq) 21 | genome.append((record.id, len(record.seq))) 22 | for s in genome: 23 | outh.write("%s\t%f\n" % 24 | (s[0], float(a) * s[1] / total_len)) 25 | inh.close() 26 | 27 | args = ( 28 | ["iss", "generate", "--cpus", str(threads), "--genomes"] 29 | + genomes 30 | + ["--n_reads", reads_num, "--model", model, "--output", output_prefix] 31 | ) 32 | 33 | if len(abundance) != 0: 34 | args += ["--abundance_file", abunf] 35 | print(" ".join(args)) 36 | env = os.environ.copy() 37 | proc = subprocess.Popen( 38 | args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, encoding="utf-8", 39 | ) 40 | output, error = proc.communicate() 41 | 42 | with open(logf, "w") as logh: 43 | logh.write(error) 44 | 45 | if proc.returncode == 0: 46 | if len(abundance) == 0: 47 | default_abunf = output_prefix + "_abundance.txt" 48 | if os.path.exists(default_abunf): 49 | os.rename(default_abunf, abunf) 50 | subprocess.run(f"pigz -p {threads} {output_prefix}_R1.fastq", shell=True) 51 | subprocess.run(f"pigz -p {threads} {output_prefix}_R2.fastq", shell=True) 52 | os.rename(f"{output_prefix}_R1.fastq.gz", r1) 53 | os.rename(f"{output_prefix}_R2.fastq.gz", r2) 54 | else: 55 | sys.exit(1) 56 | 57 | 58 | simulate_short_reads( 59 | snakemake.input["genomes"], 60 | snakemake.params["output_prefix"], 61 | snakemake.output["r1"], 62 | snakemake.output["r2"], 63 | snakemake.output["abunf"], 64 
| snakemake.params["model"], 65 | snakemake.params["reads_num"], 66 | snakemake.params["abundance"], 67 | snakemake.threads, 68 | str(snakemake.log)) 69 | 70 | 71 | -------------------------------------------------------------------------------- /scripts/fastq_contig_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | import csv 5 | 6 | def gen_size_tsv(fqlist, ctglist, tsvout): 7 | '''gen data size tsv out''' 8 | fq_size = {} 9 | ctg_size = {} 10 | file_size = {} 11 | file_size["header"] = ["fq_1", "fq_2", "fq_s", "contig", "sample_name"] 12 | file_size["body"] = [] 13 | 14 | with open(fqlist, 'r') as fq_handle, open(ctglist, 'r') as ctg_handle: 15 | for (fq_line, ctg_line) in zip(fq_handle, ctg_handle): 16 | (reads_a, reads_b, reads_s) = fq_line.strip().split() 17 | fq_name = os.path.basename(reads_a).split('.')[0] 18 | ctg_name = os.path.basename(ctg_line).split('.')[0] 19 | if fq_name not in fq_size: 20 | fq_size[fq_name] = {} 21 | fq_size[fq_name]["fq_1"] = os.path.getsize(reads_a) 22 | fq_size[fq_name]["fq_2"] = os.path.getsize(reads_b) 23 | fq_size[fq_name]["fq_s"] = os.path.getsize(reads_s) 24 | if ctg_name not in ctg_size: 25 | ctg_size[ctg_name] = {} 26 | ctg_size[ctg_name] = os.path.getsize(ctg_line.strip()) 27 | 28 | assert sorted(fq_size.keys()) == sorted(ctg_size.keys()) 29 | 30 | for key in ctg_size: 31 | file_size_ = {} 32 | file_size_["sample_name"] = key 33 | file_size_["fq_1"] = fq_size[key]["fq_1"] 34 | file_size_["fq_2"] = fq_size[key]["fq_2"] 35 | file_size_["fq_s"] = fq_size[key]["fq_s"] 36 | file_size_["contig"] = ctg_size[key] 37 | file_size["body"].append(file_size_) 38 | 39 | with open(tsvout, 'w') as out_handle: 40 | f_tsv = csv.DictWriter(out_handle, file_size["header"], delimiter='\t') 41 | f_tsv.writeheader() 42 | f_tsv.writerows(file_size["body"]) 43 | 44 | 45 | def main(): 46 | '''main function''' 47 | parser = argparse.ArgumentParser( 48 | description='''research relationships between fastq size and contigs size: 49 | Usage: python fastq_contig_size_relationship.py --fqlist ./212S_rmhost_fqgz.pathlist.paired --ctglist ./212S_assembly_contigs.pathlist --tsvout fq_contigs_size.ts 50 | ''') 51 | parser.add_argument('--fqlist', type=str, 52 | help='rmhost fastq file path list') 53 | parser.add_argument('--ctglist', type=str, 54 | help='contigs file path list') 55 | parser.add_argument('--tsvout', type=str, 56 | help='tsv out put') 57 | args = parser.parse_args() 58 | gen_size_tsv(args.fqlist, args.ctglist, args.tsvout) 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /metapi/profiles/generic/scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import sys, os 5 | from subprocess import Popen, PIPE 6 | import yaml 7 | 8 | 9 | def eprint(*args, **kwargs): 10 | print(*args, file=sys.stderr, **kwargs) 11 | 12 | 13 | # let snakemake read job_properties 14 | from snakemake.utils import read_job_properties 15 | 16 | 17 | 18 | jobscript = sys.argv[1] 19 | job_properties = read_job_properties(jobscript) 20 | 21 | #default paramters defined in cluster_spec (accessed via snakemake read_job_properties) 22 | cluster_param= job_properties["cluster"] 23 | 24 | if job_properties["type"]=='single': 25 | cluster_param['name'] = job_properties['rule'] 26 | elif job_properties["type"]=='group': 27 | 
cluster_param['name'] = job_properties['groupid'] 28 | else: 29 | raise NotImplementedError(f"Don't know what to do with job_properties['type']=={job_properties['type']}") 30 | 31 | 32 | # don't overwrite default parameters if defined in rule (or config file) 33 | if ('threads' in job_properties) and ('threads' not in cluster_param): 34 | cluster_param["threads"] = job_properties["threads"] 35 | for res in ['time','mem']: 36 | if (res in job_properties["resources"]) and (res not in cluster_param): 37 | cluster_param[res] = job_properties["resources"][res] 38 | 39 | # time in hours 40 | if "time" in cluster_param: 41 | cluster_param["time"]=int(cluster_param["time"]*60) 42 | 43 | 44 | # check which system you are on and load command command_options 45 | key_mapping_file=os.path.join(os.path.dirname(__file__),"key_mapping.yaml") 46 | command_options=yaml.load(open(key_mapping_file), 47 | Loader=yaml.BaseLoader) 48 | system= command_options['system'] 49 | command= command_options[system]['command'] 50 | 51 | key_mapping= command_options[system]['key_mapping'] 52 | 53 | # construct command: 54 | for key in key_mapping: 55 | if key in cluster_param: 56 | command+=" " 57 | command+=key_mapping[key].format(cluster_param[key]) 58 | 59 | command+=' {}'.format(jobscript) 60 | 61 | eprint("submit command: "+command) 62 | 63 | p = Popen(command.split(' '), stdout=PIPE, stderr=PIPE) 64 | output, error = p.communicate() 65 | if p.returncode != 0: 66 | raise Exception("Job can't be submitted\n"+output.decode("utf-8")+error.decode("utf-8")) 67 | else: 68 | res= output.decode("utf-8") 69 | 70 | if system=='lsf': 71 | import re 72 | match = re.search(r"Job <(\d+)> is submitted", res) 73 | jobid = match.group(1) 74 | 75 | elif system=='pbs': 76 | jobid= res.strip().split('.')[0] 77 | 78 | else: 79 | jobid= int(res.strip().split()[-1]) 80 | 81 | print(jobid) 82 | -------------------------------------------------------------------------------- /scripts/split_fx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # please see http://biopython.org/wiki/Split_large_file 3 | import argparse 4 | import os 5 | import errno 6 | 7 | from Bio import SeqIO 8 | 9 | 10 | def batch_iterator(iterator, batch_size): 11 | """Returns lists of length batch_size. 12 | 13 | This can be used on any iterator, for example to batch up 14 | SeqRecord objects from Bio.SeqIO.parse(...), or to batch 15 | Alignment objects from Bio.AlignIO.parse(...), or simply 16 | lines from a file handle. 17 | 18 | This is a generator function, and it returns lists of the 19 | entries from the supplied iterator. Each list will have 20 | batch_size entries, although the final list may be shorter. 
21 | """ 22 | 23 | entry = True 24 | while entry: 25 | batch = [] 26 | while len(batch) < batch_size: 27 | try: 28 | # entry = iterator.next() 29 | entry = next(iterator) 30 | except StopIteration: 31 | entry = None 32 | if entry is None: 33 | break 34 | batch.append(entry) 35 | if batch: 36 | yield batch 37 | 38 | 39 | # TODO 40 | # def split_fastq() 41 | # def split_alignment() 42 | 43 | 44 | def split_fasta(fa_file, batch_size, outdir, onedir): 45 | record_iter = SeqIO.parse(open(fa_file, 'r'), "fasta") 46 | i = 0 47 | for i, batch in enumerate(batch_iterator(record_iter, batch_size), start = 1): 48 | if onedir: 49 | splitfa = os.path.join(outdir, "split_%i.fa" % (i)) 50 | else: 51 | splitdir = os.path.join(outdir, "split_" + str(i)) 52 | try: 53 | os.makedirs(splitdir) 54 | except OSError as e: 55 | if e.errno != errno.EEXIST: 56 | raise 57 | splitfa = os.path.join(splitdir, "split_%i.fa" % (i)) 58 | 59 | with open(splitfa, 'w') as out_h: 60 | count = SeqIO.write(batch, out_h, "fasta") 61 | print("wrote %i records to %s" % (count, splitfa)) 62 | return i 63 | 64 | 65 | def main(): 66 | """split large fasta/fastq file by seq size""" 67 | parser = argparse.ArgumentParser(description='split large fasta/fastq file by seq size') 68 | parser.add_argument('-f', type=str, help='input file, a large fasta or fastq file') 69 | parser.add_argument('-n', type=int, help='each splited file base size', default=1000) 70 | parser.add_argument('-outdir', type=str, help='a directory store splited file') 71 | 72 | args = parser.parse_args() 73 | split_fasta(args.f, args.n, args.outdir, False) 74 | 75 | if __name__ == '__main__': 76 | main() -------------------------------------------------------------------------------- /scripts/megahit_hadoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: liuxing 3 | # Email: liuxing2@genomics.cn 4 | 5 | if [[ $# -ne 8 ]];then 6 | echo 7 | echo "usage: $0 -l FastaqFileList -o OutputDirPath -d HdfsOutputPath -n NumberOfTasks 8 | -l fastaq file list, please make a list including all fastq file, path one sample per line 9 | and seperate the read1 read2 and singleRead with space or table e.g: read1.fq read2.fq singleRead.fq 10 | -o output directory path, the directory that you would write the run script and assembly result 11 | -d HDFS output path, e.g: /user/liuxing2/megahitout 12 | -n the number of tasks, equal to the number of the samples " 13 | echo 14 | else 15 | while [[ -n "$1" ]] 16 | do 17 | case "$1" in 18 | -l) fqfilelist="$2" 19 | shift ;; 20 | -o) outpath="$2" 21 | shift ;; 22 | -d) dfsoutpath="$2" 23 | shift ;; 24 | -n) maps="$2" 25 | shift ;; 26 | esac 27 | shift 28 | done 29 | if [[ ! 
-d $outpath ]];then 30 | mkdir $outpath 31 | fi 32 | 33 | echo "while read LINE 34 | do 35 | if [[ -n \$LINE ]];then 36 | echo \$LINE; 37 | read1=\`echo \$LINE| awk '{print \$2}'\` 38 | read2=\`echo \$LINE| awk '{print \$3}'\` 39 | reads=\`echo \$LINE| awk '{print \$4}'\` 40 | base=\`basename \$read1\` 41 | prefix=\${base%%.*} 42 | outputfilename=\${prefix}.megahit_asm 43 | /hwfssz1/ST_META/CD/zhujie/program/bioenv/bin/megahit -1 \$read1 -2 \$read2 -r \$reads -o ${outpath}/\$outputfilename --out-prefix \$prefix 44 | fi 45 | done" >${outpath}/megahit.sh 46 | 47 | echo "/hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/bin/hadoop fs -rm -r -skipTrash $dfsoutpath 48 | /hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/bin/hadoop jar /hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/jars/hadoop-streaming-2.6.0-cdh5.11.1.jar -D mapreduce.job.name=\"megahit\" -D mapreduce.job.maps=$maps -D mapreduce.job.reduces=0 -D mapreduce.map.memory.mb=25600 -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat -input file:$fqfilelist -output $dfsoutpath -mapper \"sh megahit.sh\" -file ${outpath}/megahit.sh 49 | 50 | /hwfssz1/BIGDATA_COMPUTING/hadoop/job_submit/10.53.20.169/CDH/bin/hadoop fs -rm -r -skipTrash $dfsoutpath" >${outpath}/megahit_hadoopsubmit.sh 51 | fi 52 | 53 | -------------------------------------------------------------------------------- /scripts/get_bin_id_by_ccsh.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import csv 4 | import os 5 | import re 6 | 7 | 8 | def get_bin_id(checkmout_list, out_tsv, completeness, contamination): 9 | headers = [ 10 | "sample_id", "bin_id", "marker_lineage", "genomes", "markers", 11 | "marker_sets", "completeness", "contamination", "strain_heterogeneity" 12 | ] 13 | samples_bin_info = [] 14 | with open(checkmout_list, "r") as list_handle: 15 | for checkmout in list_handle: 16 | with open(checkmout.strip(), 'r') as checkmout_handle: 17 | print("processing %s" % checkmout.strip()) 18 | sample_id = os.path.basename(checkmout.strip()).split('.')[0] 19 | next(checkmout_handle) 20 | next(checkmout_handle) 21 | next(checkmout_handle) 22 | for info in checkmout_handle: 23 | if info.strip().startswith("R0"): 24 | info_l = re.split(r'\s+', info.strip()) 25 | if (float(info_l[-2]) < contamination) and (float( 26 | info_l[-3]) > completeness): 27 | bin_info = {} 28 | bin_info['sample_id'] = sample_id 29 | bin_info["bin_id"] = info_l[0] 30 | bin_info[ 31 | "marker_lineage"] = info_l[1] + " " + info_l[2] 32 | bin_info["genomes"] = info_l[3] 33 | bin_info["markers"] = info_l[4] 34 | bin_info["marker_sets"] = info_l[5] 35 | bin_info["completeness"] = info_l[-3] 36 | bin_info["contamination"] = info_l[-2] 37 | bin_info["strain_heterogeneity"] = info_l[-1] 38 | samples_bin_info.append(bin_info) 39 | with open(out_tsv, 'w') as out_handle: 40 | f_tsv = csv.DictWriter(out_handle, headers, delimiter="\t") 41 | f_tsv.writeheader() 42 | f_tsv.writerows(samples_bin_info) 43 | 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser( 47 | description='''get bin id by completeness cutoff and contamination 48 | cutoff''') 49 | parser.add_argument('-l', type=str, help='checkmout list of many samples') 50 | parser.add_argument( 51 | '-o', 52 | type=str, 53 | help='bin id and completeness, contamination output file') 54 | parser.add_argument( 55 | '-c1', type=float, help='completeness cutoff', default=70.0) 56 | parser.add_argument( 57 | '-c2', type=float, 
help='contamination cutoff', default=30.0) 58 | args = parser.parse_args() 59 | get_bin_id(args.l, args.o, args.c1, args.c2) 60 | 61 | 62 | if __name__ == '__main__': 63 | main() 64 | -------------------------------------------------------------------------------- /scripts/insert_size_ploter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import pandas as pd 5 | from glob import glob 6 | import os 7 | import re 8 | from plotnine import * 9 | 10 | 11 | def parse_bam_stats(bam_stats_list): 12 | insert_size_df = pd.DataFrame() 13 | bam_stats_list_ = [] 14 | if re.search(r'\*', bam_stats_list[0]): 15 | bam_stats_list_ = glob(bam_stats_list[0]) 16 | else: 17 | bam_stats_list_ = bam_stats_list 18 | 19 | for bam_stats_file in bam_stats_list_: 20 | df = pd.DataFrame(columns=["insert_size", "pairs_total", 21 | "inward_oriented_pairs", 22 | "outward_oriented_pairs", 23 | "other_pairs", "sample_id"]) 24 | sample_id = os.path.basename(bam_stats_file).split(".")[0] 25 | with open(bam_stats_file, 'r') as ih: 26 | for line in ih: 27 | if line.startswith("IS"): 28 | line_list = re.split(r'\s+', line.strip()) 29 | df = df.append({"sample_id": sample_id, 30 | "insert_size": line_list[1], 31 | "pairs_total": line_list[2], 32 | "inward_oriented_pairs": line_list[3], 33 | "outward_oriented_pairs": line_list[4], 34 | "other_pairs": line_list[5]}, ignore_index=True) 35 | 36 | insert_size_df = pd.concat([insert_size_df, df]) 37 | return insert_size_df 38 | 39 | 40 | def plot_insert_size(insert_size_df, outpdf): 41 | df_l = insert_size_df.melt(id_vars=["insert_size", "sample_id"], 42 | value_vars=["pairs_total", 43 | "inward_oriented_pairs", 44 | "outward_oriented_pairs", 45 | "other_pairs"], 46 | var_name="type", 47 | value_name="count") 48 | is_plot = (ggplot(df_l, aes(x='insert_size', y='count')) 49 | + geom_point(aes(fill='type', colour='type'), size=0.2) 50 | + facet_wrap('~sample_id', scales='free') 51 | + ggtitle('insert size distribution')) 52 | is_plot.save(outpdf, width=16, height=16) 53 | 54 | 55 | def main(): 56 | parser = argparse.ArgumentParser('plot insert size for samtools bamstats') 57 | parser.add_argument('-i', nargs='*', help='bamstats file list, separated by spaces') 58 | parser.add_argument('-o', type=str, help='insert size plot output, pdf format') 59 | 60 | args = parser.parse_args() 61 | 62 | df = parse_bam_stats(args.i) 63 | plot_insert_size(df, args.o) 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /scripts/megahit_sge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 2 | import shutil 3 | import os 4 | from datetime import datetime 5 | import argparse 6 | 7 | from asub import submit_job 8 | 9 | # TODO 10 | #def assembly(fqlist): 11 | 12 | def coassembly(fqlist, thread, outdir, prefix, queue, project, resource): 13 | r1 = [] 14 | r2 = [] 15 | with open(fqlist, 'r') as in_handle: 16 | for line in in_handle: 17 | fq_1, fq_2 = line.strip().split("\t") 18 | r1.append(os.path.abspath(fq_1)) 19 | r2.append(os.path.abspath(fq_2)) 20 | pe1 = ",".join(r1) 21 | pe2 = ",".join(r2) 22 | coasm_shell = "%s -1 %s -2 %s -t %d --out-dir %s --out-prefix %s\n" % (shutil.which("megahit"), pe1, pe2, thread, outdir, prefix) 23 | print(coasm_shell) 24 | 25 | with open("./megahit_coasm.sh", 'w') as sh_h: 26 | sh_h.write(coasm_shell) 27 | with 
open("./megahit_coasm_submit.sh", 'w') as sge_h: 28 | sge_h.write("qsub -cwd -q %s -P %s -l %s megahit_coasm.sh\n" % (queue, project, resource)) 29 | 30 | ''' 31 | jobname = "megahit_coasm" + "_" + datetime.now().strftime("%Y%m%d%H%M%S") 32 | logdir = jobname + "_qsub" 33 | if os.path.exists(logdir): 34 | os.remove(logdir) 35 | os.makedirs(logdir) 36 | 37 | jobfile = os.path.join(logdir, jobname + "_1.sh") 38 | with open(jobfile, 'w') as out_handle: 39 | out_handle.write(coasm_shell) 40 | 41 | submit_job(jobname, 1, queue, project, resource, logdir) 42 | ''' 43 | 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser(description='using megahit to do assembly or coassembly') 47 | parser.add_argument('-asm', action='store_true', help='do assembly', default=False) 48 | parser.add_argument('-coasm', action='store_true', help='do coassembly', default=False) 49 | parser.add_argument('-fqlist', type=str, help='clean pair-ended reads, each line format: reads_1.fq.gz reads_2.fq.gz') 50 | parser.add_argument('-thread', type=int, help="number of CPU threads, at least 2 if GPU enabled. [# of logical processors]", default=8) 51 | parser.add_argument('-outdir', type=str, help='output directory', default="coasm_results") 52 | parser.add_argument('-prefix', type=str, help='coassembly prefix', default="megahit_coasm.out") 53 | parser.add_argument('-queue', type=str, help='submit queue', default='st.q') 54 | parser.add_argument('-project', type=str, help='project id', default='F16ZQSB1SY2779') 55 | parser.add_argument('-resource',type=str, help='resourse requirment', default='vf=30G,p=8') 56 | 57 | args = parser.parse_args() 58 | 59 | assert int(args.resource.split("=")[2]) == args.thread, "please let p number equal thread number" 60 | 61 | #if args.asm: 62 | # assembly(args.fqlist) 63 | 64 | if args.coasm: 65 | coassembly(args.fqlist, args.thread, args.outdir, args.prefix, args.queue, args.project, args.resource) 66 | 67 | if __name__ == '__main__': 68 | main() -------------------------------------------------------------------------------- /scripts/split_mummer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import errno 4 | import os 5 | import shutil 6 | 7 | from asub import submit_job 8 | from split_fx import split_fasta 9 | 10 | 11 | def gen_job(qry_fa, min_cluster, split_num, split_dir, job_dir, results_dir): 12 | nucmer = shutil.which("nucmer") 13 | for i in range(1, split_num + 1): 14 | # split/split_1.fa 15 | # job/mummer_1.sh 16 | # split/split_2.fa 17 | # job/mummer_2.sh 18 | # results/nucmer_1.delta 19 | job_sh = os.path.join(job_dir, "mummer_%i.sh" % (i)) 20 | ref_fa = os.path.join(split_dir, "split_%i.fa" % (i)) 21 | prefix = os.path.join(results_dir, "nucmer_%i" % (i)) 22 | with open(job_sh, 'w') as job_h: 23 | job_h.write("%s -maxmatch -c %d %s %s -p %s\n" % (nucmer, min_cluster, ref_fa, qry_fa, prefix)) 24 | 25 | # TODO 26 | # def merge(): 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser(description='''split reference, submit mummer array job to SGE, finally merge mummer results''') 30 | parser.add_argument('-ref', type=str, help='reference fasta file') 31 | parser.add_argument('-qry', type=str, help='query fasta file') 32 | parser.add_argument('-c', type=int, help='Sets the minimum length of a cluster of matches, default: 65', default=65) 33 | parser.add_argument('-size', type=int, help='how many seq records split into a group, default: 10000', default=10000) 34 | 
parser.add_argument('-outdir', type=str, help='output directory, default: ./', default="./") 35 | parser.add_argument('-queue', type=str, help='submit queue, default: st.q', default='st.q') 36 | parser.add_argument('-project', type=str, help='project id, default: F16ZQSB1SY2779', default='F16ZQSB1SY2779') 37 | parser.add_argument('-resource',type=str, help='resourse requirment, default: vf=1G,p=1', default='vf=1G,p=1') 38 | args = parser.parse_args() 39 | 40 | # make split, job, results dirs 41 | split_dir = os.path.join(os.path.abspath(args.outdir), "split") 42 | try: 43 | os.makedirs(split_dir) 44 | except OSError as e: 45 | if e.errno != errno.EEXIST: 46 | raise 47 | job_dir = os.path.join(os.path.abspath(args.outdir), "job") 48 | try: 49 | os.makedirs(job_dir) 50 | except OSError as e: 51 | if e.errno != errno.EEXIST: 52 | raise 53 | results_dir = os.path.join(os.path.abspath(args.outdir), "results") 54 | try: 55 | os.makedirs(results_dir) 56 | except OSError as e: 57 | if e.errno != errno.EEXIST: 58 | raise 59 | qry_fa = os.path.abspath(args.qry) 60 | 61 | # split reference fasta 62 | split_num = split_fasta(args.ref, args.size, split_dir, True) 63 | gen_job(qry_fa, args.c, split_num, split_dir, job_dir, results_dir) 64 | submit_job("mummer", split_num, args.queue, args.project, args.resource, job_dir) 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /scripts/assembly_info.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | library(ggplot2) 3 | library(dplyr) 4 | library(tidyr) 5 | library(readr) 6 | library(stringr) 7 | library(argparser) 8 | library(here) 9 | 10 | parse_asm <- function(path_f) 11 | { 12 | return 13 | read_delim(path_f, delim = '\t') %>% 14 | arrange(scaf_L50) %>% 15 | select( 16 | filename, n_scaffolds, scaf_bp, 17 | scaf_N50, scaf_L50, 18 | scaf_N90, scaf_L90, 19 | scaf_max, scaf_n_gt50K, scaf_pct_gt50K, 20 | gc_avg, gc_std) 21 | } 22 | 23 | average_asm <- function(asm_df) 24 | { 25 | return 26 | asm_df %>% 27 | select( 28 | n_scaffolds, scaf_bp, 29 | scaf_N50, scaf_L50, 30 | scaf_N90, scaf_L90, scaf_max, 31 | scaf_n_gt50K, scaf_pct_gt50K, 32 | gc_avg, gc_std) %>% 33 | summarise( 34 | n_scaffolds_average = mean(n_scaffolds), 35 | scaf_bp_average = mean(scaf_bp), 36 | scaf_N50_average = mean(scaf_N50), 37 | scaf_L50_average = mean(scaf_L50), 38 | scaf_N90_average = mean(scaf_N90), 39 | scaf_L90_average = mean(scaf_L90), 40 | scaf_max_average = mean(scaf_max), 41 | scaf_n_gt50K_average = mean(scaf_n_gt50K), 42 | scaf_pct_gt50K_average = mean(scaf_pct_gt50K), 43 | gc_avg_average = mean(gc_avg), 44 | gc_std_average = mean(gc_std)) %>% 45 | gather(key, value) %>% 46 | mutate(value_human = value / 1000) 47 | } 48 | 49 | asm_boxplot <- function(df, title) 50 | { 51 | p <- 52 | df %>% 53 | gather(key, value, -filename) %>% 54 | mutate(key = factor( 55 | key, 56 | levels = c( 57 | "n_scaffolds", "scaf_bp", 58 | "scaf_N50", "scaf_L50", 59 | "scaf_N90", "scaf_L90", 60 | "scaf_max", "scaf_n_gt50K", "scaf_pct_gt50K", 61 | "gc_avg", "gc_std"))) %>% 62 | ggplot(., aes(key, value)) + 63 | geom_boxplot(aes(fill = key), outlier.size = 0.5) + 64 | geom_jitter(size = 1, width = 0.25) + 65 | facet_wrap(~ key, scales = "free") + 66 | theme( 67 | panel.grid = element_blank(), 68 | axis.text.x = element_blank(), 69 | axis.ticks.x = element_blank(), 70 | axis.title = element_blank(), 71 | legend.title = element_blank()) + 72 | ggtitle(title) 
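# Note: the free scales per facet keep metrics with very different ranges (scaf_bp vs. gc_avg)
# readable in one figure, and the jittered points show the per-sample values behind each boxplot.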
73 | return(p) 74 | } 75 | 76 | parser <- arg_parser("plot assembly statistics") %>% 77 | add_argument("--assembly_info", help="assembly statistics info table") %>% 78 | add_argument("--pdf", help="assembly statistics plot", default="assembly_statistics.pdf") 79 | 80 | args <- parse_args(parser) 81 | asm_df <- parse_asm(args$assembly_info) 82 | average_asm_df <- average_asm(asm_df) 83 | plot <- asm_boxplot(asm_df, "8 soil and 2 wood samples megahit assembly statistics") 84 | ggsave(args$pdf, plot, width = 10, height = 10) 85 | -------------------------------------------------------------------------------- /metapi/predictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import gzip 5 | import re 6 | import sys 7 | import argparse 8 | from Bio import SeqIO 9 | 10 | 11 | def parse_gff(gff_file, min_len): 12 | ''' 13 | extract pep id under the requirement of the minimal contig length cutoff from the GFF file generated by prodigal 14 | ''' 15 | save = False 16 | min_len = int(min_len) 17 | pep_id_list = [] 18 | with gzip.open(gff_file, "rt") as ih: 19 | for line in ih: 20 | seq_len = 0 21 | 22 | if line.startswith("##") or line.startswith("# Model Data") or line.strip() == '"': 23 | continue 24 | 25 | elif line.startswith("# Sequence Data"): 26 | line_split = line.strip().split(";") 27 | for token in line_split: 28 | if "seqlen=" in token: 29 | seq_len = int(token[token.find("=") + 1:]) 30 | if seq_len < min_len: 31 | save = False 32 | else: 33 | save = True 34 | elif save: 35 | line_split = re.split("\\s+", line.strip()) 36 | seq_id = line_split[0] 37 | trans_id = line_split[-1].split(";")[0].split("_")[-1] 38 | pep_id = f'''{seq_id}_{trans_id}''' 39 | pep_id_list.append(pep_id) 40 | else: 41 | continue 42 | return pep_id_list 43 | 44 | 45 | def extract_faa(faa_file, pep_id_list, out_file, assembly_group=None): 46 | if os.path.dirname(out_file) != "": 47 | os.makedirs(os.path.dirname(out_file), exist_ok=True) 48 | 49 | with gzip.open(out_file, "wt") as oh: 50 | with gzip.open(faa_file, "rt") as ih: 51 | for seq in SeqIO.parse(ih, "fasta"): 52 | if seq.id in pep_id_list: 53 | if assembly_group is not None: 54 | seq.id = f'''{assembly_group}C{seq.id}''' 55 | seq.name = f'''{assembly_group}C{seq.name}''' 56 | seq.description = f'''{assembly_group}C{seq.description}''' 57 | SeqIO.write(seq, oh, "fasta") 58 | 59 | 60 | def main(): 61 | parser = argparse.ArgumentParser("PEP extractor") 62 | parser.add_argument("--faa-file", dest="faa_file", type=str, required=True, help="protein file, gzipped") 63 | parser.add_argument("--gff-file", dest="gff_file", type=str, required=True, help="gff file, gzipped") 64 | parser.add_argument("--min-contig", dest="min_contig", default=2000, type=int, help="minimal contig length, default: 2000") 65 | parser.add_argument("--out-file", dest="out_file", type=str, required=True, help="output protein file, gzipped") 66 | args = parser.parse_args() 67 | 68 | pep_id_list = parse_gff(args.gff_file, args.min_contig) 69 | if len(pep_id_list) > 0: 70 | extract_faa(args.faa_file, pep_id_list, args.out_file) 71 | else: 72 | sys.exit("Emplty protein file after contigs length control") 73 | 74 | 75 | if __name__ == '__main__': 76 | main() -------------------------------------------------------------------------------- /metapi/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from metapi.configer import metaconfig 4 | from 
metapi.configer import parse_yaml 5 | from metapi.configer import update_config 6 | from metapi.configer import custom_help_formatter 7 | 8 | from metapi.tooler import parse 9 | from metapi.tooler import merge 10 | 11 | from metapi.simulator import parse_genomes 12 | from metapi.simulator import get_simulate_info 13 | from metapi.simulator import simulate_short_reads 14 | 15 | from metapi.sampler import HEADERS 16 | from metapi.sampler import parse_samples 17 | from metapi.sampler import get_reads 18 | from metapi.sampler import get_sample_id 19 | from metapi.sampler import get_sample_id_ 20 | from metapi.sampler import get_samples_id_by_assembly_group 21 | from metapi.sampler import get_samples_id_by_binning_group 22 | from metapi.sampler import get_samples_id_by_assembly_and_binning_group 23 | from metapi.sampler import get_assembly_group_by_binning_group 24 | from metapi.sampler import get_binning_group_by_assembly_group 25 | from metapi.sampler import get_multibinning_group_by_assembly_group 26 | 27 | from metapi.sampler import get_raw_input_list 28 | from metapi.sampler import get_raw_input_dict 29 | 30 | from metapi.sampler import get_samples_for_assembly_list 31 | from metapi.sampler import get_samples_for_assembly_dict 32 | from metapi.sampler import get_samples_for_assembly_megahit 33 | from metapi.sampler import get_samples_for_assembly_idba_ud 34 | from metapi.sampler import get_samples_for_assembly_spades 35 | from metapi.sampler import get_samples_for_assembly_plass 36 | from metapi.sampler import get_samples_for_assembly_opera_ms 37 | from metapi.sampler import get_samples_for_metaquast 38 | 39 | from metapi.sampler import get_samples_bax 40 | from metapi.sampler import get_samples_bax_multi 41 | from metapi.sampler import get_samples_bax_multi_all 42 | from metapi.sampler import get_samples_scaftigs 43 | 44 | from metapi.qcer import change 45 | from metapi.qcer import compute_host_rate 46 | from metapi.qcer import qc_summary_merge 47 | from metapi.qcer import qc_bar_plot 48 | from metapi.qcer import parse_fastp_json 49 | 50 | from metapi.assembler import assembler_init 51 | from metapi.assembler import parse_assembly 52 | from metapi.assembler import parse_assembly_spades_params 53 | 54 | from metapi.aligner import flagstats_summary 55 | 56 | from metapi.predictor import parse_gff 57 | from metapi.predictor import extract_faa 58 | 59 | from metapi.binner import get_binning_info 60 | from metapi.binner import generate_mags 61 | from metapi.binner import extract_mags_report 62 | from metapi.binner import combine_jgi 63 | 64 | from metapi.checkmer import checkm_prepare 65 | from metapi.checkmer import checkm_reporter 66 | 67 | from metapi.classifier import demultiplex 68 | from metapi.classifier import gtdbtk_prepare_from_mags 69 | from metapi.classifier import gtdbtk_prepare_from_genes 70 | 71 | from metapi.taxonomyer import refine_taxonomy 72 | 73 | from metapi.uploader import gen_samples_info 74 | from metapi.uploader import gen_info 75 | 76 | from metapi.__about__ import __version__, __author__ 77 | 78 | name = "metapi" 79 | -------------------------------------------------------------------------------- /scripts/metapi_config_update.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | from metapi import configer 6 | 7 | 8 | def update_config( 9 | workdir, 10 | rmhost_host_fasta, 11 | rmhost_bwa_index, 12 | rmhost_bowtie2_index, 13 | kraken2_db, 14 | prof_index_metadata, 15 | 
prof_taxonomy, 16 | prof_jgi_index, 17 | project_id, 18 | ): 19 | 20 | conf_file = os.path.join(workdir, "config.yaml") 21 | cluster_file = os.path.join(workdir, "cluster.yaml") 22 | conf_file_up = os.path.join(workdir, "config_update.yaml") 23 | cluster_file_up = os.path.join(workdir, "cluster_update.yaml") 24 | 25 | conf = configer.parse_yaml(os.path.join(workdir, "config.yaml")) 26 | cluster = configer.parse_yaml(os.path.join(workdir, "cluster.yaml")) 27 | 28 | conf["params"]["rmhost"]["host_fasta"] = rmhost_host_fasta 29 | conf["params"]["rmhost"]["bwa"]["index_prefix"] = rmhost_bwa_index 30 | conf["params"]["rmhost"]["bowtie2"]["index_prefix"] = rmhost_bowtie2_index 31 | conf["params"]["classify"]["kraken2"]["database"] = kraken2_db 32 | conf["params"]["profiling"]["jgi"]["index_metadata"] = prof_index_metadata 33 | conf["params"]["profiling"]["jgi"]["taxonomy"] = prof_taxonomy 34 | conf["params"]["profiling"]["jgi"]["index_prefix"] = prof_jgi_index 35 | 36 | cluster["__default__"]["project"] = project_id 37 | 38 | configer.update_config(conf_file, conf_file_up, conf, remove=False) 39 | os.rename(conf_file_up, conf_file) 40 | 41 | configer.update_config(cluster_file, cluster_file_up, cluster, remove=False) 42 | os.rename(cluster_file_up, cluster_file) 43 | 44 | 45 | def main(): 46 | parser = argparse.ArgumentParser("update metapi config.yaml") 47 | parser.add_argument("-d", "--workdir", type=str, help="work dir", default="./") 48 | parser.add_argument("-a", "--rmhost_host_fasta", type=str, help="rmhost host fasta") 49 | parser.add_argument( 50 | "-i", "--rmhost_bwa_index", type=str, help="rmhost bwa index prefix" 51 | ) 52 | parser.add_argument( 53 | "-I", "--rmhost_bowtie2_index", type=str, help="rmhost bowtie2 index prefix" 54 | ) 55 | parser.add_argument("-k", "--kraken2_db", type=str, help="kraken2 database") 56 | parser.add_argument( 57 | "-m", "--profiling_index_metadata", type=str, help="profiling index metadata" 58 | ) 59 | parser.add_argument( 60 | "-t", "--profiling_taxonomy", type=str, help="profiling taxonomy" 61 | ) 62 | parser.add_argument( 63 | "-j", "--profiling_jgi_index", type=str, help="profiling jgi index prefix" 64 | ) 65 | parser.add_argument("-p", "--project_id", type=str, help="project id") 66 | args = parser.parse_args() 67 | 68 | update_config( 69 | args.workdir, 70 | args.rmhost_host_fasta, 71 | args.rmhost_bwa_index, 72 | args.rmhost_bowtie2_index, 73 | args.kraken2_db, 74 | args.profiling_index_metadata, 75 | args.profiling_taxonomy, 76 | args.profiling_jgi_index, 77 | args.project_id, 78 | ) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /metapi/wrappers/prodigal_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import glob 4 | import os 5 | import stat 6 | import sys 7 | import subprocess 8 | import concurrent.futures 9 | 10 | import pandas as pd 11 | from checkm import prodigal 12 | 13 | 14 | def run_prodigal(input_list): 15 | bin_fa = os.path.abspath(input_list[0]) 16 | output_dir = os.path.abspath(input_list[1]) 17 | 18 | bin_id = os.path.basename(os.path.splitext(os.path.splitext(bin_fa)[0])[0]) 19 | 20 | pep_file = os.path.join(output_dir, bin_id + ".faa") 21 | cds_file = os.path.join(output_dir, bin_id + ".ffn") 22 | gff_file = os.path.join(output_dir, bin_id + ".gff") 23 | 24 | pep_file_gz = pep_file + ".gz" 25 | cds_file_gz = cds_file + ".gz" 26 | gff_file_gz = gff_file + ".gz" 
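# Note: ProdigalRunner below is CheckM's wrapper around prodigal rather than a direct
# subprocess call; overriding its aaGeneFile/ntGeneFile/gffFile attributes points the
# predicted proteins, CDS and GFF at the per-bin paths built above, and the value it
# returns is the translation table CheckM judged best (the checks further down expect 4 or 11).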
27 | 28 | prodigal_runner = prodigal.ProdigalRunner(output_dir) 29 | prodigal_runner.aaGeneFile = pep_file 30 | prodigal_runner.ntGeneFile = cds_file 31 | prodigal_runner.gffFile = gff_file 32 | 33 | best_translation_table = prodigal_runner.run(bin_fa, True) 34 | 35 | if os.path.exists(pep_file) and (os.path.getsize(pep_file) > 0): 36 | subprocess.run(f'''pigz -f {pep_file}''', shell=True) 37 | if os.path.exists(cds_file) and (os.path.getsize(cds_file) > 0): 38 | subprocess.run(f'''pigz -f {cds_file}''', shell=True) 39 | if os.path.exists(gff_file) and (os.path.getsize(gff_file) > 0): 40 | subprocess.run(f'''pigz -f {gff_file}''', shell=True) 41 | else: 42 | subprocess.run(f'''rm -rf {pep_file}''', shell=True) 43 | subprocess.run(f'''rm -rf {cds_file}''', shell=True) 44 | subprocess.run(f'''rm -rf {gff_file}''', shell=True) 45 | 46 | if best_translation_table in [4, 11]: 47 | if (os.path.exists(pep_file_gz)) and (os.path.exists(cds_file_gz)) and (os.path.exists(gff_file_gz)) and (os.stat(pep_file_gz)[stat.ST_SIZE]) > 0: 48 | return (bin_id, bin_fa, pep_file_gz, best_translation_table) 49 | else: 50 | return None 51 | else: 52 | if (os.path.exists(pep_file_gz)) and (os.path.exists(cds_file_gz)) and (os.path.exists(gff_file_gz)) and (os.stat(pep_file_gz)[stat.ST_SIZE]) > 0: 53 | return (bin_id, bin_fa, pep_file_gz, f"unknown: {best_translation_table}") 54 | else: 55 | return None 56 | 57 | 58 | workers = int(sys.argv[1]) 59 | input_mags_dir = os.path.dirname(sys.argv[2]) 60 | output_done = sys.argv[3] 61 | output_dir = os.path.dirname(output_done) 62 | 63 | bin_list = glob.glob(input_mags_dir + "/*.fa.gz") 64 | 65 | input_list = [] 66 | for bin_fa in bin_list: 67 | input_list.append((bin_fa, output_dir)) 68 | 69 | table_list = [] 70 | 71 | 72 | subprocess.run(f'''rm -rf {output_dir}''', shell=True) 73 | subprocess.run(f'''mkdir -p {output_dir}''', shell=True) 74 | 75 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 76 | for table_df in executor.map(run_prodigal, input_list): 77 | if table_df is not None: 78 | table_list.append(table_df) 79 | 80 | table_df = pd.DataFrame(table_list, columns=["bin_id", "bin_file", "pep_file", "best_translation_table"]) 81 | table_df.to_csv(output_done, sep="\t", index=False) -------------------------------------------------------------------------------- /scripts/merge_checkm_out.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import pandas as pd 5 | import re 6 | from glob import glob 7 | import sys 8 | from pprint import pprint 9 | 10 | 11 | def merge(checkm_list, sort_by): 12 | df = pd.DataFrame() 13 | if re.search(r'\*', checkm_list[0]): 14 | checkm_list_ = glob(checkm_list[0]) 15 | else: 16 | checkm_list_ = checkm_list 17 | for checkm_file in checkm_list_: 18 | checkm_df = pd.DataFrame(columns=["bin_id", "marker_lineage", 19 | "genomes", "markers", "marker_sets", 20 | "0", "1", "2", "3", "4", "5+", 21 | "completeness", "contamination", "strain_heterogeneity"]) 22 | with open(checkm_file, 'r') as ih: 23 | print("analysis %s" % checkm_file) 24 | next(ih), next(ih), next(ih) 25 | for line in ih: 26 | if not line.startswith("--"): 27 | line_list = re.split(r'\s+', line.strip()) 28 | checkm_df = checkm_df.append({"bin_id": line_list[0], 29 | "marker_lineage": "-".join(line_list[1:3]), 30 | "genomes": line_list[3], 31 | "markers": line_list[4], 32 | "marker_sets": line_list[5], 33 | "0": line_list[6], 34 | "1": line_list[7], 35 | "2": 
line_list[8], 36 | "3": line_list[9], 37 | "4": line_list[10], 38 | "5+": line_list[11], 39 | "completeness": line_list[12], 40 | "contamination": line_list[13], 41 | "strain_heterogeneity": line_list[14]}, ignore_index=True) 42 | df = pd.concat([df, checkm_df]) 43 | if sort_by == "completeness": 44 | df = df.sort_values(by=["completeness", "contamination", "strain_heterogeneity"], 45 | ascending=[False, True, True]) 46 | else: 47 | df = df.sort_values(by="bin_id") 48 | return df 49 | 50 | 51 | def main(): 52 | parser = argparse.ArgumentParser("merge many checkm out txt to one") 53 | parser.add_argument('-l', '--list', nargs='*', help='checkm out txt list, separated by spaces') 54 | parser.add_argument('-o', '--output', default=sys.stdout, 55 | help='merge results, if not specific it, will print stdout') 56 | parser.add_argument('-s', '--sort', choices=['bin_id', 'completeness'], default="completeness", 57 | help='sort merged checkm output') 58 | args = parser.parse_args() 59 | 60 | df = merge(args.list, args.sort) 61 | df.to_csv(args.output, sep='\t', index=False) 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /metapi/simulator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import gzip 5 | import sys 6 | import subprocess 7 | import pandas as pd 8 | from Bio import SeqIO 9 | 10 | 11 | def parse_genomes(samples_tsv, output_dir, check_samples=False): 12 | header = ["id", "genome", "abundance", "reads_num", "model"] 13 | 14 | genomes_df = pd.read_csv(samples_tsv, sep="\t").set_index("id", drop=False) 15 | 16 | cancel = False 17 | for i in header: 18 | if i not in genomes_df.columns: 19 | cancel = True 20 | print(f'Error: {i} not in {genomes_df.columns} header') 21 | 22 | for i in genomes_df.index.unique(): 23 | if "." 
in i: 24 | cancel = True 25 | print('Error: sample id %s contains ".", please remove all "."' % i) 26 | 27 | if cancel: 28 | sys.exit(1) 29 | 30 | genomes_df["fq1"] = genomes_df.apply( 31 | lambda x: os.path.join( 32 | output_dir, "short_reads/%s.simulate.1.fq.gz" % x["id"], 33 | ), 34 | axis=1, 35 | ) 36 | genomes_df["fq2"] = genomes_df.apply( 37 | lambda x: os.path.join( 38 | output_dir, "short_reads/%s.simulate.2.fq.gz" % x["id"], 39 | ), 40 | axis=1, 41 | ) 42 | return genomes_df 43 | 44 | 45 | def simulate_short_reads( 46 | genomes, output_prefix, r1, r2, abunf, model, reads_num, abundance, threads, logf, 47 | ): 48 | if len(abundance) != 0: 49 | with open(abunf, "w") as outh: 50 | for (g, a) in zip(genomes, abundance): 51 | inh = gzip.open(g, "rt") if g.endswith(".gz") else open(g, "r") 52 | genome = [] 53 | total_len = 0 54 | for record in SeqIO.parse(inh, "fasta"): 55 | total_len += len(record.seq) 56 | genome.append((record.id, len(record.seq))) 57 | for s in genome: 58 | outh.write("%s\t%f\n" % 59 | (s[0], float(a) * s[1] / total_len)) 60 | inh.close() 61 | 62 | args = ( 63 | ["iss", "generate", "--cpus", str(threads), "--genomes"] 64 | + genomes 65 | + ["--n_reads", reads_num, "--model", model, "--output", output_prefix] 66 | ) 67 | 68 | if len(abundance) != 0: 69 | args += ["--abundance_file", abunf] 70 | print(" ".join(args)) 71 | env = os.environ.copy() 72 | proc = subprocess.Popen( 73 | args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, encoding="utf-8", 74 | ) 75 | output, error = proc.communicate() 76 | 77 | with open(logf, "w") as logh: 78 | logh.write(error) 79 | 80 | if proc.returncode == 0: 81 | if len(abundance) == 0: 82 | default_abunf = output_prefix + "_abundance.txt" 83 | if os.path.exists(default_abunf): 84 | os.rename(default_abunf, abunf) 85 | subprocess.run(f"pigz -p {threads} {output_prefix}_R1.fastq", shell=True) 86 | subprocess.run(f"pigz -p {threads} {output_prefix}_R2.fastq", shell=True) 87 | os.rename(f"{output_prefix}_R1.fastq.gz", r1) 88 | os.rename(f"{output_prefix}_R2.fastq.gz", r2) 89 | else: 90 | sys.exit(1) 91 | 92 | 93 | def get_simulate_info(genomes_df, wildcards, col): 94 | return genomes_df.loc[[wildcards.sample], col].dropna().tolist() 95 | -------------------------------------------------------------------------------- /scripts/clstr_szie_tab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #from Bio.SeqIO.FastaIO import SimpleFastaParser 3 | import argparse 4 | import re 5 | 6 | pattern = re.compile(r'\d+\t(\d+)[a-z]{2}, >(.+)\.\.\. \*') 7 | #pattern = re.compile(r'\d+\t(\d+)[a-z]{2},\s>(.+)\.\.\.\s\*') 8 | #pattern = re.compile(r'\d+\t(\d+)nt, >(.+)\.\.\. \*') 9 | #pattern = re.compile(r'\d+\t(\d+)nt,\s>(.+)\.\.\.\s\*') 10 | 11 | # this parser base code comes from Bio.SeqIO.FastaIO.SimpleFastaParser :) 12 | def cdhit_clstr_parser(handle): 13 | """Generator function to iterate over cdhit clstr records (as string tuple) 14 | 15 | >Cluster 0 16 | 0 1131322nt, >k119_12676... * 17 | 1 84315nt, >k119_210239... at -/99.66% 18 | 2 73592nt, >k119_187067... at +/99.86% 19 | 3 70665nt, >k119_160147... at -/99.32% 20 | 4 66352nt, >k119_217379... at +/99.89% 21 | 5 63337nt, >k119_125106... at +/99.28% 22 | 6 63232nt, >k119_150147... at -/99.80% 23 | 7 59840nt, >k119_197728... at +/99.04% 24 | 8 59306nt, >k119_59391... at -/99.00% 25 | >Cluster 5343379 26 | 0 2000nt, >k119_192744... * 27 | >Cluster 5343380 28 | 0 2000nt, >k119_222307... 
* 29 | >Cluster 5343381 30 | 0 2000nt, >k119_232332... * 31 | >Cluster 5343382 32 | 0 2000nt, >k119_241124... * 33 | >Cluster 5343383 34 | 0 2000nt, >k119_253638... * 35 | 36 | """ 37 | #Skip any text before the first record (e.g. blank lines, comments) 38 | seq_id = "" 39 | seq_len = 0 40 | clstr_size = 0 41 | while True: 42 | line = handle.readline() 43 | if line == "": 44 | return # Premature end of file, or just empty? 45 | if line[0] == ">": 46 | break 47 | 48 | while True: 49 | clstr_size = 0 50 | if line[0] != ">": 51 | raise ValueError( 52 | "Records in cdhit cluster file(fasta format) should start with '>' character") 53 | clstr_name = line[1:].rstrip() 54 | line = handle.readline() 55 | while True: 56 | if not line: 57 | break 58 | if line[0] == ">": 59 | break 60 | # lines contain many cluster records 61 | #lines.append(line.rstrip()) 62 | clstr_size += 1 63 | matches = re.search(pattern, line) 64 | if matches: 65 | seq_len = matches.group(1) 66 | seq_id = matches.group(2) 67 | 68 | line = handle.readline() 69 | yield clstr_name, seq_id, seq_len, clstr_size 70 | 71 | if not line: 72 | return # StopIteration 73 | 74 | assert False, "Should not reach this line" 75 | 76 | def clstr_size_tab(clstr_file, clstr_size_out): 77 | with open(clstr_size_out, 'w') as out_handle: 78 | out_handle.write("cluster_name\tcluster_size\tsequence_id\tsequence_length\n") 79 | with open(clstr_file, 'r') as clstr_handle: 80 | for clstr_name, seq_id, seq_len, clstr_size in cdhit_clstr_parser(clstr_handle): 81 | clstr_name = "cluster_" + clstr_name.split(' ')[1] 82 | out_handle.write(clstr_name + "\t" + str(clstr_size) + "\t" + seq_id + "\t" + str(seq_len) + "\n") 83 | 84 | def main(): 85 | parser = argparse.ArgumentParser(description='parse cdhit cluster file and get cluste size distribution') 86 | parser.add_argument('--clstr', type=str, help='cluster file') 87 | parser.add_argument('--out', type=str, help='cluster size distribution') 88 | args = parser.parse_args() 89 | 90 | clstr_size_tab(args.clstr, args.out) 91 | 92 | if __name__ == '__main__': 93 | main() 94 | 95 | 96 | -------------------------------------------------------------------------------- /metapi/wrappers/gtdbtk_postprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import subprocess 4 | import concurrent.futures 5 | from pprint import pprint 6 | 7 | 8 | def parse(stats_file): 9 | if os.path.exists(stats_file): 10 | try: 11 | df = pd.read_csv(stats_file, sep="\t") 12 | except pd.errors.EmptyDataError: 13 | print("%s is empty, please check" % stats_file) 14 | return None 15 | 16 | if not df.empty: 17 | return df 18 | else: 19 | return None 20 | else: 21 | print("%s is not exists" % stats_file) 22 | return None 23 | 24 | 25 | def merge(input_list, func, workers, **kwargs): 26 | df_list = [] 27 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 28 | for df in executor.map(func, input_list): 29 | if df is not None: 30 | df_list.append(df) 31 | 32 | df_ = pd.concat(df_list) 33 | 34 | if "output" in kwargs: 35 | df_.to_csv(kwargs["output"], sep="\t", index=False) 36 | return df_ 37 | 38 | 39 | threads = int(snakemake.threads) 40 | 41 | gtdb_done_list = snakemake.input["gtdb_done"] 42 | 43 | gtdb_to_ncbi_script = snakemake.params["gtdb_to_ncbi_script"] 44 | metadata_archaea = snakemake.params["metadata_archaea"] 45 | metadata_bacteria = snakemake.params["metadata_bacteria"] 46 | 47 | table_gtdb = snakemake.output["table_gtdb"] 48 | 
table_ncbi = snakemake.output["table_ncbi"] 49 | table_all = snakemake.output["table_all"] 50 | 51 | os.makedirs(os.path.dirname(table_all), exist_ok=True) 52 | 53 | gtdb_list = [] 54 | ncbi_list = [] 55 | 56 | for i in gtdb_done_list: 57 | out_dir = os.path.dirname(i) 58 | archaea_tsv = os.path.join(out_dir, "gtdbtk.archaea.summary.tsv") 59 | bacteria_tsv = os.path.join(out_dir, "gtdbtk.bacteria.summary.tsv") 60 | 61 | if os.path.exists(archaea_tsv): 62 | gtdb_list.append(archaea_tsv) 63 | if os.path.exists(bacteria_tsv): 64 | gtdb_list.append(bacteria_tsv) 65 | 66 | gtdb_to_ncbi_summary = os.path.join(out_dir, "gtdbtk.ncbi.summary.tsv") 67 | gtdb_to_ncbi_log = os.path.join(out_dir, "gtdbtk.to.ncbi.log") 68 | 69 | archaea_cmd = "--ar53_metadata_file" 70 | if "ar122" in os.path.realpath(archaea_tsv): 71 | archaea_cmd = "--ar122_metadata_file" 72 | 73 | bacteria_cmd = "--bac120_metadata_file" 74 | 75 | gtdb_to_ncbi_cmd = \ 76 | f''' 77 | python {gtdb_to_ncbi_script} \ 78 | --gtdbtk_output_dir {out_dir} \ 79 | --output_file {gtdb_to_ncbi_summary} \ 80 | {archaea_cmd} {metadata_archaea} \ 81 | {bacteria_cmd} {metadata_bacteria} \ 82 | > {gtdb_to_ncbi_log} 83 | ''' 84 | subprocess.run(gtdb_to_ncbi_cmd, shell=True) 85 | 86 | if os.path.exists(gtdb_to_ncbi_summary): 87 | ncbi_list.append(gtdb_to_ncbi_summary) 88 | 89 | 90 | if len(gtdb_list) > 0: 91 | table_gtdb_df = merge(gtdb_list, parse, threads, output=table_gtdb) 92 | else: 93 | print(f"No {table_gtdb} generate") 94 | 95 | if len(ncbi_list) > 0: 96 | table_ncbi_df = merge(ncbi_list, parse, threads, output=table_ncbi) 97 | else: 98 | print(f"No {table_ncbi} generate") 99 | 100 | 101 | table_gtdb_df = table_gtdb_df.rename(columns={"classification": "GTDB classification"}) 102 | pprint(table_gtdb_df) 103 | 104 | table_ncbi_df = table_ncbi_df.rename(columns={"Genome ID": "user_genome"}) 105 | pprint(table_ncbi_df) 106 | 107 | table_all_df = pd.merge( 108 | table_gtdb_df, table_ncbi_df, how="inner", 109 | on=["user_genome", "GTDB classification"])#\ 110 | 111 | table_all_df.to_csv(table_all, sep="\t", index=False) 112 | -------------------------------------------------------------------------------- /scripts/rename_fasta_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from Bio import SeqIO, bgzf 3 | from Bio.SeqIO.FastaIO import FastaIterator, FastaWriter 4 | import gzip 5 | import sys 6 | import os 7 | import argparse 8 | 9 | #with open(sys.argv[2], 'w') as fa_out: 10 | # with open(sys.argv[1], 'r') as fa_in: 11 | # for rec in SeqIO.parse(fa_in, 'fasta'): 12 | # (description, sample_name) = rec.description.split("\t") 13 | # rec.description = sample_name + "_" + description 14 | # rec.id = rec.description.split(' ')[0] 15 | # SeqIO.write(rec, fa_out, 'fasta') 16 | 17 | def change_header_sample(title): 18 | # title(total header) -> (id, name, description) 19 | # R0170300050_tooth_RA.contigs.fa 20 | #from > k119_1 flag=1 multi=7.0000 len=3284 R0170300050_tooth_RA 21 | #to > R0170300050_tooth_RA_k119_1 flag=1 multi=7.0000 len=3284 22 | (one_line, sample_name) = title.split("\t") 23 | id = sample_name + "_" + title.split(' ')[0] 24 | desc = id + ' ' + ' '.join(one_line.split(' ')[1:]) 25 | return id, "", desc 26 | 27 | def change_header_no_sample(title): 28 | # title(total header) -> (id, name, description) 29 | # R0170300050_tooth_RA.contigs.fa 30 | #from > k119_1 flag=1 multi=7.0000 len=3284 31 | #to > R0170300050_tooth_RA_k119_1 flag=1 multi=7.0000 len=3284 32 | id = 
sample_tag + "_" + title.split(' ')[0] 33 | desc = id + " " + " ".join(title.split(' ')[1:]) 34 | return id, "", desc 35 | 36 | 37 | ## rename header framework 38 | ## just change header_function 39 | def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz): 40 | if in_gz: 41 | in_h = gzip.open(fa_in, 'rt') 42 | else: 43 | in_h = open(fa_in, 'r') 44 | if gz: 45 | out_h = bgzf.BgzfWriter(fa_out, 'wb') 46 | else: 47 | out_h = open(fa_out, 'w') 48 | writer = FastaWriter(out_h) 49 | writer.write_header() 50 | for rec in FastaIterator(in_h, title2ids = header_function): 51 | writer.write_record(rec) 52 | writer.write_footer() 53 | out_h.close() 54 | in_h.close() 55 | 56 | def main(): 57 | ''' 58 | Why write this script ? 59 | Becaust megahit always generate knum_num format contigs id 60 | ''' 61 | parser = argparse.ArgumentParser(description='change fasta file header') 62 | parser.add_argument('-fa', type=str, help='fasta file path') 63 | parser.add_argument('-out', type=str, help='output') 64 | parser.add_argument('-rm', action='store_true', help='delete original fasta file', default=False) 65 | parser.add_argument('-gz', action='store_true', help='compress output fasta file', default=False) 66 | parser.add_argument('-mv', action='store_true', help="rename change id fasta file to original file", default=False) 67 | args = parser.parse_args() 68 | 69 | #assert not args.fa == args.out, "input file name can't equal to output file name" 70 | if (args.out == args.fa) or (not args.out): 71 | args.out = args.fa + ".changeid" 72 | if args.gz: 73 | if not args.out.endswith(".gz"): 74 | args.out = args.out + ".gz" 75 | 76 | in_gz = args.fa.endswith(".gz") 77 | #if args.fa.endswith(".gz"): 78 | # args.gz = True 79 | 80 | global sample_tag 81 | sample_tag = os.path.basename(args.fa).split(".")[0] 82 | 83 | abs_in = os.path.abspath(args.fa) 84 | abs_out = os.path.abspath(args.out) 85 | reheader_fasta(abs_in, abs_out, change_header_no_sample, in_gz, args.gz) 86 | 87 | if args.rm: 88 | os.remove(abs_in) 89 | if args.mv: 90 | if (not in_gz) and args.gz: 91 | abs_in = abs_in + ".gz" 92 | if in_gz and (not args.gz): 93 | abs_in = abs_in.rstrip(".gz") 94 | os.rename(abs_out, abs_in) 95 | 96 | if __name__ == '__main__': 97 | main() -------------------------------------------------------------------------------- /scripts/asub.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # please see https://github.com/lh3/asub 3 | import argparse 4 | import fileinput 5 | import os 6 | import re 7 | import shutil 8 | import stat 9 | import subprocess 10 | import sys 11 | from datetime import datetime 12 | 13 | __author__ = 'Jie Zhu, Jiahui Zhu' 14 | __email__ = 'zhujie@genomics.cn, zhujiahui@genomics.cn' 15 | __version__ = '0.3.1' 16 | __date__ = 'Jun 19, 2018' 17 | 18 | 19 | def parse_job(job_name, job_file, a_job_line, logdir): 20 | with fileinput.input(files=job_file if not job_file is None else ('-', )) as in_h: 21 | job_num = 0 22 | for one_line in in_h: 23 | job_num += 1 24 | job_f = os.path.join(logdir, job_name.rstrip(".sh") + "_" + str(job_num) + ".sh") 25 | with open(job_f, 'w') as job_h: 26 | job_h.write(one_line) 27 | while fileinput.lineno() % a_job_line != 0: 28 | job_h.write(next(in_h)) 29 | #for i in range(1, a_job_line): 30 | # job_h.write(next(in_h)) 31 | return job_num 32 | 33 | 34 | def submit_job(job_name, total_job_num, queue, prj_id, resource, logdir): 35 | submit_f = os.path.join(os.path.curdir, job_name.rstrip(".sh") + "_submit.sh") 
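# Note on the submission model used below: parse_job() has already written one per-task
# script per job, and the single submit script generated here uses the SGE "#$ -t 1-N:1"
# array directive plus $SGE_TASK_ID so that a single qsub call dispatches all N tasks to
# their own scripts and log files.
# Hedged usage sketch (the command file name is hypothetical; flags come from main() below):
#   cat commands.sh | python asub.py -jobname demo -queue st.q -resource vf=1G,p=1
# With -jobline 1 (the default) each input line becomes one array task.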
36 | array_range = "1-" + str(total_job_num) + ":1" 37 | job_script = os.path.join(logdir, job_name.rstrip(".sh") + "_$SGE_TASK_ID.sh") 38 | num_proc = resource.split('=')[-1] 39 | with open(submit_f, 'w') as submit_h: 40 | submit_h.write('''#!/bin/bash\n\ 41 | #$ -clear 42 | #$ -S /bin/bash 43 | #$ -N %s 44 | #$ -cwd 45 | #$ -l %s 46 | #$ -binding linear:%s 47 | #$ -q %s 48 | #$ -P %s 49 | #$ -t %s 50 | jobscript=%s 51 | bash $jobscript\n''' % (job_name, resource, num_proc, queue, prj_id, array_range, job_script)) 52 | 53 | os.chmod(submit_f, stat.S_IRWXU) 54 | submit_cmd = shutil.which("qsub") + \ 55 | " -e " + os.path.join(logdir, job_name + "_\\$TASK_ID.e") + \ 56 | " -o " + os.path.join(logdir, job_name + "_\\$TASK_ID.o") + " " + submit_f 57 | print(submit_cmd) 58 | subprocess.call(submit_cmd, shell=True) 59 | 60 | def main(): 61 | '''a very simple script to submit an array job; you need to supply the real run commands''' 62 | parser = argparse.ArgumentParser(description='make submitting array jobs easy') 63 | parser.add_argument('-jobfile', nargs='*', help='job file to read, if empty, stdin is used') 64 | parser.add_argument('-jobname', type=str, help='job name', default='job') 65 | parser.add_argument('-jobline', type=int, help='set the number of lines to form a job', default=1) 66 | parser.add_argument('-queue', type=str, help='submit queue', default='st.q') 67 | parser.add_argument('-project', type=str, help='project id', default='F16ZQSB1SY2779') 68 | parser.add_argument('-resource', type=str, help='resource requirement', default='vf=1G,p=1') 69 | parser.add_argument('-logdir', type=str, help='array job log directory') 70 | args = parser.parse_args() 71 | 72 | assert re.match(r'vf=[\d\.]+\w,p=\d+', args.resource), "please specify memory usage and processor number" 73 | assert not re.match(r'^\d+', args.jobname), "array job name cannot start with a digit" 74 | assert args.jobline >= 1, "jobline must be at least 1" 75 | 76 | args.jobname += "_" + datetime.now().strftime("%Y%m%d%H%M%S") 77 | 78 | if not args.logdir: 79 | args.logdir = args.jobname + "_qsub" 80 | args.logdir = args.logdir.rstrip("/") + "/" 81 | 82 | if os.path.exists(args.logdir): 83 | shutil.rmtree(args.logdir) 84 | os.makedirs(args.logdir) 85 | 86 | total_job_num = parse_job(args.jobname, args.jobfile, args.jobline, args.logdir) 87 | submit_job(args.jobname, total_job_num, args.queue, args.project, args.resource, args.logdir) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /metapi/rules/upload.smk: -------------------------------------------------------------------------------- 1 | if config["upload"]["do"]: 2 | rule upload_generate_samples_info: 3 | input: 4 | config["params"]["samples"] 5 | output: 6 | os.path.join(config["output"]["upload"], "table/MIxS_Samples.xlsx") 7 | run: 8 | metapi.gen_samples_info(SAMPLES, output[0], config) 9 | 10 | 11 | rule upload_md5_short_reads: 12 | input: 13 | alignment_input_with_short_reads 14 | output: 15 | os.path.join(config["output"]["upload"], "md5/short_reads/{sample}.md5") 16 | shell: 17 | ''' 18 | md5sum {input} > {output} 19 | ''' 20 | 21 | 22 | rule upload_generate_run_info: 23 | input: 24 | expand(os.path.join( 25 | config["output"]["upload"], "md5/short_reads/{sample}.md5"), 26 | sample=SAMPLES_ID_LIST) 27 | output: 28 | os.path.join(config["output"]["upload"], "table/Experiment_Run.xlsx") 29 | threads: 30 | config["upload"]["threads"] 31 | run: 32 | metapi.gen_info(input, output[0], 
config, threads, "sequencing_run") 33 | 34 | 35 | rule upload_sequencing_all: 36 | input: 37 | os.path.join(config["output"]["upload"], "table/Experiment_Run.xlsx"), 38 | os.path.join(config["output"]["upload"], "table/MIxS_Samples.xlsx") 39 | 40 | 41 | localrules: 42 | upload_generate_samples_info, 43 | upload_generate_run_info, 44 | 45 | 46 | if len(ASSEMBLERS) != 0: 47 | rule upload_md5_scaftigs: 48 | input: 49 | os.path.join( 50 | config["output"]["assembly"], 51 | "scaftigs/{binning_group}.{assembly_group}.{assembler}/{binning_group}.{assembly_group}.{assembler}.scaftigs.fa.gz") 52 | output: 53 | os.path.join( 54 | config["output"]["upload"], 55 | "md5/scaftigs/{assembler}/{binning_group}.{assembly_group}.{assembler}.scaftigs.md5") 56 | shell: 57 | ''' 58 | md5sum {input} > {output} 59 | ''' 60 | 61 | 62 | rule upload_generate_assembly_info: 63 | input: 64 | expand(os.path.join( 65 | config["output"]["upload"], 66 | "md5/scaftigs/{{assembler}}/{binning_group}.{assembly_group}.{{assembler}}.scaftigs.md5"), 67 | zip, 68 | binning_group=ASSEMBLY_GROUP["binning_group"], 69 | assembly_group=ASSEMBLY_GROUP["assembly_group"]) 70 | output: 71 | os.path.join( 72 | config["output"]["upload"], 73 | "table/Genome_Assembly_{assembler}.xlsx") 74 | threads: 75 | config["upload"]["threads"] 76 | run: 77 | metapi.gen_info(input, output[0], config, threads, "assembly") 78 | 79 | 80 | rule upload_assembly_all: 81 | input: 82 | expand(os.path.join( 83 | config["output"]["upload"], 84 | "table/Genome_Assembly_{assembler}.xlsx"), 85 | assembler=ASSEMBLERS) 86 | 87 | 88 | localrules: 89 | upload_generate_assembly_info 90 | 91 | 92 | else: 93 | rule upload_assembly_all: 94 | input: 95 | 96 | else: 97 | rule upload_sequencing_all: 98 | input: 99 | 100 | 101 | rule upload_assembly_all: 102 | input: 103 | 104 | 105 | rule upload_all: 106 | input: 107 | rules.upload_sequencing_all.input, 108 | rules.upload_assembly_all.input#, 109 | 110 | 111 | localrules: 112 | upload_sequencing_all, 113 | upload_assembly_all, 114 | upload_all -------------------------------------------------------------------------------- /scripts/kraken2_demultiplex_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import sys 5 | import pandas as pd 6 | import pickle 7 | from pprint import pprint 8 | from taxadb.taxid import TaxID 9 | from taxadb.names import SciName 10 | 11 | 12 | def main(args_): 13 | parser = argparse.ArgumentParser("a summary of kraken2 demultiplex pickle") 14 | parser.add_argument( 15 | '--rank', 16 | choices=["superkingdom", "phylum", "class", "order", "family", "genus", "species"], 17 | default="genus", 18 | help='mini rank for merge' 19 | ) 20 | parser.add_argument( 21 | '--taxadb', 22 | type=str, 23 | help='taxonomy database' 24 | ) 25 | parser.add_argument( 26 | '-p', 27 | '--pickle_list', 28 | help='kraken2 demultiplex pickle list' 29 | ) 30 | parser.add_argument( 31 | '-o', 32 | '--summary_output' 33 | ) 34 | args = parser.parse_args(args_) 35 | 36 | LINEAGES = ["no_rank", "subspecies", "species", "genus", "family", 37 | "order", "class", "phylum", "superkingdom"] 38 | 39 | RANK = args.rank 40 | if not args.rank in LINEAGES[1:]: 41 | print("wrong rank %s" % args.rank) 42 | sys.exit(1) 43 | 44 | SUB_LINRAGES = LINEAGES[LINEAGES.index(RANK):] 45 | 46 | TAXID_DB = TaxID(dbtype='sqlite', dbname=args.taxadb) 47 | NAMES_DB = SciName(dbtype='sqlite', dbname=args.taxadb) 48 | 49 | 50 | def get_parent_taxid(tax_id, tax_name, 
level): 51 | if tax_id == 0: 52 | return "no_rank", 0, "unclassified" 53 | 54 | lineage_dict = TAXID_DB.lineage_id(tax_id, ranks=True) 55 | 56 | if lineage_dict is None: 57 | taxid = NAMES_DB.taxid(tax_name) 58 | if taxid is None: 59 | taxid = NAMES_DB.taxid(tax_name.split()[0]) 60 | if not taxid is None: 61 | lineage_dict = TAXID_DB.lineage_id(taxid, ranks=True) 62 | else: 63 | return "no_rank", tax_id, tax_name 64 | 65 | for rank in SUB_LINRAGES: 66 | if rank in lineage_dict: 67 | return rank, lineage_dict[rank], TAXID_DB.lineage_name(lineage_dict[rank], ranks=True)[rank] 68 | return "no_rank", tax_id, "unclassified" 69 | 70 | 71 | summary_dict = {"taxid": [], 72 | "taxa_name": [], 73 | "reads_count": [], 74 | "rank": [], 75 | "parent_taxid": [], 76 | "parent_taxa_name": []} 77 | 78 | with open(args.pickle_list, 'r') as ih: 79 | for line in ih: 80 | with open(line.strip(), 'rb') as ph: 81 | kk2_ranks_counter = pickle.load(ph) 82 | # pprint(kk2_ranks_counter) 83 | 84 | for taxid in kk2_ranks_counter: 85 | if taxid in summary_dict["taxid"]: 86 | summary_dict["reads_count"][summary_dict["taxid"].index(taxid)] += 2 * kk2_ranks_counter[taxid][1] 87 | else: 88 | summary_dict["taxid"].append(taxid) 89 | summary_dict["taxa_name"].append(kk2_ranks_counter[taxid][0]) 90 | summary_dict["reads_count"].append(2 * kk2_ranks_counter[taxid][1]) 91 | 92 | rank_, taxid_, taxaname_ = get_parent_taxid(taxid, kk2_ranks_counter[taxid][0], RANK) 93 | summary_dict["rank"].append(rank_) 94 | summary_dict["parent_taxid"].append(taxid_) 95 | summary_dict["parent_taxa_name"].append(taxaname_) 96 | 97 | summary_df = pd.DataFrame.from_dict(summary_dict) 98 | 99 | summary_df.to_csv(args.summary_output, index=False, sep='\t') 100 | 101 | 102 | if __name__ == '__main__': 103 | main(sys.argv[1:]) 104 | -------------------------------------------------------------------------------- /scripts/qc_report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import concurrent.futures 5 | import subprocess 6 | import argparse 7 | import os 8 | 9 | def get_reads(df, id_, col_): 10 | return df.loc[[id_], col_].dropna().tolist() 11 | 12 | 13 | def run_(tuple_): 14 | try: 15 | output = subprocess.check_output( 16 | tuple_[0], shell=True, stderr=subprocess.STDOUT, universal_newlines=True 17 | ) 18 | except subprocess.CalledProcessError as e: 19 | print(e.output) 20 | return None 21 | 22 | out_list = output.strip().split("\n") 23 | header = out_list[0].split("\t") 24 | data = [] 25 | 26 | for line in out_list[1:]: 27 | content = tuple(line.split("\t")) 28 | data.append(content) 29 | 30 | df = pd.DataFrame(data, columns=header) 31 | df["id"] = tuple_[1] 32 | df["step"] = tuple_[2] 33 | df["fq_type"] = tuple_[3] 34 | df["reads"] = tuple_[4] 35 | return df 36 | 37 | 38 | def run(cmd_list, workers): 39 | df_list = [] 40 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 41 | for df in executor.map(run_, cmd_list): 42 | if df is not None: 43 | df_list.append(df) 44 | df_ = pd.concat(df_list) 45 | return df_ 46 | 47 | 48 | def gen(fastq_list, step, fq_encoding, is_pe=True): 49 | fq_df = pd.read_csv(fastq_list, sep="\t").set_index("id") 50 | cmd_list = [] 51 | 52 | for i in fq_df.index.unique(): 53 | fq1_list = get_reads(fq_df, i, "fq1") 54 | if is_pe: 55 | fq2_list = get_reads(fq_df, i, "fq2") 56 | 57 | if is_pe: 58 | if len(fq1_list) == 1: 59 | cmd = "seqkit stats -a -T -b -j 1 -E %s %s %s" % ( 60 | fq_encoding, 61 | 
fq1_list[0], 62 | fq2_list[0], 63 | ) 64 | cmd_list.append((cmd, i, step, "pe", ["fq1", "fq2"])) 65 | else: 66 | cmd_1 = "cat %s | seqkit stats -a -T -b -j 1 -E %s" % ( 67 | " ".join(fq1_list), 68 | fq_encoding, 69 | ) 70 | cmd_list.append((cmd_1, i, step, "pe", ["fq1"])) 71 | cmd_2 = "cat %s | seqkit stats -a -T -b -j 1 -E %s" % ( 72 | " ".join(fq2_list), 73 | fq_encoding, 74 | ) 75 | cmd_list.append((cmd_2, i, step, "pe", ["fq2"])) 76 | else: 77 | cmd = "cat %s | seqkit stats -a -T -b -j 1 -E %s" % ( 78 | " ".join(fq1_list), 79 | fq_encoding, 80 | ) 81 | cmd_list.append((cmd, i, step, "se", ["fq1"])) 82 | return cmd_list 83 | 84 | 85 | def main(): 86 | parser = argparse.ArgumentParser(description="generate quality control report from raw, trimming, rmhost data") 87 | parser.add_argument("--raw_list", help="raw data list, headers: id fq1 fq2") 88 | parser.add_argument("--trimming_list", help="trimming data list, headers: id fq1 fq2") 89 | parser.add_argument("--rmhost_list", help="rmhost data list, headers: id fq1 fq2") 90 | parser.add_argument("--is_se", action='store_true', default=False, help='default: is_pe') 91 | parser.add_argument("--fq_encoding", help="fastq quality encoding, default: sanger", default="sanger") 92 | parser.add_argument("--threads", help="threads, default: 8", default=8) 93 | parser.add_argument("--output", help="qc report output") 94 | 95 | args = parser.parse_args() 96 | 97 | cmd_raw = gen(args.raw_list, "raw", args.fq_encoding, not args.is_se) 98 | cmd_trimming = gen(args.trimming_list, "trimming", args.fq_encoding, not args.is_se) 99 | cmd_rmhost = gen(args.rmhost_list, "rmhost", args.fq_encoding, not args.is_se) 100 | 101 | cmd = cmd_raw + cmd_trimming + cmd_rmhost 102 | 103 | df = run(cmd, args.threads) 104 | df.to_csv(args.output, sep='\t', index=False) 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /scripts/clean_statout_to_matrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ## Metagenomics Institute of BGI Research 3 | ## zhujie@genomics.cn 4 | ## 2017-11-29 5 | ## GPL-V3 6 | 7 | import os 8 | import argparse 9 | 10 | ##TODO 11 | ## clean and SE 12 | 13 | def parse_pe_clean_statout(handle, min_l, max_l): 14 | header_list = [str(i) for i in range(min_l, max_l + 1)] 15 | value_dict = {} 16 | for key in header_list: 17 | value_dict[key] = 0 18 | 19 | ## total info 20 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()): 21 | header_list.append(key) 22 | value_dict[key] = value 23 | 24 | ## reads_1 info 25 | tag = True 26 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()): 27 | if tag: 28 | tag = False 29 | else: 30 | key = key + "_1" 31 | header_list.append(key) 32 | value_dict[key] = value 33 | 34 | ## reads_2 info 35 | tag = True 36 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()): 37 | if tag: 38 | tag = False 39 | else: 40 | key = key + "_2" 41 | header_list.append(key) 42 | value_dict[key] = value 43 | 44 | ## reads_single info 45 | tag = True 46 | for key,value in zip(handle.readline().strip().split(), handle.readline().strip().split()): 47 | if tag: 48 | tag = False 49 | else: 50 | key = key + "_single" 51 | header_list.append(key) 52 | value_dict[key] = value 53 | 54 | ## length info 55 | next(handle) 56 | total_filter_base = 0 57 | total_filter_reads = 0 58 | 
total_filter_reads_len_gt80 = 0 59 | L80 = 0 60 | for line in handle: 61 | line_list = line.strip().split() 62 | reads_len = line_list[0] 63 | reads_num = line_list[1] 64 | total_filter_reads += int(reads_num) 65 | total_filter_base += int(reads_len) * int(reads_num) 66 | if (int(reads_len) >= 80): 67 | total_filter_reads_len_gt80 += int(reads_num) 68 | value_dict[str(reads_len)] = str(reads_num) 69 | 70 | L80 = total_filter_reads_len_gt80 / total_filter_reads 71 | header_list.append("total_filter_base") 72 | value_dict["total_filter_base"] = str(total_filter_base) 73 | header_list.append("total_filter_reads") 74 | value_dict["total_filter_reads"] = str(total_filter_reads) 75 | header_list.append("L80") 76 | value_dict["L80"] = str(L80) 77 | 78 | return (value_dict, header_list) 79 | 80 | def gen_len_matrix(dirname, min_l, max_l): 81 | no_header = True 82 | for fl in os.listdir(dirname): 83 | if fl.endswith("stat_out"): 84 | sample_name = fl.split("/")[-1].split(".")[0] 85 | statout = os.path.join(dirname, fl) 86 | with open(statout, 'r') as h: 87 | tuple_ = parse_pe_clean_statout(h, min_l, max_l) 88 | if no_header: 89 | header = "sample_name\t" + "\t".join(tuple_[1]) 90 | print(header) 91 | body = sample_name + "\t" + "\t".join([tuple_[0][key] for key in tuple_[1]]) 92 | print(body) 93 | no_header = False 94 | else: 95 | body = sample_name + "\t" + "\t".join([tuple_[0][key] for key in tuple_[1]]) 96 | print(body) 97 | 98 | def main(): 99 | parser = argparse.ArgumentParser("convert many clean statout to a matrix\n \ 100 | e.g: python clean_statout_to_matrix.py ../data/clean_statout -m 100 -n 30 > ../data/length_clean_statout.tsv\n") 101 | parser.add_argument("-d", "--statout_dir", help="a directory contain many samples clean statout file") 102 | parser.add_argument("-m", "--max_len", type=int, help="max reads length") 103 | parser.add_argument("-n", "--min_len", type=int, help="min reads length") 104 | args = parser.parse_args() 105 | gen_len_matrix(args.statout_dir, args.min_len, args.max_len) 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /scripts/extract_bins_from_mgs_profile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio.Alphabet import generic_dna 4 | from Bio import SeqIO 5 | import argparse 6 | import re 7 | import os 8 | import time 9 | 10 | 11 | def take_second(elem): 12 | return elem[1] 13 | 14 | 15 | def parse_mgs(mgs_profile): 16 | bins = {} 17 | bins_num = [] 18 | with open(mgs_profile, 'r') as ih: 19 | for line in ih: 20 | line_list = re.split(r"\s+|,", line.strip(",|\n")) 21 | bin_id = line_list[0] 22 | contigs_count = int(line_list[1]) 23 | bins[bin_id] = [] 24 | bins_num.append((bin_id, contigs_count)) 25 | for contig_id in line_list[2:]: 26 | bins[bin_id].append(contig_id) 27 | bins_num.sort(key=take_second, reverse=True) 28 | return bins, bins_num 29 | 30 | 31 | def extract(contigs_list, bins, bins_num, head, tail, outdir): 32 | files = [] 33 | all_count = 0 34 | with open(contigs_list, 'r') as ih: 35 | for line in ih: 36 | files.append(line.strip()) 37 | 38 | begin = time.time() 39 | records = SeqIO.index_db(":memory:", files, "fasta", generic_dna) 40 | end = time.time() 41 | print("index db: %.2f s" % (end - begin)) 42 | 43 | begin = time.time() 44 | 45 | if head is not None: 46 | if head > len(bins_num): 47 | count = len(bins_num) 48 | else: 49 | count = head 50 | all_count += count 51 | for i in 
range(count): 52 | bin_id = bins_num[i][0] 53 | with open(os.path.join(outdir, bin_id + ".fa"), 'w') as oh: 54 | for contig_id in bins[bin_id]: 55 | if contig_id in records: 56 | SeqIO.write(records[contig_id], oh, 'fasta') 57 | else: 58 | print("%s: contig %s not found" % (bin_id, contig_id)) 59 | 60 | if tail is not None: 61 | if tail > len(bins_num): 62 | count = len(bins_num) 63 | else: 64 | count = tail 65 | all_count += count 66 | for i in range(count): 67 | bin_id = bins_num[-(i+1)][0] 68 | with open(os.path.join(outdir, bin_id + ".fa"), 'w') as oh: 69 | for contig_id in bins[bin_id]: 70 | if contig_id in records: 71 | SeqIO.write(records[contig_id], oh, 'fasta') 72 | else: 73 | print("%s: contig %s not found" % (bin_id, contig_id)) 74 | 75 | if (head is None) and (tail is None): 76 | for bin_id in bins: 77 | all_count += 1 78 | with open(os.path.join(outdir, bin_id + ".fa"), 'w') as oh: 79 | for contig_id in bins[bin_id]: 80 | if contig_id in records: 81 | SeqIO.write(records[contig_id], oh, 'fasta') 82 | else: 83 | print("%s: contig %s not found" % (bin_id, contig_id)) 84 | 85 | records.close() 86 | end = time.time() 87 | 88 | print("extract %d bins: %.2f s" % (all_count, end - begin)) 89 | 90 | 91 | def main(): 92 | parser = argparse.ArgumentParser("get bins fasta from mgs contigs/scaffolds profile") 93 | parser.add_argument('-p', '--profile', type=str, help='mgs contigs/scaffolds profile') 94 | parser.add_argument('-l', '--contigs_list', type=str, help='assembly contigs/scaffolds fasta path list') 95 | parser.add_argument('-o', '--outdir', type=str, help='bins output dir') 96 | parser.add_argument('--head', type=int, default=None, help='number of head bins (most contigs) to extract') 97 | parser.add_argument('--tail', type=int, default=None, help='number of tail bins (fewest contigs) to extract') 98 | 99 | args = parser.parse_args() 100 | if not os.path.exists(args.outdir): 101 | os.makedirs(args.outdir, exist_ok=True) 102 | 103 | (bins, bins_num) = parse_mgs(args.profile) 104 | 105 | if (args.head is not None) and (args.tail is not None): 106 | assert args.head + args.tail <= len(bins_num), "head + tail exceeds the number of bins" 107 | 108 | extract(args.contigs_list, bins, bins_num, args.head, args.tail, args.outdir) 109 | 110 | 111 | if __name__ == '__main__': 112 | main() 113 | -------------------------------------------------------------------------------- /scripts/filter_pe_fastq_by_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import glob 5 | import os 6 | 7 | # see: http://goo.gl/kTQMs 8 | SYMBOLS = { 9 | 'customary' : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'), 10 | 'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'iotta'), 11 | 'iec' : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'), 12 | 'iec_ext' : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi'), 13 | } 14 | 15 | def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'): 16 | """ 17 | Convert n bytes into a human readable string based on format. 
18 | symbols can be either "customary", "customary_ext", "iec" or "iec_ext", 19 | see: http://goo.gl/kTQMs 20 | 21 | >>> bytes2human(0) 22 | '0.0 B' 23 | >>> bytes2human(0.9) 24 | '0.0 B' 25 | >>> bytes2human(1) 26 | '1.0 B' 27 | >>> bytes2human(1.9) 28 | '1.0 B' 29 | >>> bytes2human(1024) 30 | '1.0 K' 31 | >>> bytes2human(1048576) 32 | '1.0 M' 33 | >>> bytes2human(1099511627776127398123789121) 34 | '909.5 Y' 35 | 36 | >>> bytes2human(9856, symbols="customary") 37 | '9.6 K' 38 | >>> bytes2human(9856, symbols="customary_ext") 39 | '9.6 kilo' 40 | >>> bytes2human(9856, symbols="iec") 41 | '9.6 Ki' 42 | >>> bytes2human(9856, symbols="iec_ext") 43 | '9.6 kibi' 44 | 45 | >>> bytes2human(10000, "%(value).1f %(symbol)s/sec") 46 | '9.8 K/sec' 47 | 48 | >>> # precision can be adjusted by playing with %f operator 49 | >>> bytes2human(10000, format="%(value).5f %(symbol)s") 50 | '9.76562 K' 51 | """ 52 | n = int(n) 53 | if n < 0: 54 | raise ValueError("n < 0") 55 | symbols = SYMBOLS[symbols] 56 | prefix = {} 57 | for i, s in enumerate(symbols[1:]): 58 | prefix[s] = 1 << (i+1)*10 59 | for symbol in reversed(symbols[1:]): 60 | if n >= prefix[symbol]: 61 | value = float(n) / prefix[symbol] 62 | return format % locals() 63 | return format % dict(symbol=symbols[0], value=n) 64 | 65 | def human2bytes(s): 66 | """ 67 | Attempts to guess the string format based on default symbols 68 | set and return the corresponding bytes as an integer. 69 | When unable to recognize the format ValueError is raised. 70 | 71 | >>> human2bytes('0 B') 72 | 0 73 | >>> human2bytes('1 K') 74 | 1024 75 | >>> human2bytes('1 M') 76 | 1048576 77 | >>> human2bytes('1 Gi') 78 | 1073741824 79 | >>> human2bytes('1 tera') 80 | 1099511627776 81 | 82 | >>> human2bytes('0.5kilo') 83 | 512 84 | >>> human2bytes('0.1 byte') 85 | 0 86 | >>> human2bytes('1 k') # k is an alias for K 87 | 1024 88 | >>> human2bytes('12 foo') 89 | Traceback (most recent call last): 90 | ... 
91 | ValueError: can't interpret '12 foo' 92 | """ 93 | init = s 94 | num = "" 95 | while s and s[0:1].isdigit() or s[0:1] == '.': 96 | num += s[0] 97 | s = s[1:] 98 | num = float(num) 99 | letter = s.strip() 100 | for name, sset in SYMBOLS.items(): 101 | if letter in sset: 102 | break 103 | else: 104 | if letter == 'k': 105 | # treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs 106 | sset = SYMBOLS['customary'] 107 | letter = letter.upper() 108 | else: 109 | raise ValueError("can't interpret %r" % init) 110 | prefix = {sset[0]:1} 111 | for i, s in enumerate(sset[1:]): 112 | prefix[s] = 1 << (i+1)*10 113 | return int(num * prefix[letter]) 114 | 115 | 116 | def main(): 117 | FASTQ_SIZE = human2bytes(sys.argv[2]) 118 | samples = {} 119 | for i in glob.glob(sys.argv[1].rstrip("/") + "/*.1.fq.gz"): 120 | if os.path.getsize(i) >= FASTQ_SIZE: 121 | sample_id = os.path.basename(i).split(".")[0] 122 | samples[sample_id] = [os.path.abspath(i), os.path.abspath(i.replace("1.fq.gz", "2.fq.gz"))] 123 | 124 | print("id\tfq1\tfq2") 125 | for i in samples: 126 | print("%s\t%s\t%s" % (i, samples[i][0], samples[i][1])) 127 | 128 | 129 | if __name__ == '__main__': 130 | main() 131 | -------------------------------------------------------------------------------- /metapi/profiles/lsf/memory_units.py: -------------------------------------------------------------------------------- 1 | import re 2 | from enum import Enum 3 | from typing import Union 4 | from collections import namedtuple 5 | 6 | 7 | class InvalidSuffix(Exception): 8 | pass 9 | 10 | 11 | class InvalidPower(Exception): 12 | pass 13 | 14 | 15 | class InvalidMemoryString(Exception): 16 | pass 17 | 18 | 19 | Scale = namedtuple("Scale", ["power", "metric_suffix"]) 20 | 21 | 22 | SCALE_MAP = { 23 | "B": Scale(0, "B"), 24 | "K": Scale(1, "KB"), 25 | "M": Scale(2, "MB"), 26 | "G": Scale(3, "GB"), 27 | "T": Scale(4, "TB"), 28 | "P": Scale(5, "PB"), 29 | "E": Scale(6, "EB"), 30 | "Z": Scale(7, "ZB"), 31 | } 32 | 33 | 34 | class Unit(Enum): 35 | BYTES = SCALE_MAP["B"] 36 | KILO = SCALE_MAP["K"] 37 | MEGA = SCALE_MAP["M"] 38 | GIGA = SCALE_MAP["G"] 39 | TERA = SCALE_MAP["T"] 40 | PETA = SCALE_MAP["P"] 41 | EXA = SCALE_MAP["E"] 42 | ZETTA = SCALE_MAP["Z"] 43 | 44 | @staticmethod 45 | def from_suffix(suffix: str) -> "Unit": 46 | first_letter = suffix[0].upper() 47 | if first_letter not in SCALE_MAP: 48 | valid_suffixes = ",".join( 49 | scale.metric_suffix for scale in SCALE_MAP.values() 50 | ) 51 | raise InvalidSuffix( 52 | "{suffix}. Valid suffixes are: {valid_suffixes}".format( 53 | suffix=suffix, valid_suffixes=valid_suffixes 54 | ) 55 | ) 56 | return Unit(SCALE_MAP[first_letter]) 57 | 58 | @staticmethod 59 | def from_power(power: int) -> "Unit": 60 | valid_powers = [] 61 | for scale in SCALE_MAP.values(): 62 | if scale.power == power: 63 | return Unit(scale) 64 | valid_powers.append(scale.power) 65 | 66 | raise InvalidPower( 67 | "{power}. 
Valid powers are: {valid}".format( 68 | power=power, valid=",".join(str(p) for p in valid_powers) 69 | ) 70 | ) 71 | 72 | @property 73 | def power(self) -> int: 74 | return self.value.power 75 | 76 | @property 77 | def suffix(self) -> str: 78 | return self.value.metric_suffix 79 | 80 | 81 | Number = Union[int, float] 82 | 83 | 84 | class Memory: 85 | def __init__(self, value: Number = 1, unit: Unit = Unit.BYTES): 86 | self.value = value 87 | self.unit = unit 88 | self._decimal_scaling_factor = 1000 89 | self._binary_scaling_factor = 1024 90 | 91 | def __eq__(self, other: "Memory") -> bool: 92 | return self.bytes() == other.bytes() 93 | 94 | def __repr__(self) -> str: 95 | val = ( 96 | int(self.value) 97 | if isinstance(self.value, int) or self.value.is_integer() 98 | else self.value 99 | ) 100 | return "{val}{sfx}".format(val=val, sfx=self.suffix) 101 | 102 | @property 103 | def power(self) -> int: 104 | return self.unit.power 105 | 106 | @property 107 | def suffix(self) -> str: 108 | return self.unit.suffix 109 | 110 | def _scaling_factor(self, decimal: bool = True) -> int: 111 | return self._decimal_scaling_factor if decimal else self._binary_scaling_factor 112 | 113 | def bytes(self, decimal_multiples: bool = True) -> float: 114 | scaling_factor = self._scaling_factor(decimal_multiples) 115 | return float(self.value * (scaling_factor ** self.power)) 116 | 117 | def to(self, unit: Unit, decimal_multiples: bool = True) -> "Memory": 118 | scaling_factor = self._scaling_factor(decimal_multiples) ** unit.power 119 | size = self.bytes(decimal_multiples) 120 | size /= scaling_factor 121 | 122 | return Memory(size, unit) 123 | 124 | @staticmethod 125 | def from_str(s: str) -> "Memory": 126 | valid_suffixes = "".join(scale.metric_suffix for scale in SCALE_MAP.values()) 127 | regex = re.compile( 128 | r"^(?P<size>[0-9]*\.?[0-9]+)\s*(?P<sfx>[{}]B?)?$".format(valid_suffixes), 129 | re.IGNORECASE, 130 | ) 131 | match = regex.search(s) 132 | 133 | if not match: 134 | raise InvalidMemoryString("{s} is an invalid memory string.".format(s=s)) 135 | 136 | size = float(match.group("size")) 137 | suffix = match.group("sfx") or "B" 138 | unit = Unit.from_suffix(suffix) 139 | 140 | return Memory(size, unit) 141 | -------------------------------------------------------------------------------- /metapi/rules/binning_report.smk: -------------------------------------------------------------------------------- 1 | if len(BINNERS_CHECKM) != 0: 2 | rule binning_report: 3 | input: 4 | lambda wildcards: get_binning_done(wildcards, [wildcards.binner_checkm]) 5 | output: 6 | directory( 7 | os.path.join( 8 | config["output"]["binning"], 9 | "report/{assembler}/{binner_checkm}/{binning_group}.{assembly_group}")) 10 | params: 11 | binning_group = "{binning_group}", 12 | assembly_group = "{assembly_group}", 13 | assembler = "{assembler}", 14 | binner = "{binner_checkm}" 15 | priority: 16 | 35 17 | run: 18 | import glob 19 | 20 | shell('''rm -rf {output}''') 21 | shell('''mkdir -p {output}''') 22 | 23 | bin_list = glob.glob(os.path.dirname(input[0]) + "/*.fa.gz") 24 | header_list = [ 25 | "binning_group", "assembly_group", "bin_id", "bin_file", "assembler", "binner", 26 | "chr", "length", "#A", "#C", "#G", "#T", 27 | "#2", "#3", "#4", "#CpG", "#tv", "#ts", "#CpG-ts"] 28 | header_name = "\\t".join(header_list) 29 | 30 | for bin_fa in bin_list: 31 | bin_id = os.path.basename(os.path.splitext(os.path.splitext(bin_fa)[0])[0]) 32 | bin_file = os.path.abspath(bin_fa) 33 | header_content = "\\t".join([params.binning_group, 
params.assembly_group, bin_id, bin_file, params.assembler, params.binner]) 34 | stats_file = os.path.join(output[0], bin_id + ".seqtk.comp.tsv.gz") 35 | 36 | shell( 37 | ''' 38 | seqtk comp %s | \ 39 | awk \ 40 | 'BEGIN \ 41 | {{print "%s"}}; \ 42 | {{print "%s" "\t" $0}}' | \ 43 | gzip -c > %s 44 | ''' % (bin_fa, header_name, header_content, stats_file)) 45 | 46 | 47 | rule binning_report_merge: 48 | input: 49 | expand(os.path.join( 50 | config["output"]["binning"], 51 | "report/{{assembler}}/{{binner_checkm}}/{binning_group}.{assembly_group}"), 52 | zip, 53 | binning_group=ASSEMBLY_GROUP["binning_group"], 54 | assembly_group=ASSEMBLY_GROUP["assembly_group"]) 55 | output: 56 | summary = os.path.join( 57 | config["output"]["binning"], 58 | "report/assembly_stats_{assembler}.{binner_checkm}.tsv.gz") 59 | params: 60 | min_length = config["params"]["assembly"]["report"]["min_length"], 61 | len_ranges = config["params"]["assembly"]["report"]["len_ranges"] 62 | threads: 63 | config["params"]["binning"]["threads"] 64 | run: 65 | import glob 66 | comp_list = [] 67 | for i in input: 68 | comp_list += glob.glob(i + "/*.seqtk.comp.tsv.gz") 69 | 70 | if len(comp_list) != 0: 71 | metapi.assembler_init( 72 | params.len_ranges, 73 | ["binning_group", "assembly_group", "bin_id", "bin_file", "assembler", "binner"]) 74 | comp_list_ = [(j, params.min_length) for j in comp_list] 75 | metapi.merge( 76 | comp_list_, metapi.parse_assembly, 77 | threads, output=output.summary) 78 | else: 79 | shell('''touch {output.summary}''') 80 | 81 | 82 | rule binning_report_all: 83 | input: 84 | expand(os.path.join( 85 | config["output"]["binning"], 86 | "report/assembly_stats_{assembler}.{binner_checkm}.tsv.gz"), 87 | assembler=ASSEMBLERS, 88 | binner_checkm=BINNERS_CHECKM) 89 | 90 | else: 91 | rule binning_report_all: 92 | input: 93 | 94 | 95 | rule binning_all: 96 | input: 97 | rules.binning_metabat2_all.input, 98 | rules.binning_maxbin2_all.input, 99 | rules.binning_concoct_all.input, 100 | rules.binning_graphbin2_all.input, 101 | rules.binning_vamb_all.input, 102 | rules.binning_semibin_all.input, 103 | rules.binning_dastools_all.input, 104 | rules.binning_report_all.input 105 | 106 | 107 | localrules: 108 | binning_report_all, 109 | binning_all -------------------------------------------------------------------------------- /scripts/post_assembly_binning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | 5 | import pandas as pd 6 | 7 | 8 | def codegen(samples_tsv, output_dir): 9 | samples = pd.read_csv(samples_tsv, sep='\t').set_index("bin_id", drop=False) 10 | 11 | os.makedirs(output_dir, exist_ok=True) 12 | 13 | index_dir = os.path.join(output_dir, "00.index") 14 | os.makedirs(index_dir, exist_ok=True) 15 | 16 | mapping_dir = os.path.join(output_dir, "01.mapping") 17 | os.makedirs(mapping_dir, exist_ok=True) 18 | 19 | asm_dir = os.path.join(output_dir, "02.assembly") 20 | os.makedirs(asm_dir, exist_ok=True) 21 | 22 | checkm_asm_dir = os.path.join(output_dir, "03.checkm_asm") 23 | os.makedirs(checkm_asm_dir, exist_ok=True) 24 | checkm_asm_input_dir = os.path.join(checkm_asm_dir, "input") 25 | os.makedirs(checkm_asm_input_dir, exist_ok=True) 26 | checkm_asm_output_dir = os.path.join(checkm_asm_dir, "output") 27 | os.makedirs(checkm_asm_output_dir, exist_ok=True) 28 | 29 | with open(os.path.join(output_dir, "step1.index.sh"), 'w') as oh1, \ 30 | open(os.path.join(output_dir, "step2.mapping.sh"), 'w') as oh2, \ 31 | 
open(os.path.join(output_dir, "step3.assembly_spades.sh"), 'w') as oh3, \ 32 | open(os.path.join(output_dir, "step3.assembly_shovill_spades.sh"), 'w') as oh4, \ 33 | open(os.path.join(output_dir, "step3.assembly_shovill_megahit.sh"), 'w') as oh5, \ 34 | open(os.path.join(output_dir, "step3.assembly_shovill_velvet.sh"), 'w') as oh6, \ 35 | open(os.path.join(output_dir, "step3.assembly_shovill_skesa.sh"), 'w') as oh7, \ 36 | open(os.path.join(output_dir, "step4.links_asm_fa.sh"), 'w') as oh8, \ 37 | open(os.path.join(output_dir, "step5.checkm_lineage_wf.sh"), 'w') as oh9: 38 | 39 | for bin_id in samples.index: 40 | # index 41 | prefix = os.path.join(index_dir, bin_id) 42 | cmd = "bwa index %s -p %s\n" % (samples.loc[bin_id, "bins_fna_path"], prefix) 43 | oh1.write(cmd) 44 | 45 | # mapping and extract reads 46 | r1 = os.path.join(mapping_dir, "%s.r1.fq.gz" % bin_id) 47 | r2 = os.path.join(mapping_dir, "%s.r2.fq.gz" % bin_id) 48 | stat = os.path.join(mapping_dir, "%s-flagstat.txt" % bin_id) 49 | cmd = "bwa mem -t 8 %s %s %s | tee >(samtools flagstat -@8 - > %s) | samtools fastq -@8 -F 12 -n -1 %s -2 %s -\n" % ( 50 | prefix, samples.loc[bin_id, "fq1"], samples.loc[bin_id, "fq2"], stat, r1, r2) 51 | oh2.write(cmd) 52 | 53 | # assembly 54 | ## spades 55 | bins_asm_dir = os.path.join(asm_dir, bin_id + ".spades_out") 56 | cmd = "spades.py -1 %s -2 %s -k 21,29,39,59,79,99 --threads 8 -o %s\n" % (r1, r2, bins_asm_dir) 57 | oh3.write(cmd) 58 | asm_fa = os.path.join(bins_asm_dir, "scaffolds.fasta") 59 | asm_fa_ = os.path.join(checkm_asm_input_dir, bin_id + ".spades.fa") 60 | oh8.write("ln -s %s %s\n" % (asm_fa, asm_fa_)) 61 | 62 | ## shovill 63 | ### spades or megahit or velvet or skesa 64 | for assembler, file_handle in zip(["spades", "megahit", "velvet", "skesa"], [oh4, oh5, oh6, oh7]): 65 | bins_asm_dir = os.path.join(asm_dir, bin_id + ".shovill_%s_out" % assembler) 66 | cmd = "shovill --cpus 8 --ram 20 --keepfiles --assembler %s --outdir %s --R1 %s --R2 %s\n" % (assembler, bins_asm_dir, r1, r2) 67 | file_handle.write(cmd) 68 | asm_fa = os.path.join(bins_asm_dir, "contigs.fa") 69 | asm_fa_ = os.path.join(checkm_asm_input_dir, bin_id + "." 
+ assembler + ".fa") 70 | oh8.write("ln -s %s %s\n" % (asm_fa, asm_fa_)) 71 | 72 | checkm_asm_out = os.path.join(checkm_asm_dir, "checkm_asm.txt") 73 | checkm_asm_log = os.path.join(checkm_asm_dir, "checkm_asm.log") 74 | checkm_asm_cmd = "checkm lineage_wf -f %s -t 8 --pplacer_threads 8 -x fa %s/ %s/ 2>%s\n" % \ 75 | (checkm_asm_out, checkm_asm_input_dir, checkm_asm_output_dir, checkm_asm_log) 76 | oh9.write(checkm_asm_cmd) 77 | 78 | 79 | def main(): 80 | parser = argparse.ArgumentParser(description='reassembly reads') 81 | parser.add_argument('-s', '--samples', type=str, help='metagenomics bins and paired reads list') 82 | parser.add_argument('-o', '--outdir', type=str, help='output directory') 83 | args = parser.parse_args() 84 | codegen(args.samples, args.outdir) 85 | 86 | 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /scripts/estimate_T2T_data_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import requests 5 | import xmltodict 6 | import argparse 7 | from rich import print 8 | from rich.console import Console 9 | 10 | # https://github.com/Textualize/rich/issues/67 11 | _console = Console() 12 | 13 | class RichArgumentParser(argparse.ArgumentParser): 14 | def _print_message(self, message, file=None): 15 | _console.print(message) 16 | 17 | def add_argument_group(self, *args, **kwargs): 18 | group = super().add_argument_group(*args, **kwargs) 19 | group.title = f"[cyan]{group.title.title()}[/cyan]" 20 | return group 21 | 22 | 23 | class RichRawTextHelpFormatter(argparse.RawTextHelpFormatter): 24 | def _split_lines(self, text, width): 25 | return [f"[yellow]{line}[/yellow]" for line in text.splitlines()] 26 | 27 | 28 | # see: http://goo.gl/kTQMs 29 | SYMBOLS = { 30 | 'customary' : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'), 31 | 'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'iotta'), 32 | 'iec' : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'), 33 | 'iec_ext' : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi'), 34 | } 35 | 36 | 37 | def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'): 38 | n = int(n) 39 | if n < 0: 40 | raise ValueError("n < 0") 41 | symbols = SYMBOLS[symbols] 42 | prefix = {} 43 | for i, s in enumerate(symbols[1:]): 44 | prefix[s] = 1 << (i+1)*10 45 | for symbol in reversed(symbols[1:]): 46 | if n >= prefix[symbol]: 47 | value = float(n) / prefix[symbol] 48 | return format % locals() 49 | return format % dict(symbol=symbols[0], value=n) 50 | 51 | 52 | def human2bytes(s): 53 | init = s 54 | num = "" 55 | while s and s[0:1].isdigit() or s[0:1] == '.': 56 | num += s[0] 57 | s = s[1:] 58 | if num != "": 59 | num = float(num) 60 | else: 61 | raise ValueError(f"can't covert {s} to float") 62 | letter = s.strip() 63 | #print(letter) 64 | for name, sset in SYMBOLS.items(): 65 | if letter in sset: 66 | break 67 | else: 68 | if (letter == 'k') or (letter == "m") or (letter == "g"): 69 | # treat 'k' as an alias for 'K' as per: http://goo.gl/kTQMs 70 | sset = SYMBOLS['customary'] 71 | letter = letter.upper() 72 | else: 73 | raise ValueError("can't interpret %r" % init) 74 | prefix = {sset[0]:1} 75 | for i, s in enumerate(sset[1:]): 76 | prefix[s] = 1 << (i+1)*10 77 | return int(num * prefix[letter]) 78 | 79 | 80 | def generate_xml(http_link): 81 | print(f'''Parsing: {http_link}\n''') 82 | r = 
requests.get(http_link) 83 | if "xml" in r.headers['content-type']: 84 | print(f'''Success: got XML document from the link: {http_link}\n''') 85 | return r.text 86 | else: 87 | print(f'''Error: can't get XML document from the link: {http_link}\nExiting\n''') 88 | return None 89 | 90 | 91 | def estimate_size(xml_str, output=None): 92 | xml_dict = xmltodict.parse(xml_str) 93 | if "ListBucketResult" in xml_dict: 94 | file_info_df = pd.DataFrame(xml_dict["ListBucketResult"]["Contents"])\ 95 | .astype({"Size": int})\ 96 | .sort_values(["Size", "Key"]) 97 | print(file_info_df) 98 | 99 | if output is not None: 100 | file_info_df.to_csv(output, sep="\t", index=False) 101 | 102 | total_size = sum(file_info_df["Size"]) 103 | total_size_human = bytes2human(total_size) 104 | print(f'''\nTotal file size is: {total_size}''') 105 | print(f'''\nTotal file size is: {total_size_human}''') 106 | else: 107 | print("\nError: failed to parse the XML document\nExiting") 108 | 109 | 110 | def main(): 111 | parser = RichArgumentParser("Estimate T2T data size") 112 | parser.add_argument("--http-link", dest="http_link", 113 | default="https://s3-us-west-2.amazonaws.com/human-pangenomics?/delimiter=/&prefix=T2T", 114 | help="T2T file/directory S3 link, default:\nhttps://s3-us-west-2.amazonaws.com/human-pangenomics?/delimiter=/&prefix=T2T") 115 | parser.add_argument("--output", dest="output", default=None, 116 | help="a tsv file containing the file information, default: None") 117 | args = parser.parse_args() 118 | 119 | xml_str = generate_xml(args.http_link) 120 | estimate_size(xml_str, args.output) 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /scripts/mapping_statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import csv 4 | import fileinput 5 | import os 6 | import re 7 | from decimal import * 8 | 9 | 10 | """ 11 | How to assess the quality of metagenomics assembly 12 | https://www.biostars.org/p/128629/#128639 13 | 14 | Brian Bushnell said: 15 | calculate the percentage of reads that map back to the assembly 16 | if only 50% of your reads map to the assembly, it is not very complete 17 | but if 95% of your reads map to the assembly, then even if it is 18 | somewhat fragmented, that's probably very good 19 | 20 | It might also help to look at the percentage of properly paired reads to 21 | detect any chimeras, something that seems especially relevant in a 22 | metagenome assembly 23 | 24 | the most useful tool is QUAST, a quality assessment tool for 25 | genome assemblies; this script calculates the mapping rate from samtools flagstat output 26 | 27 | http://genomebio.org/alignment-stats-bwa/ 28 | getting alignment stats out of bwa 29 | 30 | bwa mem -t 6 ref read.1.fq read.2.fq \ 31 | | samtools view -@6 -Sbh - \ 32 | | tee >(samtools flagstat - > stats.out) > aln.bam 33 | 34 | http://www.pnas.org/content/pnas/113/42/11901.full.pdf 35 | Deep sequencing of 10,000 human genomes (Amalio Telenti and J. Craig Venter) 36 | sequencing depth = read length × number of mapped reads / reference sequence length 37 | 38 | metabat_coverage 39 | concoct_coverage 40 | checkm_coverage 41 | """ 42 | 43 | 44 | def mapping_rate(flagstats, out_file, method): 45 | """ 46 | get alignment rate from sorted bam file 47 | samtools flagstat --threads 8 sample.sort.bam 48 | """ 49 | headers = [ 50 | 'sample_id', 'total_num', 'read_1_num', 'read_2_num', 'mapping_type', 51 | 'mapped_num', 'mapped_rate', 'paired_num', 'paired_rate', 52 | 'singletons_num', 
'singletons_rate', 'mate_mapped_num', 53 | 'mate_mapped_num_mapQge5' 54 | ] 55 | mapping_info = [] 56 | getcontext().prec = 8 57 | 58 | # with open(flagstat_list, 'r') as list_handle: 59 | if method == 1: 60 | list_handle = open(flagstats, 'r') 61 | if method == 2: 62 | list_handle = flagstats 63 | 64 | for flagstat_file in list_handle: 65 | info = {} 66 | info['sample_id'] = os.path.basename( 67 | flagstat_file.strip()).split('.')[0] 68 | stat_list = open(flagstat_file.strip(), 'r').readlines() 69 | info['total_num'] = stat_list[0].split(' ')[0] 70 | info['read_1_num'] = stat_list[6].split(' ')[0] 71 | info['read_2_num'] = stat_list[7].split(' ')[0] 72 | 73 | mapped = re.split(r'\(|\s+', stat_list[4]) 74 | info['mapped_num'] = mapped[0] 75 | info['mapped_rate'] = Decimal(mapped[5].rstrip('%')) / Decimal(100) 76 | 77 | paired = re.split(r'\(|\s+', stat_list[8]) 78 | info['paired_num'] = paired[0] 79 | paired_rate = paired[6].rstrip('%') 80 | if paired_rate != "N/A": 81 | info['paired_rate'] = Decimal(paired_rate) / Decimal(100) 82 | info['mapping_type'] = "paired-end" 83 | else: 84 | info['paired_rate'] = paired_rate 85 | info["mapping_type"] = "single-end" 86 | 87 | singletons = re.split(r'\(|\s+', stat_list[-3]) 88 | info['singletons_num'] = singletons[0] 89 | singletons_rate = singletons[5].rstrip('%') 90 | if singletons_rate != "N/A": 91 | info['singletons_rate'] = Decimal(singletons_rate) / Decimal(100) 92 | else: 93 | info['singletons_rate'] = singletons_rate 94 | 95 | info['mate_mapped_num'] = re.split(r'\(|\s+', stat_list[-2])[0] 96 | info['mate_mapped_num_mapQge5'] = re.split(r'\(|\s+', stat_list[-1])[0] 97 | mapping_info.append(info) 98 | 99 | with open(out_file, 'w') as out_handle: 100 | f_tsv = csv.DictWriter(out_handle, headers, delimiter='\t') 101 | f_tsv.writeheader() 102 | f_tsv.writerows(mapping_info) 103 | 104 | 105 | def main(): 106 | """main funciton""" 107 | parser = argparse.ArgumentParser( 108 | description='compute alignment rate based bam file') 109 | parser.add_argument( 110 | '-statlist', default=None, type=str, help='sorted bam file list') 111 | parser.add_argument( 112 | '-statfiles', default=None, nargs='*', help='sorted bam file') 113 | parser.add_argument( 114 | '-outfile', type=str, help='output alignment rate file') 115 | args = parser.parse_args() 116 | if args.statlist: 117 | method = 1 118 | mapping_rate(args.statlist, args.outfile, method) 119 | if args.statfiles: 120 | method = 2 121 | mapping_rate(args.statfiles, args.outfile, method) 122 | 123 | 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /metapi/checkmer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | import concurrent.futures 6 | import subprocess 7 | import pandas as pd 8 | import numpy as np 9 | from natsort import index_natsorted 10 | 11 | 12 | def checkm_prepare(gene_table, batch_num, mags_dir): 13 | os.makedirs(mags_dir, exist_ok=True) 14 | 15 | table_df = pd.read_csv(gene_table, sep="\t") 16 | table_df = table_df.sort_values( 17 | by="bin_id", 18 | key=lambda x: np.argsort( 19 | index_natsorted(table_df["bin_id"]))).reset_index(drop=True) 20 | 21 | batchid = -1 22 | if len(table_df) > 0: 23 | for batch in range(0, len(table_df), batch_num): 24 | batchid += 1 25 | table_split = table_df.iloc[batch:batch+batch_num, ] 26 | table_split.to_csv( 27 | os.path.join(mags_dir, f"mags_input.{batchid}.tsv"), 28 
| sep="\t", index=False, header=None) 29 | else: 30 | subprocess.run(f'''touch {os.path.join(mags_dir, "mags_input.0.tsv")}''', shell=True) 31 | 32 | 33 | def MIMAG_quality_level(row): 34 | """ 35 | https://doi.org/10.1038/nbt.3893 36 | """ 37 | if (row["completeness"] > 90.0) and (row["contamination"] < 5.0): 38 | return "high_quality" 39 | elif (row["completeness"] > 50.0) and (row["contamination"] < 10.0): 40 | return "medium_quality" 41 | else: 42 | return "low_quality" 43 | 44 | 45 | def SGB_quality_level(row): 46 | """ 47 | https://doi.org/10.1016/j.cell.2019.01.001 48 | """ 49 | if ( 50 | (row["strain_heterogeneity"] < 0.5) 51 | and (row["completeness"] > 90.0) 52 | and (row["contamination"] < 5.0) 53 | ): 54 | return "high_quality" 55 | elif (row["completeness"] > 50.0) and (row["contamination"] < 5.0): 56 | return "medium_quality" 57 | else: 58 | return "low_quality" 59 | 60 | 61 | def quality_score(row): 62 | """ 63 | https://doi.org/10.1038/s41586-019-0965-1 64 | """ 65 | return row["completeness"] - 5 * row["contamination"] 66 | 67 | 68 | def parse_checkm_table(checkm_table): 69 | if os.path.getsize(checkm_table) > 0: 70 | checkm_df = pd.read_csv(checkm_table, sep="\t") 71 | return checkm_df 72 | else: 73 | return None 74 | 75 | 76 | def checkm_reporter(checkm_list, output, threads): 77 | df_list = [] 78 | with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor: 79 | for df in executor.map(parse_checkm_table, checkm_list): 80 | if df is not None: 81 | df_list.append(df) 82 | 83 | df_ = pd.DataFrame( 84 | columns=[ 85 | "bin_id", 86 | "marker_lineage", 87 | "genomes", 88 | "markers", 89 | "marker_sets", 90 | "completeness", 91 | "contamination", 92 | "strain_heterogeneity", 93 | "MIMAG_quality_level", 94 | "SGB_quality_level", 95 | "quality_score"]) 96 | 97 | if len(df_list) >= 1: 98 | df_ = pd.concat(df_list).rename( 99 | columns={ 100 | "Bin Id": "bin_id", 101 | "Marker lineage": "marker_lineage", 102 | "# genomes": "genomes", 103 | "# markers": "markers", 104 | "# marker sets": "marker_sets", 105 | "Completeness": "completeness", 106 | "Contamination": "contamination", 107 | "Strain heterogeneity": "strain_heterogeneity", 108 | } 109 | ) 110 | 111 | if not df_.empty: 112 | df_["MIMAG_quality_level"] = df_.apply( 113 | lambda x: MIMAG_quality_level(x), axis=1) 114 | df_["SGB_quality_level"] = df_.apply( 115 | lambda x: SGB_quality_level(x), axis=1) 116 | df_["quality_score"] = df_.apply(lambda x: quality_score(x), axis=1) 117 | 118 | if output is not None: 119 | df_.to_csv(output, sep="\t", index=False) 120 | 121 | return df_ 122 | 123 | 124 | def main(): 125 | parser = argparse.ArgumentParser("CheckM reporter") 126 | parser.add_argument("--checkm_list", type=str, help="checkm out list") 127 | parser.add_argument("--output", type=str, required=True, 128 | help="checkm output file") 129 | parser.add_argument( 130 | "--threads", type=int, default=8, help="threads used on combine CheckM output" 131 | ) 132 | args = parser.parse_args() 133 | 134 | checkm_list = [l.strip() for l in open(args.checkm_list, "r")] 135 | checkm_reporter(checkm_list, args.output, args.threads) 136 | 137 | 138 | if __name__ == "__main__": 139 | main() 140 | -------------------------------------------------------------------------------- /scripts/t2d_abundance_merger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import concurrent.futures 5 | import os 6 | import sys 7 | import argparse 8 | 
9 | def global_init(index_metadata): 10 | global INDEX_METADATA__ 11 | INDEX_METADATA__ = pd.read_csv(index_metadata, sep='\t') 12 | 13 | 14 | def get_mgs_id(row): 15 | return "_".join(row["ID"].split("_")[0:-1]) 16 | 17 | 18 | def get_abun_df_hsx(abun_file): 19 | sample_id = os.path.basename(abun_file).split(".")[0] 20 | 21 | try: 22 | if os.path.exists(abun_file): 23 | abun = pd.read_csv(abun_file, sep='\t') 24 | else: 25 | print("%s does not exist" % abun_file) 26 | return None, None 27 | except pd.errors.EmptyDataError: 28 | print("%s is empty" % abun_file) 29 | return None, None 30 | 31 | abun["mgs_id"] = abun.apply(get_mgs_id, axis=1) 32 | 33 | count_df = abun.loc[:, ["mgs_id", "reads_pairs"]]\ 34 | .groupby("mgs_id")\ 35 | .agg({"reads_pairs": 'sum'})\ 36 | .rename(columns={"reads_pairs": sample_id}) 37 | abun_df = abun.loc[:, ["mgs_id", "gene_abundance"]]\ 38 | .groupby("mgs_id")\ 39 | .agg({"gene_abundance": 'sum'})\ 40 | .rename(columns={"gene_abundance": sample_id}) 41 | return count_df, abun_df 42 | 43 | 44 | def get_abun_df_jgi(depth_file): 45 | sample_id = os.path.basename(depth_file).split(".")[0] 46 | 47 | try: 48 | if os.path.exists(depth_file): 49 | depth = pd.read_csv(depth_file, sep='\t') 50 | else: 51 | print("%s does not exist" % depth_file) 52 | return None, None 53 | except pd.errors.EmptyDataError: 54 | print("%s is empty" % depth_file) 55 | return None, None 56 | 57 | depth = depth.rename(columns={"contigName": "contig_name"})\ 58 | .merge(INDEX_METADATA__)\ 59 | .groupby("mgs_id")\ 60 | .agg({"totalAvgDepth": "mean"}) 61 | depth[sample_id] = depth["totalAvgDepth"] / sum(depth["totalAvgDepth"]) 62 | depth_df = depth.loc[:, ["totalAvgDepth"]].rename(columns={"totalAvgDepth": sample_id}) 63 | abun_df = depth.loc[:, [sample_id]] 64 | return depth_df, abun_df 65 | 66 | 67 | def get_all_abun_df(abun_files, workers, func): 68 | count_list = [] 69 | abun_list = [] 70 | with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: 71 | for count_df, abun_df in executor.map(func, abun_files): 72 | if (count_df is not None) and (abun_df is not None): 73 | count_list.append(count_df) 74 | abun_list.append(abun_df) 75 | 76 | count_df_ = pd.concat(count_list, axis=1).reset_index() 77 | abun_df_ = pd.concat(abun_list, axis=1).reset_index() 78 | 79 | return count_df_, abun_df_ 80 | 81 | 82 | def main(): 83 | parser = argparse.ArgumentParser('merge per-sample abundance files into one profile') 84 | parser.add_argument( 85 | '-l', 86 | '--abundance_list', 87 | type=str, 88 | help='abundance list') 89 | parser.add_argument( 90 | '--method', 91 | default="hsx", 92 | choices=["hsx", "jgi"], 93 | help='compute method' 94 | ) 95 | parser.add_argument( 96 | '--database', 97 | default=None, 98 | help='contig and genome relationships' 99 | ) 100 | parser.add_argument( 101 | '--threads', 102 | default=8, 103 | type=int, 104 | help='threads' 105 | ) 106 | parser.add_argument( 107 | '--out_count_profile', 108 | type=str, 109 | help='output count profile') 110 | parser.add_argument( 111 | '--out_abundance_profile', 112 | type=str, 113 | help='output abundance profile') 114 | args = parser.parse_args() 115 | 116 | abun_files = pd.read_csv(args.abundance_list, names=["path"])\ 117 | .loc[:, "path"].values 118 | 119 | if args.method == "jgi" and args.database is None: 120 | print("please supply --database when parsing jgi depth files") 121 | sys.exit(1) 122 | 123 | if args.method == "hsx": 124 | count_df, abun_df = get_all_abun_df(abun_files, args.threads, get_abun_df_hsx) 125 | elif 
args.method == "jgi": 126 | global_init(args.database) 127 | count_df, abun_df = get_all_abun_df(abun_files, args.threads, get_abun_df_jgi) 128 | else: 129 | print("unsupport method: %s" % args.method) 130 | 131 | count_df.to_csv(args.out_count_profile, sep='\t', index=False) 132 | abun_df.to_csv(args.out_abundance_profile, sep='\t', index=False) 133 | 134 | 135 | if __name__ == '__main__': 136 | main() 137 | --------------------------------------------------------------------------------