├── .gitignore ├── COPYING ├── Makefile ├── README.md ├── TruSeq3-PE-2+omics.fa ├── bash-completion └── omics ├── docs ├── Makefile ├── _static │ └── css │ │ └── custom.css ├── assemble.txt ├── binning.txt ├── bins2fasta.txt ├── chop-contigs.txt ├── conf.py ├── container.txt ├── illumina-reads-processing.txt ├── index.txt ├── init.txt ├── mapping.txt ├── merge-coverage.txt ├── omics.txt ├── prep.txt ├── qc-check.txt ├── qc-sample.txt ├── qc.txt ├── run.txt ├── separate-interleaved.txt ├── template.txt └── unchop-contigs.txt ├── help2rst ├── lib ├── Makefile ├── liba.sh └── omics │ ├── __init__.py │ ├── __main__.py │ ├── _version.py │ ├── bin_coverage.py │ ├── bin_stats.py │ ├── bins2fasta.py │ ├── checkm.py │ ├── db │ ├── __init__.py │ ├── apps.py │ ├── manage.py │ ├── management │ │ ├── __init__.py │ │ └── commands │ │ │ └── __init__.py │ └── models.py │ ├── derep.py │ ├── fastq2fasta.py │ ├── init.py │ ├── interleave.py │ ├── prep.py │ ├── qc.py │ ├── qc_check.py │ ├── read_counts.py │ ├── shared.py │ ├── unchop_bins.py │ └── utils.py ├── localenv ├── modulefiles ├── flux.omics │ ├── 1 │ ├── 2 │ └── .version ├── install └── omics │ ├── 1 │ └── .version ├── phylosiftrc ├── scripts ├── COPYRIGHT.tetramer_freqs_esom ├── CRISPR_spacer_extractor ├── ESOM_binning_results_parser ├── Ebot.Output.Extract.Gi.Title.Rev3 ├── GI_info_XMLParser ├── Makefile ├── Metabat_to_anvio_parser ├── U2T ├── VizBin_parser ├── addFileName2header ├── addInfo2lrn ├── ani ├── antiSmash_summary ├── assemble.pl ├── assemblyModules ├── asv-map-update ├── aview ├── bamTools ├── basicHF ├── batchBlast ├── binTablesForIMG ├── bins2fasta ├── blast2citation ├── blastDensityPlot ├── calcN50 ├── changeClasses ├── changePGDBattribs ├── chop-contigs ├── chopper ├── clusterDensity ├── comics ├── consolidateJGIdata ├── contigMetadata ├── countInstances ├── coveragePerBin ├── coveragePerScaffold ├── createFastq ├── createNodes ├── createPhgDB ├── curateDB ├── dada2shared ├── derep+alias ├── derep_ClusterMap 
├── derep_getReadAbundance ├── dereplicate ├── do2folder ├── do2list ├── downsample ├── embl2picture ├── esomCodonMod ├── esomTrain ├── esomWrapper ├── expandGFF ├── extractContigReads ├── extractEuks ├── extractGenbankMetadata ├── extractGenomes ├── extractSeqs ├── extractSubSeq ├── extractTranslationsFromGbk ├── extract_Blast_Hits_Of_Interest ├── fileChopper ├── findStretchesOfNs ├── firefox_already_running ├── fixpod ├── fixpod2 ├── fixpod3 ├── fixpod4 ├── fixpod5 ├── fixpod6 ├── folderLevelSize ├── fragRec ├── gbk2fna ├── gcSkew ├── genomeCheck ├── genomicFluidity ├── getBwaMappedReadList ├── getClassFasta ├── getCol ├── getFamilyP ├── getFastaFromAccNos ├── getGFF ├── getGI ├── getGIAnnotation ├── getGISummary ├── getGeneClusters ├── getGiInfo ├── getLineage ├── getMasterList ├── getMyContigs ├── getRandomData ├── getSciNames ├── gff2fasta ├── gff2neo ├── gff2tbl ├── iClust ├── img_Bin_Classifier ├── inflate ├── interleave ├── itemize ├── kmerFreq ├── legacy_consolidateJGIdata ├── length+GC ├── limit2Length ├── mVelvetPipe_paired ├── mVelvetPipe_singles ├── makeAnnotationFile ├── map_project_names ├── mapper ├── mapper_getQueryList ├── match-dada2-mothur ├── matchQueryNames ├── measureCompleteness ├── merge-coverage ├── merge-covs ├── mockest ├── nameClassFiles ├── nsmpReport ├── oasesPaired_pipe ├── omics ├── omics-assemble ├── omics-binning ├── omics-container ├── omics-init ├── omics-mapping ├── omics-prep ├── omics-qc ├── omics-qc-check ├── omics-qc-sample ├── omics-run ├── parallel_antiSmash ├── parallel_getGenomesFromTaxa ├── parseBlastXML ├── parseFastq ├── parseTinySeqXML.xslt ├── patchBlastLineage ├── plot-blast-frag-cov ├── plot-coverage ├── plot-megahit-log ├── plot-shared-otu-counts ├── plot_alignment ├── postBlast ├── ppt_getGI ├── ppt_getXML ├── refseq-rna ├── removeBlastSubj ├── removeCommentLines ├── remove_space_from_filenames ├── renameHeaders ├── reverse_complement ├── rgi-coverage ├── rgi-setup ├── sangerSeqParser ├── separate-interleaved 
├── setup_metapathways ├── shared-filter-abundance ├── shared-get ├── shared-jaccard ├── shared-merge-otus ├── shared-set-accessions ├── shared-unique-prevalence ├── shared2fasta ├── silva-align ├── silva-db ├── silvaTaxonAppend ├── slideshow.xml ├── summarize_antiSmash ├── tally ├── tally-weave ├── tallyWrap ├── taxonDist ├── test_fragRec ├── tetramer_freqs_esom ├── tinySeq2fasta.xslt ├── tinySeq2table.xslt ├── toMultiGBK ├── toPhylipAndBack ├── top5 ├── track-mothur-counts ├── triage ├── twitterscript.xml ├── uClustHomology ├── unchop-contigs └── usageStats └── test └── run /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .*sw[op] 3 | *.pyc 4 | 5 | # make-generated files 6 | scripts/*.1 7 | geo-omics-scripts*.tar.gz 8 | geo-omics-scripts*/* 9 | docs/_build 10 | 11 | # contains hard-link to liba.sh, to allow running scripts in dev environment 12 | share/ 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Michigan Geomicrobiology Lab 2 | 3 | Welcome to the GitHub Repo for some general purpose NGS, Data analysis and 4 | mining scripts used in the lab. Some scripts implement our short-read QC, 5 | assembly, binning, etc. pipeline and depend on the presence of a number of 6 | third-party software. The rest are Bash scripts or in core 7 | [Perl](http://www.perl.org/ "Perl Home") or [Python](https://www.python.org/ 8 | "Python Home"). This means, if you have Perl or Python 3 installed, you won't 9 | need anything else to work with these scripts. 10 | 11 | Since these scripts are actively being used by the Lab, you can expect full 12 | support for any [issues](https://github.com/Geo-omics/scripts/issues "Report an 13 | issue"). Please do let us know if you find any bugs or easier/quicker/more 14 | elegant solutions. 
15 | 16 | 17 | ## Language and OS Dependencies 18 | 19 | The scripts should work with various flavors of Linux and other unix-like 20 | environments. Here is a list of easy-to-install languages that you'll need: 21 | 22 | * Perl version 5.10 + 23 | * Python version 3.5 + 24 | * R version 3 + 25 | 26 | ## Contact 27 | 28 | Please send questions or comments to . 29 | 30 | ## Principal Investigator 31 | 32 | [Gregory J. Dick](https://sites.lsa.umich.edu/geomicro/ "Geomicrobiology Lab Homepage"), gdick [AT] umich [DOT] edu 33 | 34 | 35 | ## License 36 | 37 | Geo-omics-scripts is free software: you can redistribute it and/or modify it 38 | under the terms of the GNU General Public License as published by the Free 39 | Software Foundation, either version 3 of the License, or (at your option) any 40 | later version. 41 | 42 | 43 | ## Disclaimer 44 | 45 | **Geo-omics scripts are distributed in the hope that they will be useful, but 46 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 47 | FITNESS FOR A PARTICULAR PURPOSE.** 48 | -------------------------------------------------------------------------------- /TruSeq3-PE-2+omics.fa: -------------------------------------------------------------------------------- 1 | >PrefixPE/1 2 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 3 | >PrefixPE/2 4 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 5 | >PE1 6 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT 7 | >PE1_rc 8 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA 9 | >PE2 10 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT 11 | >PE2_rc 12 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC 13 | >TruSeq_Adapter_Index_end 14 | CGTATGCCGTCTTCTGCTTG 15 | >TruSeq_Adapter_Index_end_rc 16 | CAAGCAGAAGACGGCATACG 17 | >Illumina_Paired_End_Sequencing_Primer_2 18 | CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT 19 | >Illumina_Paired_End_Sequencing_Primer_2_rc 20 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCG 21 | >Illumina_Paired_End_Adapter_2 22 | GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG 23 | >Illumina_RNA_PCR_Primer_rc 24 | 
TCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT 25 | -------------------------------------------------------------------------------- /bash-completion/omics: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Regents of The University of Michigan. 2 | 3 | # This file is part of geo-omics-scripts. 4 | 5 | # Geo-omics-scripts is free software: you can redistribute it and/or 6 | # modify it under the terms of the GNU General Public License as published 7 | # by the Free Software Foundation, either version 3 of the License, or (at 8 | # your option) any later version. 9 | 10 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 11 | # WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # General Public License for more details. 14 | 15 | # You should have received a copy of the GNU General Public License along 16 | # with Geo-omics-scripts. If not, see . 
# Programmable completion for the `omics` command.
#
# Runs the command line typed so far (COMP_WORDS) with OMICS_AUTO_COMPLETE set
# to the index of the word being completed (COMP_CWORD) and takes the
# whitespace-separated words printed on stdout as completion candidates.
# File name completion is added when the command fails, prints nothing, or
# prints the marker FILE_COMPLETION as its last word.
#
# Globals:
#   COMP_WORDS, COMP_CWORD     (read)    supplied by bash's completion machinery
#   OMICS_AUTO_COMPLETE_DEBUG  (read)    if set and non-empty, stderr of the
#                                        called command is not suppressed
#   COMPREPLY                  (written) resulting completion candidates
_omics_completion()
{
    # suppress stderr while running the omics executable unless we are debugging
    local hide_stderr=true
    if [[ -v OMICS_AUTO_COMPLETE_DEBUG && -n "${OMICS_AUTO_COMPLETE_DEBUG}" ]]; then
        hide_stderr=false
    fi

    # Keep the saved file descriptor number local so that it does not leak
    # into the user's interactive shell as a global variable.
    local saved_stderr
    if $hide_stderr; then
        exec {saved_stderr}>&2 2>/dev/null
    fi

    local -a reply=()
    local do_file_completion
    # The exit status of the assignment is that of the command substitution.
    # Word splitting of the output is intended here: the executable prints
    # candidates separated by whitespace.
    # shellcheck disable=SC2207
    if reply=(
        # call omics executable
        $(OMICS_AUTO_COMPLETE="$COMP_CWORD" "${COMP_WORDS[@]}")
    ); then
        # add file completion if requested, marker must be last
        if [[ ${#reply[@]} -eq 0 ]]; then
            do_file_completion=true
        elif [[ ${reply[-1]} == FILE_COMPLETION ]]; then
            # quoted so the subscript is never treated as a glob pattern
            unset 'reply[-1]'
            do_file_completion=true
        else
            # normal, non-empty reply
            do_file_completion=false
        fi
    else
        # fall back to file completion
        do_file_completion=true
    fi

    # restore stderr as needed (the trailing "-" closes the saved descriptor)
    if $hide_stderr; then
        exec 2>&${saved_stderr}-
    fi

    if $do_file_completion; then
        # Read one candidate per line instead of relying on word splitting so
        # that file names containing blanks or glob characters stay intact.
        local -a files=()
        mapfile -t files < <(compgen -f -- "${COMP_WORDS[COMP_CWORD]}")
        reply+=("${files[@]}")
    fi

    COMPREPLY=("${reply[@]}")
}
complete -F _omics_completion omics
program:: omics assemble 2 | 3 | ============================================ 4 | assemble - assemble metagenomes with IDBA_UD 5 | ============================================ 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`omics assemble` [OPTIONS]... [SAMPLE_DIR]... 11 | 12 | 13 | Description 14 | =========== 15 | 16 | This script implements the assembly step of the Geomicro Illumina Reads 17 | processing Pipeline. It uses :command:`megahit` or :command:`idba_ud` for the 18 | assembly and parameters for k-mer range and step size can be passed along. The 19 | script will check the quality of the assembly with QUAST, attempt to classify 20 | contigs, and run PhyloSift on the assembly. 21 | 22 | 23 | Options 24 | ======= 25 | 26 | .. option:: --assembly-only 27 | 28 | Stop after finishing the assembly 29 | 30 | .. option:: --contigs FILE 31 | 32 | Skip the assembly step but continue downstream analysis steps with the 33 | provided assembly (can be contigs or scaffold stage) 34 | 35 | .. option:: --cpus N 36 | 37 | Specify how many CPUs to use for parallel execution 38 | 39 | .. option:: --force 40 | 41 | Overwrite existing results, by default will exit with an error message when 42 | the output file exists. 43 | 44 | .. option:: --idba-ud 45 | 46 | Use the IDBA_UD assembler instead of the default MEGAHIT 47 | 48 | .. option:: --mink=N 49 | 50 | parameter for minimum k-mer size 51 | 52 | .. option:: --maxk=N 53 | 54 | parameter for maximum k-mer size 55 | 56 | .. option:: --step=N 57 | 58 | parameter for step size for increasing k-mer values 59 | 60 | .. option:: --level LEVEL 61 | 62 | Specifies the assembly level to use for downstream analysis. Can be either 63 | scaffold (the default) or contig. 64 | 65 | .. option:: --megahit 66 | 67 | Use the MEGAHIT assembler. This is the default. 68 | 69 | .. option:: --out PATH 70 | 71 | The directory to which output is saved. The default value is :file:`ASSEMBLY` 72 | 73 | .. 
option:: --phylosiftrc FILE 74 | 75 | Allows to specify a custom PhyloSift configuration file. 76 | 77 | .. option:: --reads FILE 78 | 79 | Interleaved reads file, by default this is :file:`dt_int.fasta` which is the output of :program:`omics qc` 80 | 81 | .. option:: --skip-blast 82 | 83 | Skip classification steps, by default run blast 84 | 85 | .. option:: --skip-phylosift 86 | 87 | Skip PhyloSift run, by default PhyloSift is run 88 | 89 | .. option:: --skip-quast 90 | 91 | Skip QUAST analysis, by default QUAST is run 92 | 93 | .. option:: --working-dir=DIR 94 | 95 | Directory under which output is stored. By default this is the current 96 | directory. 97 | 98 | .. option:: -h, --help 99 | 100 | Print help. 101 | 102 | .. option:: --no-color 103 | 104 | Disable colorful output. 105 | 106 | .. option:: -v, --verbosity=N 107 | 108 | Use one or multiple ``-v`` to increase verbosity of output or set a 109 | level of verbosity with ``--verbosity=N``. By default the verbosity 110 | level is 1. Setting verbosity to 0 silences the program. A level of 111 | 3 prints debugging info. 112 | 113 | 114 | Exit Status 115 | =========== 116 | 117 | Exits with non-zero upon encountering an error. 118 | 119 | .. only:: man 120 | 121 | See Also 122 | ======== 123 | 124 | :manpage:`omics-prep(1)`, :manpage:`omics-qc(1)`, :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)` 125 | 126 | -------------------------------------------------------------------------------- /docs/binning.txt: -------------------------------------------------------------------------------- 1 | .. program:: omics binning 2 | 3 | ==================================== 4 | binning - bin metagenomic assemblies 5 | ==================================== 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`omics binning` [OPTIONS]... 11 | 12 | 13 | Description 14 | =========== 15 | 16 | The :program:`omics binning` script implements the binning step for the Geomicro 17 | Illumina Reads Pipeline. 
18 | 19 | Options 20 | ======= 21 | 22 | .. option:: -a, --assembly FILE 23 | 24 | Fasta-formatted file containing the assembled contigs, by default this is 25 | :file:`contigs.fa` 26 | 27 | .. option:: -c, --coverage-file FILE 28 | 29 | Merged/shared per-sample-contig mean coverage file. If this option is not 30 | present, then this file will be compiled from the files found with the 31 | --coverage-path argument. 32 | 33 | .. option:: --coverage-path PATH 34 | 35 | Path to coverage files relative to each sample directory; the default is 36 | :file:`MAPPING/assembly.chop.genomeCovBed.tsv`. These files are made by the 37 | mapping script and correspond to the :file:`asm_pair-smds.bam` files made by 38 | CONCOCT's :program:`map-bowtie2-markduplicates.sh`. This option is 39 | incompatible with the -c option. 40 | 41 | .. option:: --force 42 | 43 | Overwrite existing data 44 | 45 | .. option:: -o, --out-dir PATH 46 | 47 | Path to output directory, by default this is :file:`BINNING` 48 | 49 | .. option:: --working-dir=DIR 50 | 51 | Directory under which output is stored. By default this is the current 52 | directory. 53 | 54 | .. option:: -h, --help 55 | 56 | Print help. 57 | 58 | .. option:: --no-color 59 | 60 | Disable colorful output. 61 | 62 | .. option:: -v, --verbosity=N 63 | 64 | Use one or multiple ``-v`` to increase verbosity of output or set a 65 | level of verbosity with ``--verbosity=N``. By default the verbosity 66 | level is 1. Setting verbosity to 0 silences the program. A level of 67 | 3 prints debugging info. 68 | 69 | 70 | Exit Status 71 | =========== 72 | 73 | Exits with non-zero upon encountering an error. 74 | 75 | .. 
only:: man 76 | 77 | See Also 78 | ======== 79 | 80 | :manpage:`omics-prep(1)`, :manpage:`omics-qc(1)`, :manpage:`omics-assemble(1)`, 81 | :manpage:`omics-mapping(1)`, :manpage:`omics(7)`, 82 | :manpage:`illumina-reads-processing(7)` 83 | 84 | -------------------------------------------------------------------------------- /docs/bins2fasta.txt: -------------------------------------------------------------------------------- 1 | .. program:: bins2fasta 2 | 3 | =========================================== 4 | bins2fasta - generate fasta files from bins 5 | =========================================== 6 | 7 | .. argparse:: 8 | :module: omics.bins2fasta 9 | :func: get_argp 10 | :prog: bins2fasta 11 | :nodefault: 12 | :manpage: 13 | 14 | 15 | Exit Status 16 | =========== 17 | 18 | Exits with non-zero upon encountering an error. 19 | 20 | .. only:: man 21 | 22 | See Also 23 | ======== 24 | 25 | :manpage:`omics-binning`, :manpage:`omics(7)`, 26 | :manpage:`illumina-reads-processing(7)` 27 | 28 | -------------------------------------------------------------------------------- /docs/chop-contigs.txt: -------------------------------------------------------------------------------- 1 | .. program:: chop-contigs 2 | 3 | ==================================== 4 | chop-contigs 5 | ==================================== 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`chop-contigs` [OPTIONS]... [-i ] 11 | 12 | 13 | Description 14 | =========== 15 | 16 | Chop up an assembly's contigs to fixed sizes. 17 | 18 | 19 | Options 20 | ======= 21 | .. option:: -h, --help 22 | 23 | show this help message and exit 24 | 25 | .. option:: --chunk-size SIZE 26 | 27 | Size of chunk into which contigs are divided. Default is 10000 28 | 29 | .. option:: -i, --input FILE 30 | 31 | input, fasta-formatted file with contigs, if not given stdin is used. 32 | 33 | .. option:: -o, --output FILE 34 | 35 | Output file 36 | 37 | .. option:: --wrap 38 | 39 | Wrap output sequences to line of length 60. 40 | 41 | .. 
option:: --no-dot-zero 42 | 43 | Do not add a .0 to a fasta header of a short sequence that did not need to 44 | be chopped up. This option makes the output compatible with CONCOCT's 45 | cut_up_fasta.py script. 46 | 47 | .. option:: --no-truncate-headers 48 | 49 | Do not further manipulate fasta headers beyond adding the chop numbers. By 50 | default, the header is truncated at the first whitespace character, 51 | assuming this still uniquely identifies the contig. This default behaviour 52 | is needed for MEGAHIT assemblies and does no harm to IDBA assemblies. 53 | 54 | .. option:: --debug 55 | 56 | Print stack trace on errors. 57 | 58 | 59 | Exit Status 60 | =========== 61 | 62 | Exits with non-zero upon encountering an error. 63 | 64 | .. only:: man 65 | 66 | See Also 67 | ======== 68 | 69 | :manpage:`omics-binning(1)`, 70 | :manpage:`omics-mapping(1)`, :manpage:`omics(7)`, 71 | :manpage:`illumina-reads-processing(7)` 72 | 73 | 74 | -------------------------------------------------------------------------------- /docs/container.txt: -------------------------------------------------------------------------------- 1 | .. program:: omics container 2 | 3 | ================================================ 4 | container -- start a singularity container 5 | ================================================ 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`omics container` [OPTIONS]... 11 | 12 | 13 | Description 14 | =========== 15 | 16 | Start a shell in a singularity container environment (typically on Flux HPC) with all 17 | omics scripts and software dependencies available. 18 | 19 | 20 | Options 21 | ======= 22 | 23 | .. option:: -i, --container-image PATH 24 | 25 | Path to singularity container image. A sensible default is chosen if this 26 | option is not provided. 27 | 28 | .. option:: -k, --keep-modules-loaded 29 | 30 | Do not purge environment modules, by default all modules get purged. 
31 | 32 | 33 | Exit Status 34 | =========== 35 | 36 | Exits with non-zero upon encountering an error. 37 | 38 | .. only:: man 39 | 40 | See Also 41 | ======== 42 | 43 | :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)` 44 | -------------------------------------------------------------------------------- /docs/index.txt: -------------------------------------------------------------------------------- 1 | .. geo-omics-scripts documentation master file, created by 2 | sphinx-quickstart on Thu Mar 16 15:39:32 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to geo-omics-scripts's documentation! 7 | ============================================= 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | :glob: 14 | 15 | * 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | 24 | -------------------------------------------------------------------------------- /docs/init.txt: -------------------------------------------------------------------------------- 1 | .. program:: omics init 2 | 3 | ========================================= 4 | init - initialize omics project directory 5 | ========================================= 6 | 7 | .. argparse:: 8 | :module: omics.init 9 | :func: get_argp 10 | :prog: omics init 11 | :nodefault: 12 | :manpage: 13 | 14 | 15 | Exit Status 16 | =========== 17 | 18 | Exits with non-zero upon encountering an error. 19 | 20 | .. only:: man 21 | 22 | See Also 23 | ======== 24 | 25 | :manpage:`omics-prep(1)`, :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)` 26 | -------------------------------------------------------------------------------- /docs/mapping.txt: -------------------------------------------------------------------------------- 1 | .. 
program:: omics mapping 2 | 3 | ============================================ 4 | mapping - mapping reads to assembly 5 | ============================================ 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`omics mapping` [OPTIONS...] [SAMPLE_DIRS...] 11 | 12 | 13 | Description 14 | =========== 15 | 16 | This script implements the mapping step of the Geomicro Illumina Reads Processing 17 | Pipeline. 18 | 19 | 20 | Options 21 | ======= 22 | 23 | .. option:: -a, --assembly FILE 24 | 25 | Specify assembly i.e. a fasta file of contigs or scaffolds, by default 26 | this is :file:`assembly.fa` 27 | 28 | .. option:: -c, --chop 29 | 30 | Chop up contigs of assembly before mapping. This should be done before 31 | binning with CONCOCT. 32 | 33 | .. option:: --chunk-size N 34 | 35 | Size of chunks for chopping, the default is $DEFAULT_CHUNK_SIZE, 36 | implies ``--chop``. 37 | 38 | .. option:: -f, --fwd-reads FILE 39 | 40 | Fastq file with either forward reads or single-end reads, by default this 41 | is :file:`fwd.fastq` 42 | 43 | .. option:: -r, --rev-reads FILE 44 | 45 | Fastq file with reverse reads, by default this is :file:`rev.fastq` 46 | 47 | .. option:: -i, --int-reads FILE 48 | 49 | Fastq file with paired-end, interleaved reads, can also be single-paired 50 | reads. The default is dt_int.fastq 51 | 52 | .. option:: -o, --out-dir DIR 53 | 54 | Name of output directory, default is :file:`MAPPING` in the sample 55 | directory 56 | 57 | .. option:: --no-indexing 58 | 59 | Do not index the assembly but use an existing index 60 | 61 | .. option:: --cpus N 62 | 63 | Specify how many CPUs to use for parallel execution 64 | 65 | .. option:: --force 66 | 67 | Overwrite any previous results. 68 | 69 | .. option:: --working-dir=DIR 70 | 71 | Directory under which output is stored. By default this is the current 72 | directory. 73 | 74 | .. option:: -h, --help 75 | 76 | Print help. 77 | 78 | .. option:: --no-color 79 | 80 | Disable colorful output. 81 | 82 | .. 
option:: -v, --verbosity=N 83 | 84 | Use one or multiple ``-v`` to increase verbosity of output or set a 85 | level of verbosity with ``--verbosity=N``. By default the verbosity 86 | level is 1. Setting verbosity to 0 silences the program. A level of 87 | 3 prints debugging info. 88 | 89 | 90 | Exit Status 91 | =========== 92 | 93 | Exits with non-zero upon encountering an error. 94 | 95 | .. only:: man 96 | 97 | See Also 98 | ======== 99 | 100 | :manpage:`omics-prep(1)`, :manpage:`omics-qc(1)`, :manpage:`omics(7)`, 101 | :manpage:`illumina-reads-processing(7)`, :manpage:`omics-assemble(1)` 102 | 103 | -------------------------------------------------------------------------------- /docs/merge-coverage.txt: -------------------------------------------------------------------------------- 1 | .. program:: merge-coverage 2 | 3 | ==================================== 4 | merge-coverage 5 | ==================================== 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`merge-coverage` [OPTIONS]... ... 11 | 12 | 13 | Description 14 | =========== 15 | 16 | Calculates the per-sample per-contig mean coverage from the per-sample coverage 17 | tables made with :program:`omics mapping` into one file suitable as input file for 18 | :program:`concoct`. 19 | 20 | 21 | Options 22 | ======= 23 | .. option:: -h, --help 24 | 25 | show help message and exit 26 | 27 | .. option:: -a, --assembly FILE 28 | 29 | The assembly file. This is to be compatible with the CONCOCT workflow. If 30 | used then contigs not covered by any sample will appear in the output with 31 | zeros (unlike the output of bedtool's :program:`genomeCoverageBed`.) 32 | 33 | .. option:: -o, --out FILE 34 | 35 | Output file. By default stdout is used. 36 | 37 | .. option:: --debug 38 | 39 | Print stack trace on errors. 40 | 41 | .. option:: --length 42 | 43 | Insert column with contig length. The default is not to insert lengths. 44 | 45 | .. option:: -v, --verbose 46 | 47 | Report progress to stderr. 
48 | 49 | 50 | Exit Status 51 | =========== 52 | 53 | Exits with non-zero upon encountering an error. 54 | 55 | .. only:: man 56 | 57 | See Also 58 | ======== 59 | 60 | :manpage:`omics-binning(1)`, :manpage:`omics-mapping(1)`, 61 | :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)` 62 | 63 | 64 | -------------------------------------------------------------------------------- /docs/prep.txt: -------------------------------------------------------------------------------- 1 | .. program:: omics prep 2 | 3 | ============================================================ 4 | prep - prepare compressed fastq files for further processing 5 | ============================================================ 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`omics prep` [OPTIONS]... [READS]... 11 | 12 | 13 | Description 14 | =========== 15 | 16 | Several tools of the Geomicro Illumina Reads Processing Pipeline assume 17 | that raw or intermediate data is available in files following certain naming 18 | conventions and formats and directory layout. To get started the script 19 | :program:`omics prep` will help following these conventions. 20 | 21 | 22 | Options 23 | ======= 24 | 25 | .. option:: -f, --force 26 | 27 | Allow overwriting existing files. 28 | 29 | .. option:: --keep-lanes-separate 30 | 31 | Keep data from different lanes separate. The default is to collect reads 32 | originating from the same physical sample if sequencing was done using 33 | several lanes. 34 | 35 | .. option:: --suffix LIST 36 | 37 | Comma-separated list of valid file suffixes used for raw reads. This is 38 | used to find files when a directory is given as positional argument. By 39 | default .fastq and .fastq.gz files are considered. 40 | 41 | .. option:: -t N, --threads N, --cpus N 42 | 43 | Number of threads / CPUs to employ 44 | 45 | .. option:: -h, --help 46 | 47 | Print help. 48 | 49 | .. option:: -v, --verbose 50 | 51 | Use one or multiple ``-v`` to increase verbosity of output. 
52 | 53 | Exit Status 54 | =========== 55 | 56 | Exits with non-zero upon encountering an error. 57 | 58 | .. only:: man 59 | 60 | See Also 61 | ======== 62 | 63 | :manpage:`omics-qc(1)`, :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)` 64 | -------------------------------------------------------------------------------- /docs/qc-check.txt: -------------------------------------------------------------------------------- 1 | .. program:: omics qc-check 2 | 3 | =================================================== 4 | qc-check - Quickly check results of quality control 5 | =================================================== 6 | 7 | .. argparse:: 8 | :module: omics.qc_check 9 | :func: get_argp 10 | :prog: omics qc-check 11 | :nodefault: 12 | :manpage: 13 | 14 | 15 | Exit Status 16 | =========== 17 | 18 | Exits with non-zero upon encountering an error. 19 | 20 | .. only:: man 21 | 22 | See Also 23 | ======== 24 | 25 | :manpage:`omics-qc`, :manpage:`omics(7)`, 26 | :manpage:`illumina-reads-processing(7)` 27 | 28 | -------------------------------------------------------------------------------- /docs/qc-sample.txt: -------------------------------------------------------------------------------- 1 | .. program:: omics qc-sample 2 | 3 | ========================================================== 4 | qc-sample - quality control for metagenomic Illumina reads 5 | ========================================================== 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`omics qc-sample` [OPTIONS]... 11 | 12 | 13 | Description 14 | =========== 15 | 16 | The :program:`omics qc-sample` script takes a pair of fastq-formatted files 17 | (forward and reverse reads, data from a single sample) and runs them past 18 | quality assessment (FastQC), dereplication, adapter removal (Scythe), and 19 | quality-trimming (Sickle) steps and then prepares a FASTA-formatted interleaved 20 | reads file that can be used as input for the IDBA assembler. 
A second run of 21 | FastQC allows a before-after comparison to see if these steps led to an 22 | improvement in the quality of the data. 23 | 24 | 25 | positional arguments 26 | ==================== 27 | 28 | .. option:: samples 29 | 30 | List of directories, one per sample that contain the sample's reads. The 31 | default is to take the current directory and process a single sample. The 32 | names of the reads files must be fwd.fastq and rev.fastq, currently this 33 | can not be set manually. Use the omics-qc-sample script directly to 34 | specify filenames, omics-qc is just a wrapper after all. 35 | 36 | 37 | Options 38 | ======= 39 | 40 | .. option:: -f, --fwd=FILE 41 | 42 | fastq-formatted file with forward reads, by default this is ``fwd.fastq`` 43 | as saved by the :program:`prep` script 44 | 45 | .. option:: -r, --rev=FILE 46 | 47 | fastq-formatted file with reverse reads, by default this is ``rev.fastq`` 48 | as saved by the :program:`prep` script 49 | 50 | .. option:: --clean-only 51 | 52 | Remove all files made by a previous run of :program:`qc` and exit. 53 | 54 | .. option:: -a, --adapters=FILE 55 | 56 | Specify the adapters file used in the adapter trimming step. By default 57 | the Illumina adapter file TruSeq3-PE-2.fa as distributed by the Trimmomatic 58 | project will be used. 59 | 60 | .. option:: --keep-all 61 | 62 | Keep all intermediate files, by default some not-so-important intermediate 63 | results will be deleted to save disk space 64 | 65 | .. option:: --less-mem 66 | 67 | This option will reduce the dominating memory requirements for the 68 | de-replication step by half, typically, and double the computation time. 69 | 70 | .. option:: --no-dereplicate 71 | 72 | Option to skip the de-replication step 73 | 74 | .. option:: --no-fasta-interleave 75 | 76 | Skip building the interleaved fasta file, interleaved fastq files will 77 | still be built. 78 | 79 | .. 
option:: -S, --scythe-sickle 80 | 81 | Use scythe + sickle instead of (the default) Trimmomatic 82 | 83 | .. option:: --working-dir=DIR 84 | 85 | Directory under which output is stored. By default this is the current 86 | directory. 87 | 88 | .. option:: -h, --help 89 | 90 | Print help. 91 | 92 | .. option:: --no-color 93 | 94 | Disable colorful output. 95 | 96 | .. option:: -v, --verbosity=N 97 | 98 | Use one or multiple ``-v`` to increase verbosity of output or set a 99 | level of verbosity with ``--verbosity=N``. By default the verbosity 100 | level is 1. Setting verbosity to 0 silences the program. A level of 101 | 3 prints debugging info. 102 | 103 | 104 | Exit Status 105 | =========== 106 | 107 | Exits with non-zero upon encountering an error. 108 | 109 | .. only:: man 110 | 111 | See Also 112 | ======== 113 | 114 | :manpage:`omics-prep(1)`, :manpage:`omics-qc`, :manpage:`omics(7)`, 115 | :manpage:`illumina-reads-processing(7)` 116 | 117 | -------------------------------------------------------------------------------- /docs/qc.txt: -------------------------------------------------------------------------------- 1 | .. program:: omics qc 2 | 3 | =================================================== 4 | qc - quality control for metagenomic Illumina reads 5 | =================================================== 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`omics qc` [OPTIONS]... [SAMPLES]... 11 | 12 | 13 | Description 14 | =========== 15 | 16 | The :program:`omics qc` script takes raw reads from multiple samples and runs 17 | them past quality assessment (FastQC), dereplication, adapter removal and 18 | quality-trimming with Trimmomatic (or alternatively with Scythe and Sickle) 19 | steps and then prepares a FASTA-formatted interleaved reads file that can be 20 | used as input for the assembler. A second run of FastQC allows a before-after 21 | comparison to see if these steps led to an improvement in the quality of the 22 | data. 
23 | 24 | 25 | positional arguments 26 | ==================== 27 | 28 | .. option:: samples 29 | 30 | List of directories, one per sample that contain the sample's reads. The 31 | default is to take the current directory and process a single sample. The 32 | names of the reads files must be fwd.fastq and rev.fastq, currently this 33 | can not be set manually. Use the omics-qc-sample script directly to 34 | specify filenames, omics-qc is just a wrapper after all. 35 | 36 | 37 | Options 38 | ======= 39 | 40 | .. option:: --clean-only 41 | 42 | Remove all files made by a previous run of :program:`qc` and exit. 43 | 44 | .. option:: -a, --adapters=FILE 45 | 46 | Specify the adapters file used in the adapter trimming step. By default 47 | the Illumina adapter file TruSeq3-PE-2.fa as distributed by the Trimmomatic 48 | project will be used. 49 | 50 | .. option:: --keep-all 51 | 52 | Keep all intermediate files, by default some not-so-important intermediate 53 | results will be deleted to save disk space 54 | 55 | .. option:: --no-dereplicate 56 | 57 | Option to skip the de-replication step 58 | 59 | .. option:: --no-fasta-interleave 60 | 61 | Skip building the interleaved fasta file, interleaved fastq files will 62 | still be built. 63 | 64 | .. option:: -S, --scythe-sickle 65 | 66 | Use scythe + sickle instead of (the default) Trimmomatic 67 | 68 | .. option:: --cpus N, --threads N, -t N 69 | Number of threads / CPUs to employ 70 | 71 | .. option:: -h, --help 72 | 73 | Print help. 74 | 75 | .. option:: -v, --verbose 76 | 77 | Use one or multiple ``-v`` to increase verbosity of output. 78 | 79 | Exit Status 80 | =========== 81 | 82 | Exits with non-zero upon encountering an error. 83 | 84 | .. 
only:: man 85 | 86 | See Also 87 | ======== 88 | 89 | :manpage:`omics-prep(1)`, :manpage:`omics-qc-sample(1)`, 90 | :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)` 91 | 92 | -------------------------------------------------------------------------------- /docs/run.txt: -------------------------------------------------------------------------------- 1 | .. program:: omics run 2 | 3 | ======================================== 4 | run - run command inside omics container 5 | ======================================== 6 | 7 | 8 | Synopsis 9 | ======== 10 | 11 | :program:`omics run` [OPTIONS...] [``--``] COMMAND... 12 | 13 | 14 | Description 15 | =========== 16 | 17 | This is a wrapper around :command:`singularity run`. It runs the given command 18 | inside the omics container environment. if :program:`omics run` is called from 19 | a shell then command may need to be protected from the shell with single or 20 | double quotes as needed. The container will provide a clean environment, so if 21 | environment variables are needed inside, e.g. ``SOMEVAR=foobar``, then set instead 22 | ``SINGULARITYENV_SOMEVAR=foobar``, and the variable will be set with the 23 | ``SINGULARITYENV_`` prefix stripped. 24 | 25 | Options 26 | ======= 27 | 28 | Any options given to this script must be separated from the COMMAND 29 | by a double dash, otherwise the COMMAND will be interpreted as 30 | options with unintended consequences likely. 31 | 32 | .. option:: -i PATH, --container-image PATH 33 | 34 | Full path to singularity container image. A default is provided if this 35 | option is missing. 36 | 37 | .. option:: -s option, --singularity option 38 | 39 | Options passed on to :command:`singularity`. For instance, to additionally mount a 40 | path ``/some/path``, add ``--singularity "-B /some/path"`` and the option 41 | ``-B /some/path`` will be appended to the call to :command:`singularity run`. 
The 42 | empty space between the option and its parameter must be protected by 43 | quotes to prevent premature word splitting by the shell. Run 44 | :command:`singularity run --help` to see what options are supported. 45 | 46 | .. option:: --working-dir PATH 47 | 48 | Set the working directory for the command 49 | 50 | .. option:: -h, --help 51 | 52 | Print help. 53 | 54 | .. option:: --no-color 55 | 56 | Disable colorful terminal output 57 | 58 | .. option:: -v, --verbosity N 59 | 60 | Set verbosity level or use one or multiple :option:`-v` to increase verbosity of output. 61 | 62 | 63 | Exit Status 64 | =========== 65 | 66 | Exits with non-zero upon encountering an error. 67 | 68 | .. only:: man 69 | 70 | See Also 71 | ======== 72 | 73 | :manpage:`omics-prep(1)`, :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)`, :manpage:`singularity(1)` 74 | -------------------------------------------------------------------------------- /docs/separate-interleaved.txt: -------------------------------------------------------------------------------- 1 | .. program:: separate-interleaved 2 | 3 | ==================== 4 | separate-interleaved 5 | ==================== 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`separate-interleaved` [:option:`-v`] [:option:`-f` FILE] [:option:`-r` FILE] 11 | 12 | 13 | Description 14 | =========== 15 | 16 | Separate interleaved reads fastq file into forwards and reverse files. 17 | 18 | Separate interleaved-reads fasta/q file into forwards and reverse files. Input 19 | file must be in FASTQ or FASTA format. Sequence and quality score must be on a 20 | single line each, separated by a '+', read headers must start with '@' or '>'. 21 | The script will auto-detect the file format based on the first header. It is 22 | not checked if two reads are actually paired-end reads, however an error will 23 | be raised if the input file contains an uneven number of sequences. 24 | 25 | 26 | 27 | Options 28 | ======= 29 | .. 
option:: -h, --help 30 | 31 | show this help message and exit 32 | 33 | .. option:: -f FILE, --fwd FILE, --forward-out FILE 34 | 35 | Name of forward output file. A value is derived from 36 | the inputfilename by default. 37 | 38 | .. option:: -r FILE, --rev FILE, --reverse-out FILE 39 | 40 | Name of reverse output file. A value is derived from 41 | the inputfilename by default. 42 | 43 | .. option:: -v, --verbose 44 | 45 | Print more informative output 46 | 47 | 48 | Exit Status 49 | =========== 50 | 51 | Exits with non-zero upon encountering an error. 52 | 53 | .. only:: man 54 | 55 | See Also 56 | ======== 57 | 58 | :manpage:`omics(7)`, 59 | :manpage:`illumina-reads-processing(7)` 60 | 61 | 62 | -------------------------------------------------------------------------------- /docs/template.txt: -------------------------------------------------------------------------------- 1 | .. program:: $program 2 | 3 | $header_line 4 | $program - $short_description 5 | $header_line 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`$program` $usage_args 11 | 12 | 13 | Description 14 | =========== 15 | 16 | $long_description 17 | 18 | 19 | Options 20 | ======= 21 | 22 | $positional_args 23 | 24 | $optional_args 25 | 26 | 27 | Exit Status 28 | =========== 29 | 30 | Exits with non-zero upon encountering an error. 31 | 32 | .. only:: man 33 | 34 | See Also 35 | ======== 36 | 37 | :manpage:`omics(7)`, 38 | :manpage:`illumina-reads-processing(7)` 39 | -------------------------------------------------------------------------------- /docs/unchop-contigs.txt: -------------------------------------------------------------------------------- 1 | .. 
program:: unchop-contigs 2 | 3 | ================================================================= 4 | unchop-contigs - Stitch together chopped up contigs after binning 5 | ================================================================= 6 | 7 | Synopsis 8 | ======== 9 | 10 | :program:`unchop-contigs` [-h] [-i [BACKUP_SUFFIX] | -o OUT_DIR] [-v] [input [input ...]] 11 | 12 | 13 | Description 14 | =========== 15 | 16 | The CONCOCT binner recommends to chop long contigs into even length chunks to 17 | reduce bias related to varying contig sizes. This script glues them back 18 | together for downstream analysis of bins. 19 | 20 | It is assumed that there is one fasta file per bin and that the fasta headers 21 | consist of the original contig id followed by a dot and a decimal chunk number. 22 | For example if a bin has three contig chunks named:: 23 | 24 | k141_531759.0 25 | k141_531759.1 26 | k141_531759.2 27 | 28 | they will be replaced by a single contig called:: 29 | 30 | k141_531759.0-2 31 | 32 | Contigs that do not have chunk information will be left alone. However contigs 33 | will be sorted by contig id and numerical chunk number. A consequence is that 34 | applying unchop-contigs a second time may change the order of some contigs. 35 | 36 | 37 | Options 38 | ======= 39 | 40 | .. option:: input 41 | 42 | List of directories or fasta files. The default is to take the 43 | current directory. 44 | 45 | .. option:: -h, --help 46 | 47 | show this help message and exit 48 | 49 | .. option:: -i [BACKUP_SUFFIX], --in-place [BACKUP_SUFFIX] 50 | 51 | Replace input file. If provided, backup of each file is made using 52 | the provided suffix. 53 | 54 | .. option:: -o OUT_DIR, --out-dir OUT_DIR 55 | 56 | Output directory. The default is the current directory. 57 | 58 | .. option:: -v, --verbose 59 | 60 | Print diagnostic output. 61 | 62 | 63 | Exit Status 64 | =========== 65 | 66 | Exits with non-zero upon encountering an error. 67 | 68 | .. 
only:: man 69 | 70 | See Also 71 | ======== 72 | 73 | :manpage:`omics(7)`, 74 | :manpage:`illumina-reads-processing(7)` 75 | 76 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | 3 | # get all python files 4 | py_files = $(shell find -name "*.py") 5 | 6 | lib_files = liba.sh 7 | 8 | EXTRA_DIST = Makefile 9 | 10 | install: installdir = $(DESTDIR)$(datadir)/$(package_name) 11 | install: install-py 12 | $(info Installing lib files ...) 13 | mkdir -p -- "$(installdir)" 14 | $(INSTALL_DATA) -t $(installdir) $(lib_files) 15 | 16 | install-py: installdir = $(DESTDIR)$(prefix)/lib/python3.5/site-packages 17 | install-py: 18 | $(info Installing python packages ...) 19 | for i in $(py_files); do \ 20 | $(INSTALL_DATA) -D $$i $(installdir)/$$i; \ 21 | done 22 | 23 | distdir: 24 | $(info Copying lib files ...) 25 | mkdir -p -- "../$(dist_dir)/lib" 26 | cp -a $(lib_files) $(EXTRA_DIST) ../$(dist_dir)/lib/ 27 | # copy each python package individually 28 | for i in $(shell find -name __init__.py -printf "%h "); do \ 29 | mkdir -p ../$(dist_dir)/lib/$$i && \ 30 | cp -p $$i/*.py ../$(dist_dir)/lib/$$i/ ; \ 31 | done 32 | -------------------------------------------------------------------------------- /lib/omics/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Regents of The University of Michigan. 2 | 3 | # This file is part of geo-omics-scripts. 4 | 5 | # Geo-omics-scripts is free software: you can redistribute it and/or 6 | # modify it under the terms of the GNU General Public License as published 7 | # by the Free Software Foundation, either version 3 of the License, or (at 8 | # your option) any later version. 
def main():
    """
    Entry point of the ``omics`` command dispatcher

    Parses the command line and, if a command was given, first hands it to
    launch_cmd_as_sub_module().  If that call returns, the command is looked
    up via process_command_line() and the resulting command line replaces the
    current process via os.execv().  With --dry-run the command line is only
    printed.  Without a command the list of available commands (or the
    general help) is printed.

    Errors are reported through the argument parser's error() method unless
    the traceback option is set, in which case the original exception
    propagates.
    """
    argp = get_main_arg_parser(description=__doc__)
    args = argp.parse_args()
    args.script_dir = Path(args.script_dir)
    if not args.script_dir.is_dir():
        argp.error('Not a directory: {}'.format(args.script_dir))

    if not args.command:
        subcmds = get_available_commands()
        if subcmds:
            print('Available commands:')
            for i in subcmds:
                print(' ', i)
            print('Type `omics -h` or `omics -h` to get help.')
        else:
            argp.print_help()
        return

    # NOTE(review): presumably this only returns if the command could not be
    # run as a python sub-module; the return value is used below as the
    # import error to report -- confirm against launch_cmd_as_sub_module()
    import_err = launch_cmd_as_sub_module(args, argp)

    # try calling as shellscript
    cmd = args.command[0]
    cmd_opts = args.command[1:]
    cmdline = process_command_line(
        cmd,
        cmd_opts,
        script_dir=args.script_dir
    )
    if args.dry_run:
        print(*cmdline)
        return

    # execv() replaces the process image without flushing python's buffered
    # streams, so anything printed so far must be flushed explicitly
    import sys
    sys.stdout.flush()
    sys.stderr.flush()

    try:
        # On success this call never returns, the exit status is then
        # that of the executed command.
        os.execv(cmdline[0], cmdline)
    except FileNotFoundError as e:
        if args.traceback:
            raise
        msg1 = '\n {}: {}'.format(import_err.__class__.__name__,
                                  import_err)
        msg2 = ' {}: {}'.format(e.__class__.__name__, e)
        msg3 = ' ==> Not a valid omics command: {}'.format(cmd)
        argp.error('\n'.join([msg1, msg2, msg3]))
    except Exception as e2:
        if args.traceback:
            raise
        argp.error('Command "{}" failed: {}: {}'
                   ''.format(cmdline, e2.__class__.__name__, e2))


if __name__ == '__main__':
    main()
# Set to real version when distributing outside of git vcs
VERSION = None


def get_version(version=VERSION, raise_on_error=False):
    """
    Get the version string

    Return the hard-coded version if there is one, otherwise fall back to
    asking git.  If git can not be queried then either raise an exception or
    return 'unknown', depending on the raise_on_error flag.

    :param version: Version string to return as-is, None to query git.
    :param bool raise_on_error: If True, any failure to run ``git describe``
                                (including a non-zero exit status) raises
                                RuntimeError.  If False, 'unknown' is
                                returned in those cases.
    :return: The version string or 'unknown'.
    :raises RuntimeError: If git failed and raise_on_error is True.
    """
    if version is not None:
        return version

    try:
        p = subprocess.run(
            ['git', 'describe'],
            # dirname() of a relative __file__ can be the empty string, which
            # is not a valid working directory, hence the abspath()
            cwd=os.path.dirname(os.path.abspath(__file__)),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=raise_on_error,
        )
    except Exception as e:
        if not raise_on_error:
            return 'unknown'
        # Only CalledProcessError carries the captured output, other errors
        # (e.g. git is not installed) do not have these attributes.
        out = (getattr(e, 'stdout', None) or b'').decode(errors='replace')
        err = (getattr(e, 'stderr', None) or b'').decode(errors='replace')
        raise RuntimeError(
            'Failed to get version info from git: {}: {}\n{}{}'
            ''.format(e.__class__.__name__, e, out, err)) from e

    version = p.stdout.decode().strip()
    # version should be like 1.0.134-42-gd3adb33f
    # make this a PEP440 local version like 1.0.134+42-gd3adb33f
    version = version.replace('-', '+', 1)
    return version or 'unknown'
ACTIONS = ['convert']
DEFAULT_ACTION = ACTIONS[0]


def _to_number(value):
    """ Convert a str to int or float if possible, else return it as-is """
    for conv in (int, float):
        try:
            return conv(value)
        except (TypeError, ValueError):
            pass
    return value


def load_tsv(file):
    """
    Load data from CheckM-style tsv output file

    Each non-blank line is expected to consist of a bin identifier, a tab,
    and a python-dict-like listing ``{'key': value, 'key2': value2, ...}``.
    Values are converted to int or float where possible and are otherwise
    kept as the raw text, including any quotes.  Values must not contain the
    item separator ", ".

    :param file: Filehandle
    :return: List of OrderedDicts, one per row, first key is 'bin'
    :raises ValueError: If a line has no tab-separated second column or an
                        item is not of the form "key: value".
    :raises RuntimeError: If rows do not all have the same set of keys.
    """
    cols = None

    data = []
    for line in file:
        if not line.strip():
            # tolerate blank lines, e.g. at the end of the file
            continue

        fields = line.split('\t')
        if len(fields) < 2:
            raise ValueError('expected two tab-separated columns: {}'
                             ''.format(line))

        listing = fields[1].strip().lstrip('{').rstrip('}')

        dd = OrderedDict()
        dd['bin'] = fields[0]

        for item in (listing.split(', ') if listing else []):
            # split at first separator only, the value may contain ": "
            key, sep, value = item.partition(': ')
            if not sep:
                raise ValueError('failed to parse item "{}" in line: {}'
                                 ''.format(item, line))
            dd[key.strip("'")] = value

        if cols is None:
            cols = dd.keys()
        else:
            if cols != dd.keys():
                raise RuntimeError('data keys, inconsistency: {} vs. {}'
                                   ''.format(cols, dd.keys()))

        for k in dd:
            dd[k] = _to_number(dd[k])

        data.append(dd)
    return data


def main(argv=None, namespace=None):
    """
    Command line interface: convert CheckM output into a proper table

    :param argv: List of command line arguments, passed on to the argument
                 parser, None means to use sys.argv
    :param namespace: Namespace object passed on to the argument parser
    """
    prog = __loader__.name.replace('.', ' ')
    argp = OmicsArgParser(prog=prog, description=__doc__, threads=False)
    argp.add_argument(
        '-c', '--convert',
        action='store_true',
        help='Convert input to real tab-separated table',
    )
    argp.add_argument(
        'inputfile',
        metavar='FILE',
        # optional, otherwise the stdin default could never take effect
        nargs='?',
        type=argparse.FileType(),
        default=sys.stdin,
        help='Input file, usually a .tsv file written by CheckM',
    )
    args = argp.parse_args(args=argv, namespace=namespace)

    if args.convert:
        data = load_tsv(args.inputfile)
        if data:
            header = list(data[0].keys())
            print(*header, sep='\t')
            for row in data:
                # go by header to keep columns aligned even if a row has
                # its keys in a different order
                print(*(row[k] for k in header), sep='\t')
    else:
        argp.error('no action specified, e.g. --convert')


if __name__ == '__main__':
    main()
17 | 18 | from django.apps import AppConfig 19 | 20 | class OmicsDBConfig(AppConfig): 21 | name = 'omics.db' 22 | label = 'omics_db' 23 | verbose_name = 'geo-omics-scripts data base' 24 | -------------------------------------------------------------------------------- /lib/omics/db/management/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Geo-omics/scripts/5f20a3418096a11604880b9c38565c9f4c9546eb/lib/omics/db/management/__init__.py -------------------------------------------------------------------------------- /lib/omics/db/management/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Geo-omics/scripts/5f20a3418096a11604880b9c38565c9f4c9546eb/lib/omics/db/management/commands/__init__.py -------------------------------------------------------------------------------- /lib/omics/db/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | -------------------------------------------------------------------------------- /lib/omics/fastq2fasta.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Regents of The University of Michigan. 2 | 3 | # This file is part of geo-omics-scripts. 4 | 5 | # Geo-omics-scripts is free software: you can redistribute it and/or 6 | # modify it under the terms of the GNU General Public License as published 7 | # by the Free Software Foundation, either version 3 of the License, or (at 8 | # your option) any later version. 9 | 10 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 11 | # WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # General Public License for more details. 
def convert(data, output, check=True):
    """
    Convert data from FASTQ into FASTA format

    The input must have exactly four lines per record, i.e. sequence and
    quality scores must not be wrapped over multiple lines.

    :param data: File-like object with input data
    :param output: File-like object for output
    :param bool check: If True, verify that each header line starts with @
                       and each separator line starts with +.  The separator
                       may be followed by a repetition of the read id as
                       allowed by the FASTQ format.
    :raises RuntimeError: If a check fails or if the number of input lines
                          is not a multiple of four.
    """
    state = 0
    for line in data:
        if state == 0:
            if check and not line.startswith('@'):
                raise RuntimeError('Input not in FASTQ format? Expected line '
                                   'to start with @: {}'.format(line))
            output.write('>' + line[1:])
        elif state == 1:
            output.write(line)
        elif state == 2:
            if check and not line.startswith('+'):
                raise RuntimeError('Input not in FASTQ format? Expected + '
                                   'separator line: {}'.format(line))
        # state 3 is the quality score line, nothing to do

        state = (state + 1) % 4

    if state != 0:
        raise RuntimeError('Input not in FASTQ format? Expected total number '
                           'of lines to be multiple of 4, last line: {}'
                           ''.format(line))


def main(argv=None, namespace=None):
    """
    Command line interface for the fastq to fasta conversion

    :param argv: List of command line arguments, passed on to the argument
                 parser, None means to use sys.argv
    :param namespace: Namespace object passed on to the argument parser
    """
    argp = OmicsArgParser(
        prog=__loader__.name.replace('.', ' '),
        description=__doc__,
        project_home=False,
        threads=False,
    )
    argp.add_argument(
        'inputfile',
        metavar='FILE',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
        help='Fastq file to be converted, by default data is read from stdin.'
    )
    argp.add_argument(
        '-o', '--output',
        metavar='FILE',
        nargs='?',
        type=argparse.FileType('w'),
        # a bare -o without file name would otherwise result in None
        const=sys.stdout,
        default=sys.stdout,
        help='Name of output filie. Write to stdout by default.'
    )
    # NOTE(review): --force is accepted but not acted upon within this
    # module, output files are always opened for writing
    argp.add_argument(
        '--force', '-f',
        action='store_true',
        help='Overwrite existing files',
    )
    argp.add_argument(
        '--no-check',
        action='store_false',
        dest='check',
        help='Skip sanity check on input data. By default it is checked that '
             'the input is indeed in fastq format.',
    )
    args = argp.parse_args(args=argv, namespace=namespace)
    convert(args.inputfile, args.output, check=args.check)


if __name__ == '__main__':
    main()
def load_read_coordinates(reads_file, file_format='fastq'):
    """
    Load read coordinates into dict of lists of points

    The colon-separated fields 5 to 7 of each sequence header are taken to
    be the key and the two integer coordinates of a read, anything after
    the first whitespace in the 7th field is ignored.

    For fastq input the line following a + separator line is taken to be
    the quality score line and is never interpreted as header, even if it
    starts with @.  This assumes that quality scores are not wrapped over
    multiple lines.

    :param reads_file: File object or str or Path of input file.  A file
                       object passed in is left open, files opened by this
                       function get closed.
    :param str file_format: Either 'fasta' or 'fastq'
    :return: defaultdict mapping the int value of header field 5 to a list
             of (int, int) tuples made from header fields 6 and 7
    :raises ValueError: If file_format is not supported.
    :raises RuntimeError: If a sequence header can not be parsed.
    """
    if file_format == 'fastq':
        mark = '@'
    elif file_format == 'fasta':
        mark = '>'
    else:
        raise ValueError('Illegal file format specified: {}'
                         ''.format(file_format))
    is_fastq = file_format == 'fastq'

    # handle input parameter type
    keep_open = False
    if isinstance(reads_file, str):
        reads_file = open(reads_file)
    elif isinstance(reads_file, Path):
        reads_file = reads_file.open()
    else:
        keep_open = True

    try:
        data = defaultdict(list)
        skip_next = False
        for line in reads_file:
            if is_fastq:
                if skip_next:
                    # quality scores, may start with any of @ + >
                    skip_next = False
                    continue
                if line.startswith('+'):
                    skip_next = True
                    continue

            if not line.startswith(mark):
                continue

            try:
                key, x, y = line.split(':')[4:7]
                y = y.split()[0]
            except (ValueError, IndexError) as e:
                raise RuntimeError('Failed parsing sequence header '
                                   '(split by :): {}'.format(line)) from e

            try:
                key, point = int(key), (int(x), int(y))
            except ValueError as e:
                raise RuntimeError('Failed parsing sequence header '
                                   '(int conversion): {}'.format(line)) from e
            data[key].append(point)
        return data
    finally:
        if not keep_open:
            reads_file.close()


def scatter(points):
    """
    Display scatterplot

    :param points: Sequence of pairs, the first items are plotted along the
                   x-axis and the second items along the y-axis
    """
    fig, ax = subplots()
    ax.scatter(
        [i[0] for i in points],
        [i[1] for i in points],
        marker='.',
    )
    fig.show()


def hist(data):
    """
    Display histogram

    :param data: Input values, passed as-is to matplotlib's Axes.hist()
    """
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(
        data,
        bins='auto',
    )
    plt.show()
    plt.close()
# To use this repo as a local installation run
#
#   $ source localenv
#
# from a bash prompt
#
# This puts the repository's scripts on PATH and its python package on
# PYTHONPATH, links shared data files into share/geo-omics-scripts below
# the repository and loads the bash completion for the omics command.
#
# NOTE: since this file gets sourced, the variable "base" stays set in the
# calling shell.

base=$(readlink -f "$(dirname "${BASH_SOURCE[0]}")")
export PATH="$base/scripts:$PATH"
# Append the previous value only if there is one: a trailing colon makes an
# empty entry which python treats as the current working directory.
export PYTHONPATH="$base/lib${PYTHONPATH:+:$PYTHONPATH}"
mkdir -p -- "$base/share/geo-omics-scripts"
# link targets are relative to the share/geo-omics-scripts directory
ln -f -s -t "$base/share/geo-omics-scripts/" -- \
  ../../lib/liba.sh \
  ../../TruSeq3-PE-2+omics.fa \
  ../../phylosiftrc
source "$base/bash-completion/omics"
9 | } 10 | 11 | module-whatis "All-in-one omics module" 12 | 13 | if { [module-info mode load] } { 14 | # prerequsites for quast module 15 | module load boost 16 | # gsl libary needed by concoct 17 | module load gsl 18 | # prerequsites for quast, concoct module 19 | module load python-anaconda2/latest 20 | # prerequisite for bedtools2 module 21 | module load samtools 22 | # Load standard software packages 23 | # (required for geo-omics scripts) 24 | module load bedtools2 25 | module load bwa 26 | module load fastqc 27 | module load geomicro/idba 28 | module load geomicro/scythe 29 | module load ncbi-blast 30 | module load phylosift 31 | module load python-anaconda3 32 | module load quast 33 | module load sickle 34 | module load megahit 35 | } 36 | 37 | set OMICS_ROOT /dept/geology/geomicro/data9/flux/apps/omics_root 38 | 39 | append-path PATH $OMICS_ROOT/bin 40 | append-path MANPATH $OMICS_ROOT/share/man 41 | append-path PYTHONPATH $OMICS_ROOT/lib/python3.5/site-packages 42 | 43 | setenv PYTHONUSERBASE $OMICS_ROOT 44 | -------------------------------------------------------------------------------- /modulefiles/flux.omics/2: -------------------------------------------------------------------------------- 1 | #%Module1.0 2 | # 3 | # to be installed as /dept/geology/geomicro/data9/flux/modulefiles/geomicro/omics/2 4 | # 5 | # this module's maintainer's email: heinro@umich.edu 6 | # 7 | proc ModulesHelp { } { 8 | puts stderr "Module to enable the 'comics' command to enter the 'omics container" 9 | } 10 | 11 | module-whatis "Allows access to the 'omics container" 12 | 13 | set COMICS_ROOT /dept/geology/geomicro/data9/flux/apps/comics 14 | 15 | prepend-path PATH $COMICS_ROOT/bin 16 | prepend-path MANPATH $COMICS_ROOT/share/man 17 | -------------------------------------------------------------------------------- /modulefiles/install: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # --- CAUTION --- 4 | # 5 | # This 
script will install module files to production ! 6 | # 7 | # --- CAUTION --- 8 | 9 | set -e 10 | 11 | # this will fail, no root rights after all 12 | # contact Mike Messina 13 | #scp -p omics/1 cayman:/usr/share/Modules/modulefiles/omics/1 14 | #scp -p omics/1 vondamm:/usr/share/Modules/modulefiles/omics/1 15 | 16 | # should work as heinro user 17 | scp -p flux.omics/1 guaymas.earth.lsa.umich.edu:/gmb/data9/flux/modulefiles/geomicro/omics/1 18 | -------------------------------------------------------------------------------- /modulefiles/omics/.version: -------------------------------------------------------------------------------- 1 | #%Module1.0 2 | 3 | set ModulesVersion "1" 4 | -------------------------------------------------------------------------------- /modulefiles/omics/1: -------------------------------------------------------------------------------- 1 | #%Module1.0 2 | # 3 | # to be installed as /usr/share/Modules/modulefiles/omics/1 on vondamm, cayman 4 | # 5 | # this module's maintainer's email: heinro@umich.edu 6 | # 7 | 8 | proc ModulesHelp { } { 9 | puts stderr "Load this module to use the (geo-)omics scripts." 
10 | } 11 | 12 | module-whatis "All-in-one omics module" 13 | 14 | if { [module-info mode load] } { 15 | # Load standard software packages 16 | module load AnacondaPython3 17 | module load AnacondaPython 18 | module load Scythe 19 | module load blast 20 | module load PhyloSift 21 | module load idba 22 | module load QUAST 23 | module load bwa 24 | module load samtools 25 | module load bedtools 26 | module load megahit 27 | } 28 | 29 | set OMICS_ROOT /geomicro/data9/flux/apps/omics_root 30 | 31 | append-path PATH $OMICS_ROOT/bin 32 | append-path MANPATH $OMICS_ROOT/share/man 33 | append-path PYTHONPATH $OMICS_ROOT/lib/python3.5/site-packages 34 | 35 | setenv PYTHONUSERBASE $OMICS_ROOT 36 | -------------------------------------------------------------------------------- /phylosiftrc: -------------------------------------------------------------------------------- 1 | # PhyloSift run control file 2 | # 3 | # see also: 4 | # https://phylosift.wordpress.com/tutorials/running-phylosift/phylosift-run-control-file/ 5 | # 6 | 7 | use Env qw($OMICS_REFERENCE_DATA); 8 | 9 | my $common = "data9/flux/reference-data/phylosift"; 10 | my @ref_alternatives = ( 11 | "$OMICS_REFERENCE_DATA/phylosift", 12 | "/geomicro/$common", 13 | "/gmb/$common", 14 | "/dept/geology/geomicro/$common", 15 | ); 16 | 17 | foreach (@ref_alternatives) { 18 | $ref_data_path = $_ if (-d $_); 19 | } 20 | print "[phylosiftrc] using reference data from: $ref_data_path\n"; 21 | 22 | $marker_path = "$ref_data_path"; 23 | $ncbi_path = "$ref_data_path"; 24 | 25 | # prevent this: 26 | # Error: requested HMM banded DP mx of 4749.29 Mb > 2500.00 Mb limit. 27 | # Increase limit with --mxsize or tau with --tau. 
28 | $cm_align_long_mxsize = "10000"; 29 | $cm_align_short_mxsize = "10000"; 30 | 31 | # Use last-align bundled binaries if available 32 | 33 | $lastdb = "/usr/lib/phylosift/lastdb"; 34 | $lastal = "/usr/lib/phylosift/lastal"; 35 | 36 | -e $lastdb or $lastdb=""; 37 | -e $lastal or $lastal=""; 38 | -------------------------------------------------------------------------------- /scripts/COPYRIGHT.tetramer_freqs_esom: -------------------------------------------------------------------------------- 1 | For tetramer_freqs_esom: 2 | 3 | ############################################################################### 4 | Copyright (C) 2007 Anders Andersson (anders.andersson@scilifelab.se) 5 | 6 | This program is free software; you can redistribute it and/or 7 | modify it under the terms of the GNU General Public License 8 | as published by the Free Software Foundation; either version 2 9 | of the License, or (at your option) any later version. 10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with this program. If not, see . 18 | 19 | Anders Andersson 20 | Assistant Professor 21 | SciLifeLab 22 | School of Biotechnology 23 | KTH Royal Institute of Technology 24 | Stockholm, Sweden 25 | Email: anders.andersson@scilifelab.se 26 | ############################################################################### 27 | -------------------------------------------------------------------------------- /scripts/ESOM_binning_results_parser: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Derek Smith 4 | 5 | # This file is part of geo-omics-scripts. 
# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

# Build ESOM_binning_results.txt from all *.conf files in the current
# directory.
#
# Each input line that is not a '#' comment goes through these stages:
#   1. the first two whitespace-separated columns are swapped and written
#      tab-separated
#   2. the (new) second column gets the prefix "Bin_"
#   3. every underscore that sits between two digits becomes a dot
#   4. "k141." is turned back into "k141_"
#
# The stages run as one pipeline, so no intermediate files are written to
# (or left behind in) the working directory.

set -euo pipefail

# Collect the inputs up front so that a directory without *.conf files is
# reported clearly instead of failing somewhere inside the pipeline.
shopt -s nullglob
conf_files=(./*.conf)
shopt -u nullglob

if (( ${#conf_files[@]} == 0 )); then
    echo "No *.conf files found in the current directory" >&2
    exit 1
fi

out=ESOM_binning_results.txt

# Do not leave a truncated result file behind if any stage fails.
trap 'rc=$?; if (( rc != 0 )); then rm -f -- "$out"; fi' EXIT

cat -- "${conf_files[@]}" \
    | sed '/^#/ d' \
    | awk 'BEGIN{OFS="\t"}{print $2,$1}' \
    | awk 'BEGIN{OFS="\t"}{$2="Bin_"$2; print}' \
    | perl -pe 's/(?<=\d)_(?=\d)/./g' \
    | sed 's/k141\./k141_/g' \
    > "$out"
# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

"""
This python script will take the output from ebot obtain summaries and
extract the title line and the gi line.
"""

from sys import argv
import re


def _format_entry(gi, title, journal, fasta):
    """Return the output row for one entry, or None if it is suppressed.

    Entries whose journal is "Unpublished" or whose title is
    "Direct Submission " are not reported.
    """
    if journal == "Unpublished" or title == "Direct Submission ":
        return None
    return gi + "\t" + title + "\t" + journal + "\t" + fasta


def main():
    """Print GI, title, journal and sequence for each entry in argv[1].

    The file is scanned line by line.  A "VERSION" line starts a new entry
    and supplies the GI (third whitespace-separated field, "GI:" removed).
    The first TITLE line of an entry and its continuation lines, up to the
    next JOURNAL line, form the title.  Lines between "ORIGIN" and "//"
    supply the sequence.  One tab-separated row is printed per entry,
    including the last entry of the file.
    """
    # NOTE(review): the marker strings below are matched literally; confirm
    # the whitespace inside them against the layout of real ebot output.
    gi = ""
    title = ""
    journal = ""
    fasta = ""
    in_title = False      # currently collecting title continuation lines
    title_seen = False    # only the first TITLE of an entry is used
    seen_entry = False    # becomes True at the first VERSION or TITLE line
    in_sequence = False   # between ORIGIN and //

    with open(argv[1], 'r') as ebot_file:
        for line in ebot_file:
            if line.startswith("VERSION") and not in_title:
                # A new entry begins: report the one collected so far.
                if seen_entry:
                    row = _format_entry(gi, title, journal, fasta)
                    if row is not None:
                        print(row)
                title = ""
                fasta = ""
                gi = line.split()[2].replace("GI:", "")
                title_seen = False
                seen_entry = True

            elif line.startswith(" TITLE") and not title_seen:
                title = line.replace(" TITLE ", "").replace("\n", " ")
                in_title = True
                title_seen = True
                seen_entry = True

            elif " JOURNAL" in line and in_title:
                in_title = False
                journal = line.replace(" JOURNAL ", "").strip("\n")

            elif in_title:
                # continuation line of a multi-line title
                title += line.replace("\n", "").replace(" ", "") + " "

            elif line.startswith("ORIGIN"):
                in_sequence = True

            elif in_sequence and "//" not in line:
                # keep only the pieces that do not start with a digit,
                # i.e. drop the position counters
                for element in line.split(" "):
                    if re.match(r"\D", element):
                        fasta += element.strip("\t\n")

            elif "//" in line:
                in_sequence = False

    # The final entry has no following VERSION line to trigger its output.
    if seen_entry:
        row = _format_entry(gi, title, journal, fasta)
        if row is not None:
            print(row)


if __name__ == "__main__":
    main()
use strict;
use warnings;

# Usage: GI_info_XMLParser <input summary file> <output file>
#
# Scans the input line by line and writes to the output file:
#   - the digits directly in front of a closing </ID> tag, followed by a tab
#   - the text in front of a closing </Item> tag
#   - a newline for every closing </DocSum> tag
# All other lines are ignored.

my ($in, $out) = @ARGV;
die "Usage: $0 <input summary file> <output file>\n"
    unless defined $in && defined $out;

open(my $SUMM, '<', $in)  or die "[ERROR] $in: $!\n";
open(my $OUT,  '>', $out) or die "[ERROR] $out: $!\n";

while (my $line = <$SUMM>) {
    # NOTE(review): the patterns are anchored on the closing tags only;
    # confirm that no opening tag is meant to precede the capture groups.
    if ($line =~ m/(\d*)<\/ID>/i) {
        print $OUT $1."\t";
    }
    elsif ($line =~ m/([\w\W]*)<\/Item>/i) {
        print $OUT $1;
    }
    elsif ($line =~ m/<\/DocSum>/i) {
        print $OUT "\n";
    }
}

close $SUMM;
close $OUT or die "[ERROR] $out: $!\n";
#############################################################################################
#
# Parse metabat output into format that can be import as a collection into the anvio profile
#
############################################################################################

set -eu

[[ "$#" -gt 0 ]] || { echo "Arguments required: Metabat output files" >&2; exit 1; }

# For every input file, append the bin ID as an extra tab-separated column to
# each line and concatenate everything into one binning results file for
# anvio.  The bin ID is the file name without directory and without the .fa
# suffix, with ALL dots replaced by underscores to make anvio happy.
# (Assumes that filenames (if coming from different directories) don't collide)
for i in "$@"; do
    binid=${i##*/}          # strip directory part
    binid=${binid%.fa}      # strip .fa suffix
    binid=${binid//./_}     # replace every dot, not just the first one
    # The bin ID is handed over through the environment so that characters
    # such as '&' or '\' in a file name are taken literally; the input is
    # redirected so that odd file names are never parsed as awk arguments.
    binid="$binid" awk '{ print $0 "\t" ENVIRON["binid"] }' < "$i"
done > Metabat_binning_results.txt
use strict;
use Getopt::Long;

=head1 NAME

U2T - Converts U -> T and removes gaps


=head1 SYNOPSIS

B<U2T> B<-in> I<input.fasta> B<-out> I<output.fasta>


=head1 DESCRIPTION

Reads the fasta records of the input file, skips records that start with
C<#>, joins the sequence lines of each record, replaces every U (or u) with
T (or t), removes dots, dashes and whitespace from the sequence and writes
each record as a header line followed by a single sequence line.


=head1 OPTIONS

=over 8

=item B<-in> I<file>

input fasta file

=item B<-out> I<file>

output fasta file

=item B<-help>, B<-h>

This message.

=back

=cut

my $seqFile;
my $out;

GetOptions(
	"in:s"		=>	\$seqFile,
	"out:s"		=>	\$out,
	"h|help"	=>	sub {system('perldoc', $0); exit;},
);

die "Usage: $0 -in <input.fasta> -out <output.fasta>\n"
	unless $seqFile && $out;

open(my $SEQ, '<', $seqFile) or die "Couldn't open $seqFile\n";
open(my $OUT, '>', $out) or die "Couldn't open $out: $!\n";

{
	# read one fasta record at a time
	local $/ = ">";
	while (my $line = <$SEQ>) {
		next if $line =~ m/^#/;
		chomp $line;
		$line =~ s/\r//g;	# drop every carriage return, not only the first
		next unless $line;

		my ($seqDesc, @sequence) = split(/\n/, $line);
		my $seq = join("", @sequence);

		$seq =~ tr/Uu/Tt/;		# RNA -> DNA, either case
		$seq =~ s/[\.\-\s]//g;	# remove gap characters and whitespace
		print $OUT ">". $seqDesc."\n".$seq."\n";
	}
}
close $SEQ;
close $OUT or die "Error writing $out: $!\n";
# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

#################################################################################
#
# DJS 5 September 2018
#
# Run this shell script in a directory containing bin fasta files to get a
# summary tab-delimited file to import the bins as a collection in ANVIO.  It
# was originally written for VizBin collections, but will work for any group of
# Bin fastas generated from any binning program
#
#################################################################################

set -eu

# Collect the bin fasta files; stop with a clear message if there are none
# rather than processing a literal "*.fa".
shopt -s nullglob
bins=(*.fa)
shopt -u nullglob

if (( ${#bins[@]} == 0 )); then
    echo "No *.fa files found in the current directory" >&2
    exit 1
fi

# For every fasta header write one line: the header without ">", a tab, and
# the bin name (file name without its .fa extension).  Everything is streamed
# straight into the result file, so no *.list intermediates are created and
# no unrelated *.list files in the directory are read or deleted.  Only the
# .fa suffix of the file name is removed; contig names are left untouched.
# NOTE(review): the delimiter is a tab, following the "tab-delimited"
# description above; confirm this is what the anvio import expects.
for bin in "${bins[@]}"; do
    bin_name="${bin%.fa}" awk '
        /^>/ {
            gsub(/>/, "")
            print $0 "\t" ENVIRON["bin_name"]
        }
    ' < "$bin"
done > VizBin_binning_results.txt
# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

# Print a short summary of the antiSMASH results in the current directory:
#   - number of lines in Overview.geneclusters.txt not starting with ">"
#   - number of lines in Overview.smcogs.txt starting with ">>"
#   - number of distinct top-level directories that have entries under
#     <dir>/structures/ (the directory names are saved in structures.list)

set -e

echo "GeneClusters: $(grep -vc "^>" Overview.geneclusters.txt)"
echo "smcogs: $(grep -c "^>>" Overview.smcogs.txt)"

# Let the shell expand the glob rather than parsing ls output; with nullglob
# a directory without any structures simply yields an empty list.
shopt -s nullglob
for path in */structures/*; do
    printf '%s\n' "${path%%/*}"
done | sort -u > structures.list
shopt -u nullglob

# Count via stdin so that only the number is printed, not the file name.
echo "Structures: $(grep -c '' structures.list)"
#!/usr/bin/env python3

# Copyright 2020 Regents of The University of Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
"""
Update fasta and map files with existing ASV accessions

Takes a reference fasta file of known ASV sequences, and from a Mothur run a
sequence id->OTU map and fasta file and replaces the OTUs in the map with ASVs.
The output is a two-column table, mapping mothur OTUs to ASVs from the
reference.

All fasta input must be 1-line per sequence.

This script is part of the Schmidt Lab 16S mothur pipeline.
"""
import argparse
import sys


def get_argparser():
    """Return the command line parser of this script."""
    argp = argparse.ArgumentParser(description=__doc__)
    argp.add_argument(
        'asv_fa',
        metavar='asv-fasta',
        type=argparse.FileType(),
        help='Fasta file sequences with ASV accessions',
    )
    argp.add_argument(
        'map',
        metavar='seq-otu-map',
        type=argparse.FileType(),
        help='Mothur Sequence ID to OTU mapping, a two-column table, '
             'usually with suffix asv0.precluster.denovo.uchime.pick.map',
    )
    argp.add_argument(
        'fasta',
        type=argparse.FileType(),
        help='Mothur fasta file. Fasta headers should contain sequence IDs.',
    )
    argp.add_argument(
        '-o', '--output',
        default=sys.stdout,
        type=argparse.FileType('w'),
        help='A two-column output mapping OTUs to ASVs, this file can then '
             'be used with the --map option for the shared-set-accessions '
             'script',
    )
    argp.add_argument('--version', action='version', version='%(prog)s '
                      'is part of geo-omics-scripts VERSION_PLACEHOLDER')
    return argp


def _fasta_lines(handle):
    """
    Yield (identifier, line) for every non-blank sequence line of a fasta file

    The identifier is the first word of the most recent header line.  Blank
    lines are skipped.  Raises ValueError if sequence data precedes the first
    header.
    """
    cur = None
    for line in handle:
        line = line.strip()
        if not line:
            continue
        if line.startswith('>'):
            cur = line.lstrip('>').split()[0]  # take "first word" as id
            continue
        if cur is None:
            raise ValueError('sequence data found before first fasta header')
        yield cur, line


def load_asvs(handle):
    """Return a dict mapping sequence to ASV accession."""
    return {seq: asv for asv, seq in _fasta_lines(handle)}


def load_seqs(handle):
    """Return a dict mapping sequence id to sequence with alignment removed."""
    return {
        seqid: seq.replace('-', '').strip('.')  # rm alignment
        for seqid, seq in _fasta_lines(handle)
    }


def map_otus(handle, seqs, asvs):
    """
    Return a dict mapping OTU to ASV

    handle yields the tab-separated sequence id / OTU rows.  Rows whose
    sequence id or sequence is unknown are skipped, as are blank lines.
    """
    otu2asv = {}
    for line in handle:
        line = line.strip()
        if not line:
            continue
        seqid, otu = line.split('\t')
        try:
            otu2asv[otu] = asvs[seqs[seqid]]
        except KeyError:
            continue
    return otu2asv


def main(argv=None):
    """Run the script, argv defaults to the command line arguments."""
    args = get_argparser().parse_args(argv)

    asvs = load_asvs(args.asv_fa)
    seqs = load_seqs(args.fasta)
    otu2asv = map_otus(args.map, seqs, asvs)

    for infile in (args.asv_fa, args.fasta, args.map):
        if infile is not sys.stdin:
            infile.close()

    for otu, asv in sorted(otu2asv.items(), key=lambda x: x[0]):
        args.output.write(f'{otu}\t{asv}\n')

    if args.output is sys.stdout:
        args.output.flush()
    else:
        args.output.close()


if __name__ == '__main__':
    main()
#!/usr/bin/env python3

# Copyright 2019 Regents of The University of Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

# Command line entry point: hands control to main() of the omics.bins2fasta
# module.
from omics.bins2fasta import main


main()
# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.


use strict;
use warnings;

# Usage: calcN50 <fasta file>
#
# Prints, tab-separated, one value per line:
#   Total_Contigs  number of sequences in the fasta file
#   N50 / L50      length of the contig at which the cumulative length of
#                  the contigs (longest first) reaches 50% of the total
#                  length, and the number of contigs needed to get there
#   N95 / L95      the same for 95% of the total length

my $fasta=$ARGV[0];
defined $fasta or die "Usage: $0 <fasta file>\n";

## Read Fasta File and compute N50, L50, N95 and L95 ##
my $totalLength = 0;
my $totalContigs = 0;
my @allLen;
open(my $FASTA, '<', $fasta) or die "$fasta: $!\n";
{
	# one fasta record per read
	local $/ = ">";
	while (my $record = <$FASTA>) {
		chomp $record;
		next unless $record;

		my ($header, @sequence) = split(/\n/, $record);
		my $seq = join("", @sequence);
		$seq =~ s/\s+//g;	# do not count carriage returns or blanks
		my $length = length($seq);

		push(@allLen, $length);
		$totalLength += $length;
		$totalContigs++;
	}
}
close($FASTA);

my @sortedLen = sort {$b <=> $a} @allLen;
my $cumLen = 0;
my $numContig = 0;
my ($n50, $l50, $n95, $l95);
foreach my $len (@sortedLen) {
	$cumLen += $len;
	$numContig++;
	# record the first contig that crosses each threshold; keep going after
	# the 50% mark so that the 95% mark is actually reached
	if (!defined $n50 && $cumLen >= $totalLength * 0.50) {
		($n50, $l50) = ($len, $numContig);
	}
	if ($cumLen >= $totalLength * 0.95) {
		($n95, $l95) = ($len, $numContig);
		last;
	}
}

print "Total_Contigs:\t$totalContigs\n";
if (defined $n50) {
	print "N50:\t$n50\n";
	print "L50:\t$l50\n";
}
if (defined $n95) {
	print "N95:\t$n95\n";
	print "L95:\t$l95\n";
}
# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.


=head1 NAME

clusterDensity - report node and cluster counts per line of a clusters file


=head1 SYNOPSIS

B<clusterDensity> [B<-clusters> I<file>]


=head1 DESCRIPTION

Reads a tab-separated file with three columns: a threshold, an f-measure and
a combined cluster string.  For every line the script prints, tab-separated,
the threshold, the number of comma-separated pieces of the cluster string,
the number of semicolon-separated pieces of the cluster string and the ratio
of the two counts.  Lines without a cluster string are skipped with a
warning.


=head1 OPTIONS

=over 8

=item B<-clusters>, B<-c> I<file>

input file; default: results.txt

=item B<-version>, B<-v>

version of the current script

=item B<-help>, B<-h>

This message.

=back


=head1 AUTHOR

Sunit Jain, (Mon Mar  2 16:41:57 EST 2015)
sunitj [AT] umich [DOT] edu

=cut

use strict;
use Getopt::Long;
use FileHandle;
use File::Basename;

my $version=fileparse($0)."\tv0.0.1b";
my $clustFile="results.txt";
GetOptions(
	'c|clusters:s'=>\$clustFile,
	'v|version'=>sub{print $version."\n"; exit;},
	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
);
print "\# $version\n";

open(my $FILE, "<", $clustFile) or die "$clustFile: $!\n";
while (my $line = <$FILE>) {
	chomp $line;
	next unless $line;

	my ($thresh, $fMeasure, $combinedClust) = split(/\t/, $line);
	my $totalNodes = defined $combinedClust
		? scalar(my @byComma = split(/\,/, $combinedClust)) : 0;
	my $totalClusters = defined $combinedClust
		? scalar(my @bySemiColon = split(/\;/, $combinedClust)) : 0;

	# a line without clusters would otherwise abort the whole run with a
	# division by zero
	if ($totalClusters == 0) {
		warn "[WARNING] line $.: no clusters found, skipped\n";
		next;
	}
	print $thresh."\t".$totalNodes."\t".$totalClusters."\t".($totalNodes/$totalClusters)."\n";
}
close $FILE;
#!/usr/bin/perl

# Copyright 2013, 2015, 2019 Regents of The University of Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.


=head1 NAME

countInstances - count the number of times a value is seen in a column


=head1 SYNOPSIS

B<countInstances> B<-in> tab-delimited-input.txt B<-out> output-filename.txt B<-col> column-number


=head1 DESCRIPTION

count the number of times a value is seen in a column


=head1 OPTIONS

=over 8

=item B<-in> I<file>

input file name

=item B<-out> I<file>

output file name

=item B<-col> I<integer>

column to subtotal; start counting from 1; default: 2.

=item B<-sum> I<integer>

sum the column # instead of incrementing by 1.

=back


=head1 AUTHOR

Sunit Jain, (Thu Jul 18 20:37:35 EDT 2013)
sunitj [AT] umich [DOT] edu

=cut

use strict;
use Getopt::Long;

my $version="0.0.1b";
my $col=2;
my ($in, $out, $sum);
GetOptions(
	'in=s'=>\$in,
	'o|out=s'=>\$out,
	'col:i'=>\$col,
	'sum:i'=>\$sum,
	'v|version'=>sub{print $version."\n"; exit;},
	'h|help'=>sub{system('perldoc', $0); exit;},
);

die "Usage: $0 -in <input.txt> -out <output.txt> [-col N] [-sum N]\n"
	unless defined $in && defined $out;

# Convert the 1-based column numbers to 0-based array indices exactly once,
# before any line is read, so that every line uses the same columns.
$col--;
my $sumIdx;
$sumIdx = $sum - 1 if $sum;

my %counts;
open(my $IN, '<', $in) or die "$in: $!\n";
while (my $line = <$IN>) {
	chomp $line;
	next if $line =~ /^#/;

	my @cols = split(/\t/, $line);
	if (defined $sumIdx) {
		$counts{$cols[$col]} += $cols[$sumIdx];
	}
	else {
		$counts{$cols[$col]}++;
	}
}
close $IN;

open(my $OUT, '>', $out) or die "$out: $!\n";
# sorted for reproducible output
foreach my $key (sort keys %counts) {
	print $OUT $key."\t".$counts{$key}."\n";
}
close $OUT or die "$out: $!\n";
# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.


use warnings;
use strict;
use File::Basename;

# Usage: createFastq <sequences.fasta|sequences.fna>
#
# Combines the given fasta file with the quality file <basename>.qual into
# <basename>.fastq, where <basename> is the fasta file name without
# directory and without a .fasta or .fna suffix.  Quality values are
# whitespace-separated numbers; each is written as the character with code
# value + 33.  Records are matched by their complete definition line.

my $inFasta = $ARGV[0];
defined $inFasta or die "Usage: $0 <sequences.fasta|sequences.fna>\n";
my $baseName = basename($inFasta, qw/.fasta .fna/);
my $inQual = $baseName . ".qual";
my $outFastq = $baseName . ".fastq";

my %seqs;

# read one ">"-delimited record at a time
$/ = ">";

open(my $FASTA, '<', $inFasta) or die "[ERROR] $inFasta: $!\n";
my $junk = <$FASTA>;	# discard whatever precedes the first ">"

while (my $frecord = <$FASTA>) {
	chomp $frecord;
	my ($fdef, @seqLines) = split /\n/, $frecord;
	my $seq = join '', @seqLines;
	$seqs{$fdef} = $seq;
}

close $FASTA;

open(my $QUAL, '<', $inQual) or die "[ERROR] $inQual: $!\n";
$junk = <$QUAL>;	# discard whatever precedes the first ">"
open(my $FASTQ, '>', $outFastq) or die "[ERROR] $outFastq: $!\n";

while (my $qrecord = <$QUAL>) {
	chomp $qrecord;
	my ($qdef, @qualLines) = split /\n/, $qrecord;
	next unless defined $qdef;

	unless (exists $seqs{$qdef}) {
		warn "[WARNING] no sequence found for '$qdef', record skipped\n";
		next;
	}

	# split on any run of whitespace so that repeated or trailing blanks
	# do not turn into extra quality characters
	my @quals = split ' ', join(' ', @qualLines);
	print $FASTQ "@","$qdef\n";
	print $FASTQ "$seqs{$qdef}\n";
	print $FASTQ "+\n";
	foreach my $qual (@quals) {
		print $FASTQ chr($qual + 33);
	}
	print $FASTQ "\n";
}

close $QUAL;
close $FASTQ or die "[ERROR] $outFastq: $!\n";
6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | =head1 NAME 22 | 23 | createNodes - Do this. 24 | 25 | 26 | =head1 SYNOPSIS 27 | 28 | B 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | Do this. 34 | 35 | 36 | =head1 OPTIONS 37 | 38 | =over 8 39 | 40 | =item B<-config>, B<-c> I 41 | 42 | Config file explicitly declaring which columns to make nodes out of and which the properties; Recommended but Optional; default = guess. 43 | 44 | =item B<-nodes>, B<-n> I 45 | 46 | Read this nodes file created by one of the parsers and create nodes in the database; Required. 47 | 48 | =item B<-port>, B<-p> I 49 | 50 | Port which the database is listening to; default: 7474. 51 | 52 | =item B<-version>, B<-v> I 53 | 54 | version of the current script 55 | 56 | =item B<-help>, B<-h> I 57 | 58 | This message. 
59 | 60 | =back 61 | 62 | 63 | =head1 AUTHOR 64 | 65 | Sunit Jain, (Tue Jun 30 08:14:43 EDT 2015) 66 | sunitj [AT] umich [DOT] edu 67 | 68 | 69 | =head1 SEE ALSO 70 | 71 | L, L 72 | 73 | =head2 Other local resources 74 | 75 | =over 76 | 77 | =item [1] 78 | 79 | L 80 | 81 | =item [2] 82 | 83 | L 84 | 85 | =back 86 | 87 | =head2 Web 88 | 89 | =over 90 | 91 | =item [3] 92 | 93 | L 94 | 95 | =item [4] 96 | 97 | L 98 | 99 | =item [5] 100 | 101 | L 102 | 103 | =back 104 | 105 | =cut 106 | 107 | use strict; 108 | use Getopt::Long; 109 | use FileHandle; 110 | use File::Basename; 111 | 112 | my $help; 113 | my $version=fileparse($0)."\tv0.0.1b"; 114 | my ($configFile, $nodesFile); 115 | my $port = 7474; 116 | GetOptions( 117 | 'c|config:s'=>\$configFile, 118 | 'n|nodes:s'=>\$nodesFile, # string-typed like -config: the value is a file path handed to open() 119 | 'p|port:i'=>\$port, 120 | 'v|version'=>sub{print $version."\n"; exit;}, 121 | 'h|help'=>sub{system("perldoc $0 \| cat"); exit;}, 122 | ); 123 | print "\# $version\n"; 124 | 125 | my $NODES=FileHandle->new(); 126 | open( $NODES, "<", $nodesFile) || die $!; 127 | while(my $line=<$NODES>){ 128 | 129 | } 130 | close $NODES; 131 | 132 | 133 | -------------------------------------------------------------------------------- /scripts/curateDB: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | use strict; 21 | use Getopt::Long; 22 | 23 | my $in; 24 | my $out = $$.".fasta"; 25 | my $delTmp; 26 | my $hasGItags; 27 | my $printDuplicates; 28 | 29 | GetOptions( 30 | 'i|in:s'=> \$in, 31 | 'o|out:s'=> \$out, 32 | 't|del'=> \$delTmp, 33 | 'g|gi'=>\$hasGItags, 34 | 'd|duplicates'=>\$printDuplicates, 35 | 'h|help'=> sub{ system('perldoc', $0); exit; }, 36 | ); 37 | 38 | if (! $in){ system('perldoc', $0); exit; } 39 | 40 | open (IN, $in)||die "[ERROR] $in: $!\n"; 41 | my $tmpFile= $$.".tmp"; 42 | open (TMPW, ">".$tmpFile); 43 | my $u=0; 44 | my $s=0; 45 | my $t=0; 46 | my %seen; 47 | 48 | while(my $line=){ 49 | chomp $line; 50 | next unless $line; 51 | if ($line=~ m/^>/){ 52 | $t++; 53 | $line=~ s/\>/\+/g; 54 | $line=~ s/^\+/>/; 55 | print TMPW $line."\n"; 56 | } 57 | else{ 58 | print TMPW $line."\n"; 59 | } 60 | } 61 | close IN; 62 | close TMPW; 63 | print "#Number of Headers before Curation: $t\n"; 64 | 65 | open (TMPR, $tmpFile) || die "[ERROR] $tmpFile: $!\n"; 66 | open (OUT, ">".$out); 67 | my $printSeq= 0; 68 | $/= ">"; 69 | while (my $line=){ 70 | chomp $line; 71 | next unless $line; 72 | 73 | if ($hasGItags){ 74 | my ($giTag, $giNum, @etc)=split(/\|/, $line); 75 | print OUT ">".$line unless ($seen{$giNum}); 76 | $seen{$giNum}++; 77 | } 78 | else{ 79 | my($name, @seqs)=split(/\n/,$line); 80 | print OUT ">".$line."\n" unless $seen{$name}; 81 | $seen{$name}++; 82 | } 83 | } 84 | $/= "\n"; 85 | close TMPR; 86 | close OUT; 87 | print "#Number of Sequences after Curation: ".keys(%seen)."\n"; 88 | if ($printDuplicates){ 89 | 90 | foreach my $k(keys %seen){ 91 | #print "."; 92 | print $k."\t".$seen{$k}."\n" if ($seen{$k} >1); 93 | } 94 | } 95 | if (! 
$delTmp){ 96 | print "#Deleting temporary file...\n"; 97 | unlink $tmpFile; 98 | } 99 | exit; 100 | -------------------------------------------------------------------------------- /scripts/dada2shared: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2020 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 
19 | 20 | """ 21 | Convert a DADA2 sequence table into a fasta file and a mothur shared file 22 | """ 23 | import argparse 24 | from pathlib import Path 25 | import re 26 | 27 | 28 | argp = argparse.ArgumentParser(description=__doc__) 29 | argp.add_argument( 30 | 'infile', 31 | metavar='dada2_sequence_table', 32 | type=argparse.FileType(), 33 | help='Sequence table from dada2, saved with write.table()', 34 | ) 35 | argp.add_argument('--version', action='version', version='%(prog)s ' 36 | 'is part of geo-omics-scripts VERSION_PLACEHOLDER') 37 | args = argp.parse_args() 38 | 39 | # compute output file names 40 | basename = Path(args.infile.name).stem 41 | outfasta = Path() / (basename + '.fa') 42 | outshared = Path() / (basename + '.shared') 43 | 44 | # read sequences 45 | seqs = enumerate(args.infile.readline().strip().strip('"').split('" "'), 46 | start=1) 47 | seqs = [('{}_{}'.format(basename, i), s) for i, s in seqs] 48 | 49 | print('Total ASVs:', len(seqs)) 50 | 51 | with outfasta.open('w') as fasta: 52 | for id, seq in seqs: 53 | fasta.write('>{}\n{}\n'.format(id, seq)) 54 | 55 | print('Fasta file written:', outfasta) 56 | 57 | pat = re.compile(r'"([^"])"') 58 | with outshared.open('w') as shared: 59 | header = ['label', 'Group', 'numOtus'] + [i for i, _ in seqs] 60 | shared.write('\t'.join(header) + '\n') 61 | total = len(seqs) 62 | replace_warning_shown = False 63 | for row in args.infile: 64 | sample, _, counts = row.strip().partition(' ') 65 | sample = sample.strip('"') 66 | if '-' in sample: 67 | sample = sample.replace('-', '_') 68 | if not replace_warning_shown: 69 | print('Some sample ids have dashes replaced by underscores to ' 70 | 'make them mothur-compatible') 71 | replace_warning_shown = True 72 | counts = counts.replace(' ', '\t') 73 | shared.write('dada2asv\t{}\t{}\t{}\n'.format(sample, total, counts)) 74 | 75 | print('Shared file written:', outshared) 76 | -------------------------------------------------------------------------------- 
/scripts/derep_getReadAbundance: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | 3 | # Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | use strict; 22 | use Getopt::Long; 23 | 24 | my ($list,$bam, $stats, $fwdClust, $revClust); 25 | my $out=$$.".abundance"; 26 | 27 | GetOptions( 28 | 'b|bam=s'=>\$bam, # bind to $bam: that is the variable the samtools call reads; $list was never used 29 | 'fwd=s'=>\$fwdClust, 30 | 'rev=s'=>\$revClust, 31 | 'stats=s'=>\$stats, 32 | 'o|out:s'=>\$out, 33 | 'h'=>sub{system('perldoc', $0); exit;}, 34 | ); 35 | my %clust; 36 | 37 | open (FCLUST, $fwdClust) || die "[ERROR] $fwdClust:$!\n"; 38 | while(my $line=<FCLUST>){ 39 | next if ($line=~ /^#/); 40 | chomp($line); 41 | $line=~ s/\r//; 42 | next unless $line; 43 | 44 | my($cNum, $size, $rep, @seqNames)=split(/\t/, $line); 45 | my ($name, $strand)=split(/\s/, $rep); 46 | $name=~ s/^@//; 47 | 48 | $clust{$name}=$size; 49 | } 50 | close FCLUST; 51 | 52 | open (RCLUST, $revClust) || die "[ERROR] $revClust:$!\n"; 53 | while(my $line=<RCLUST>){ 54 | next if ($line=~ /^#/); 55 | chomp($line); 56 | $line=~ s/\r//; 57 | next unless $line; 58 | 59 | my($cNum, $size, $rep, @seqNames)=split(/\t/, $line); 60 | my ($name, $strand)=split(/\s/, $rep); 61 | $name=~ s/^@//; 62 | 63 | if
($clust{$name}){ 64 | $clust{$name}+=$size; 65 | } 66 | else{ 67 | $clust{$name}=$size; 68 | } 69 | } 70 | close RCLUST; 71 | 72 | print "All Clusters read into memory...\n"; 73 | 74 | open (STATS, $stats); 75 | my %index; 76 | while(my $line=){ 77 | chomp $line; 78 | next unless $line; 79 | 80 | my ($name, $len, $mapped, $unmapped)=split(/\t/, $line); 81 | $name=~ s/\|/\\\|/; 82 | 83 | $index{$name}=$mapped; 84 | } 85 | close STATS; 86 | 87 | print "All Stats read into memory...\n"; 88 | 89 | open(OUT, ">".$out); 90 | print OUT "Gene\tDerepMapped\tTotalMapped\n"; 91 | my $allReads=0; 92 | foreach my $i(keys %index){ 93 | my @list=`samtools view -F0x4 $bam $i | cut -f 1`; 94 | chomp @list; 95 | 96 | my $totalMapped=0; 97 | foreach my $l(@list){ 98 | $totalMapped+= $clust{$l}; 99 | $allReads+=$clust{$l}; 100 | } 101 | my $derepMapped= $index{$i}; 102 | print OUT $i."\t".$derepMapped."\t".$totalMapped."\n"; 103 | } 104 | 105 | print "Total Reads mapped: $allReads\n"; 106 | -------------------------------------------------------------------------------- /scripts/do2folder: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 
19 | 20 | #function do2all() { 21 | if [ -z "$1" ]; then echo "command failed: Give a directory name with files" >&2; exit 1; fi 22 | if [ ! -d "$1" ]; then echo "$1 does not exist" >&2; exit 1; fi 23 | for i in "$1"/*; do # quoted so directory and file names containing spaces survive 24 | if [ ! -s "$i" ]; then echo "$i is empty; skipping..."; continue; fi 25 | OUT="$i.out" 26 | # replace the following line with the desired command and $i as input and $OUT as output 27 | echo "IN: $i, OUT:$OUT" 28 | done 29 | #} 30 | -------------------------------------------------------------------------------- /scripts/do2list: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | if [ -z "$1" ]; then echo "command failed: Give a file name with list" >&2; exit 1; fi 21 | if [ ! -s "$1" ]; then echo "$1 does not exist or is empty" >&2; exit 1; fi 22 | for i in $(grep "^" "$1"); do 23 | if [ !
-s "$i" ]; then echo "$i is empty; skipping..."; continue; fi 24 | OUT="$i.out" 25 | # replace the following line with the desired command and $i as input and $OUT as output 26 | echo "IN: $i, OUT:$OUT" 27 | done 28 | -------------------------------------------------------------------------------- /scripts/extractEuks: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | legacy_consolidateJGIdata -DIR . -OUTDIR consolidated 21 | awk < consolidated/Unclassified.tsv -F'\t' '{ print $6, "\t", $2 }' > locus_contig.list # literal tab needed: the later 'cut -f 2' splits on it 22 | awk < *phylodist -F'\t' '{ print $1, "\t", $5 }' | cut -f 1 -d ";" | grep "Eukaryota" | cut -f 1 > eukaryota.list 23 | fgrep -f eukaryota.list locus_contig.list | cut -f 2 | sort -u | sed "s# ##" > eukaryota_contigs.list 24 | extractSeqs -e -l eukaryota_contigs.list -f *.fna -o euksRemoved.fasta 25 | -------------------------------------------------------------------------------- /scripts/firefox_already_running: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | LOCK=$(find $HOME/.mozilla/firefox/ -name lock) 22 | PLOCK=$(find $HOME/.mozilla/firefox/ -name \.parentlock) 23 | 24 | echo "Deleting Lock file: $LOCK" 25 | rm -f $LOCK 26 | echo "Deleting Parent Lock file: $PLOCK" 27 | rm -f $PLOCK 28 | 29 | echo "To see why these files had to be deleted, see: http://www.mattcutts.com/blog/how-to-fix-firefox-is-already-running-error/" 30 | echo "Try running firefox again..." 31 | -------------------------------------------------------------------------------- /scripts/fixpod2: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | """ 21 | Fix additional stuff in POD in perl scripts 22 | 23 | Note: Makes assumptions about input, little error checking 24 | """ 25 | import argparse 26 | import re 27 | import sys 28 | 29 | 30 | argp = argparse.ArgumentParser(description=__doc__) 31 | argp.add_argument('inputfile', type=argparse.FileType()) 32 | argp.add_argument('-w', '--write-to-file', action='store_true') 33 | 34 | args = argp.parse_args() 35 | 36 | out = '' 37 | 38 | for line in args.inputfile: 39 | if line.startswith('=item '): 40 | item_line = line.strip() 41 | 42 | empty_line = args.inputfile.readline() # empty line following =item 43 | if empty_line.strip(): 44 | raise RuntimeError('non-empty line following =head') 45 | 46 | descr = args.inputfile.readline() 47 | m = re.match(r'^(or|OR)\s*-(?P\w+)\s*:?\s*(?P.*)$', descr) 48 | if m is None: 49 | # remove any leading ':\t+' from description 50 | descr = re.sub('^:\s+', '', descr.strip()) 51 | else: 52 | # Fix options with 'or' between long and short option name 53 | _, opt, descr = m.groups() 54 | item_line = item_line + ', B<-{}>'.format(opt) 55 | 56 | out += item_line + '\n\n' + descr + '\n' 57 | else: 58 | out += line 59 | 60 | # write output 61 | if args.write_to_file: 62 | args.inputfile.close() 63 | outfile = open(args.inputfile.name, 'w') 64 | else: 65 | outfile = sys.stdout 66 | 67 | outfile.write(out) 68 | -------------------------------------------------------------------------------- /scripts/fixpod3: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 
6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | """ 21 | Fix additional stuff in POD in perl scripts 22 | 23 | Note: Makes assumptions about input, little error checking 24 | """ 25 | import argparse 26 | import re 27 | import sys 28 | 29 | 30 | argp = argparse.ArgumentParser(description=__doc__) 31 | argp.add_argument('inputfile', type=argparse.FileType()) 32 | argp.add_argument('-w', '--write-to-file', action='store_true') 33 | 34 | args = argp.parse_args() 35 | 36 | out = '' 37 | 38 | for line in args.inputfile: 39 | if line.startswith('=item '): 40 | item_line = line.strip() 41 | 42 | empty_line = args.inputfile.readline() # empty line following =item 43 | if empty_line.strip(): 44 | raise RuntimeError('non-empty line following =head') 45 | 46 | descr = args.inputfile.readline() 47 | m = re.match( 48 | r'^\[(boolean|characters?|flag|float|integers?|real number)\]' 49 | r'\s*:?\s*(?P.*)$', 50 | descr, 51 | flags=re.IGNORECASE 52 | ) 53 | if m is not None: 54 | # move non-boolean types to =item line 55 | valtype, descr = m.groups() 56 | descr += '\n' # add nl lost by matching 57 | valtype = valtype.lower().rstrip('s') 58 | if valtype not in ['flag', 'boolean']: 59 | opts = item_line[6:] # remove leading '=item ' 60 | opts = opts.split(', ') 61 | opts = [i + ' I<{}>'.format(valtype) for i in opts] 62 | opts = ', '.join(opts) 63 | item_line 
= '=item ' + opts 64 | 65 | out += item_line + '\n\n' + descr 66 | else: 67 | out += line 68 | 69 | # write output 70 | if args.write_to_file: 71 | args.inputfile.close() 72 | outfile = open(args.inputfile.name, 'w') 73 | else: 74 | outfile = sys.stdout 75 | 76 | outfile.write(out) 77 | -------------------------------------------------------------------------------- /scripts/fixpod4: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 
19 | 20 | """ 21 | Fix additional stuff in POD in perl scripts 22 | 23 | Note: Makes assumptions about input, little error checking 24 | """ 25 | import argparse 26 | import re 27 | import sys 28 | 29 | 30 | argp = argparse.ArgumentParser(description=__doc__) 31 | argp.add_argument('inputfile', type=argparse.FileType()) 32 | argp.add_argument('-w', '--write-to-file', action='store_true') 33 | argp.add_argument('-c', '--check-name-only', action='store_true') 34 | 35 | args = argp.parse_args() 36 | 37 | out = '' 38 | 39 | for line in args.inputfile: 40 | if line.startswith('=head1 NAME'): 41 | out += line 42 | 43 | empty_line = args.inputfile.readline() # empty line following =head 44 | if empty_line.strip(): 45 | raise RuntimeError('non-empty line following =head') 46 | 47 | short_descr = args.inputfile.readline() 48 | m = re.match(r'^{} - '.format(args.inputfile.name), short_descr) 49 | if m is None: 50 | if args.check_name_only: 51 | print('bad NAME section:', args.inputfile.name, 52 | '=>', short_descr[:90].strip()) 53 | continue 54 | # best effort sub '^something -- ' ===> 'realname - ' 55 | short_descr = re.sub( 56 | r'^.*\s+--\s+', 57 | '{} - '.format(args.inputfile.name), 58 | short_descr 59 | ) 60 | else: 61 | # all good 62 | pass 63 | 64 | out += '\n' + short_descr 65 | else: 66 | out += line 67 | 68 | if args.check_name_only: 69 | sys.exit() 70 | 71 | # write output 72 | if args.write_to_file: 73 | args.inputfile.close() 74 | outfile = open(args.inputfile.name, 'w') 75 | else: 76 | outfile = sys.stdout 77 | 78 | outfile.write(out) 79 | -------------------------------------------------------------------------------- /scripts/fixpod5: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 
6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | """ 21 | Add SEE ALSO section to POD in perl scripts 22 | 23 | Note: Makes assumptions about input, little error checking 24 | """ 25 | import argparse 26 | import sys 27 | 28 | 29 | argp = argparse.ArgumentParser(description=__doc__) 30 | argp.add_argument('inputfile', type=argparse.FileType()) 31 | argp.add_argument('-w', '--write-to-file', action='store_true') 32 | 33 | args = argp.parse_args() 34 | 35 | see_also = """ 36 | =head1 SEE ALSO 37 | 38 | L, L 39 | 40 | =head3 Other local resources 41 | 42 | =over 43 | 44 | =item 1 45 | 46 | L 47 | 48 | =item 2 49 | 50 | L 51 | 52 | =back 53 | 54 | =head3 Web 55 | 56 | =over 57 | 58 | =item 1 59 | 60 | L 61 | 62 | =item 2 63 | 64 | L 65 | 66 | =item 3 67 | 68 | L 69 | 70 | =back 71 | 72 | """ 73 | 74 | out = '' 75 | done = False 76 | 77 | for line in args.inputfile: 78 | if line.startswith('=cut') and not done: 79 | # ensure consistent empty lines between sections 80 | out = out.rstrip() + '\n\n' 81 | # add section at the end 82 | out += see_also 83 | done = True # in case we find a second =cut 84 | 85 | out += line 86 | 87 | # write output 88 | if args.write_to_file: 89 | args.inputfile.close() 90 | outfile = open(args.inputfile.name, 'w') 91 | else: 92 | outfile = sys.stdout 93 | 94 | outfile.write(out) 95 | 
-------------------------------------------------------------------------------- /scripts/fixpod6: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | """ 21 | Improve SEE ALSO section to POD in perl scripts 22 | 23 | Note: Makes assumptions about input, little error checking 24 | """ 25 | import argparse 26 | import re 27 | import sys 28 | 29 | 30 | argp = argparse.ArgumentParser(description=__doc__) 31 | argp.add_argument('inputfile', type=argparse.FileType()) 32 | argp.add_argument('-w', '--write-to-file', action='store_true') 33 | 34 | args = argp.parse_args() 35 | 36 | out = '' 37 | in_see_also = False 38 | ref_count = 1 39 | 40 | for line in args.inputfile: 41 | if not in_see_also and line.startswith('=head1 SEE ALSO'): 42 | in_see_also = True 43 | if in_see_also and line.startswith('=cut'): 44 | in_see_also = False 45 | 46 | if in_see_also: 47 | line = re.sub(r'^=head3', '=head2', line) 48 | if line.startswith('=item'): 49 | line = '=item [{}]\n'.format(ref_count) 50 | ref_count += 1 51 | 52 | out += line 53 | 54 | # write output 55 | if args.write_to_file: 56 | args.inputfile.close() 57 | outfile = 
open(args.inputfile.name, 'w') 58 | else: 59 | outfile = sys.stdout 60 | 61 | outfile.write(out) 62 | -------------------------------------------------------------------------------- /scripts/folderLevelSize: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | # This script will only look at folders and sub-folders of the present working directory. You'll HAVE TO paste this script to the other folder if you want it's stats. Also make sure you have read permissions for the folders and sub folders before you run this script. 22 | 23 | use strict; 24 | 25 | #my $path= `pwd`; 26 | 27 | my $level=$ARGV[0]; 28 | my $tmp=$$.".tmp"; 29 | 30 | `du -h > $tmp`; 31 | 32 | 33 | if(! 
$ARGV[0]){ $level =1;} # default to depth 1 only when no level argument was given 34 | $level++; 35 | 36 | open (IN, $tmp) || die "[ERROR] $tmp: $!\n"; 37 | while (my $line=<IN>){ 38 | next if $line=~ /^#/; 39 | $line=~ s/\r//; 40 | chomp $line; 41 | next unless $line; 42 | 43 | my($size, $path)=split(/\t/, $line); 44 | my @levels=split(/\//, $path); 45 | 46 | print $size."\t".$path."\n" if (scalar(@levels) == $level); 47 | } 48 | unlink $tmp; 49 | close IN; 50 | -------------------------------------------------------------------------------- /scripts/gbk2fna: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | =head1 NAME 22 | 23 | gbk2fna - Read Genbank to Nucleotide Fasta file. 24 | 25 | 26 | =head1 SYNOPSIS 27 | 28 | B 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | Read Genbank to Nucleotide Fasta file. 34 | 35 | 36 | =head1 OPTIONS 37 | 38 | =over 8 39 | 40 | =item B<-in> I 41 | 42 | Genbank file 43 | 44 | =item B<-version>, B<-v> I 45 | 46 | version of the current script 47 | 48 | =item B<-help>, B<-h> I 49 | 50 | This message.
51 | 52 | =back 53 | 54 | 55 | =head1 AUTHOR 56 | 57 | Sunit Jain, (Thu Oct 1 11:10:47 EDT 2015) 58 | sunitj [AT] umich [DOT] edu 59 | 60 | 61 | =head1 SEE ALSO 62 | 63 | L, L 64 | 65 | =head2 Other local resources 66 | 67 | =over 68 | 69 | =item [1] 70 | 71 | L 72 | 73 | =item [2] 74 | 75 | L 76 | 77 | =back 78 | 79 | =head2 Web 80 | 81 | =over 82 | 83 | =item [3] 84 | 85 | L 86 | 87 | =item [4] 88 | 89 | L 90 | 91 | =item [5] 92 | 93 | L 94 | 95 | =back 96 | 97 | =cut 98 | 99 | use strict; 100 | use Getopt::Long; 101 | use FileHandle; 102 | use File::Basename; 103 | 104 | my $help; 105 | my $version=fileparse($0)."\tv0.0.1b"; 106 | my $infile; 107 | my $outfile; 108 | GetOptions( 109 | 'in:s'=>\$infile, 110 | 'out:s'=>\$outfile, 111 | 'v|version'=>sub{print $version."\n"; exit;}, 112 | 'h|help'=>sub{system("perldoc $0 \| cat"); exit;}, 113 | ); 114 | print "\# $version\n"; 115 | 116 | use Bio::SeqIO; 117 | my $seq_in = Bio::SeqIO->new( 118 | -file => "<$infile", 119 | -format => "genbank", 120 | ); 121 | my $seq_out = Bio::SeqIO->new( 122 | -file => ">$outfile", -format => "fasta", ); 123 | while (my $inseq = $seq_in->next_seq) { 124 | $seq_out->write_seq($inseq); 125 | } 126 | -------------------------------------------------------------------------------- /scripts/genomeCheck: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 
11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | use strict; 22 | 23 | sub checkForCompleteness{ 24 | my $fName=shift; 25 | chomp($fName); 26 | open (CONTIGS, $fName) || die "Couldn't open $fName\n"; 27 | $/= ">"; 28 | my %sequences; 29 | while (my $b = ) { 30 | chomp $b; 31 | next unless $b; 32 | my ($name, @sequence) = split (/\n/, $b); 33 | my $seq = join ("", @sequence); 34 | $sequences{$name} = uc $seq; 35 | } 36 | close CONTIGS; 37 | 38 | while (my($n, $s)=each(%sequences)){ 39 | chomp($s); 40 | print "F:$fName\tSN: $n\n" unless (length($s)>0); 41 | } 42 | $/="\n"; 43 | return (); 44 | } 45 | 46 | my $listOfFiles= $ARGV[0]; 47 | open (LOF, "$listOfFiles") || die "ERROR: $ARGV[0]\n $!\n"; 48 | print "Summary for incomplete Genomes:\n"; 49 | while (my $file=){ 50 | checkForCompleteness($file); 51 | } 52 | print "All Done!!\n"; 53 | -------------------------------------------------------------------------------- /scripts/getGFF: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2014, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 
11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | =head1 NAME 22 | 23 | getGFF - Given a list of contig names extract GFF data. 24 | 25 | 26 | =head1 SYNOPSIS 27 | 28 | B B<-list> contig_names.list B<-gff> annotated_metagenome.gff 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | Given a list of contig names extract GFF data. 34 | 35 | 36 | =head1 OPTIONS 37 | 38 | =over 8 39 | 40 | =item B<-list> I 41 | 42 | list of contigs 43 | 44 | =item B<-gff> I 45 | 46 | metagenome GFF file. 47 | 48 | =item B<-col> I 49 | 50 | Column number that contains the contig names; start count from 1. [ default = 1] 51 | 52 | =item B<-version>, B<-v> I 53 | 54 | version of the current script 55 | 56 | =item B<-help>, B<-h> I 57 | 58 | This message. 
59 | 60 | =back 61 | 62 | 63 | =head1 AUTHOR 64 | 65 | Sunit Jain, (Thu Jan 2 12:41:53 EST 2014) 66 | sunitj [AT] umich [DOT] edu 67 | 68 | 69 | =head1 SEE ALSO 70 | 71 | L, L 72 | 73 | =head2 Other local resources 74 | 75 | =over 76 | 77 | =item [1] 78 | 79 | L 80 | 81 | =item [2] 82 | 83 | L 84 | 85 | =back 86 | 87 | =head2 Web 88 | 89 | =over 90 | 91 | =item [3] 92 | 93 | L 94 | 95 | =item [4] 96 | 97 | L 98 | 99 | =item [5] 100 | 101 | L 102 | 103 | =back 104 | 105 | =cut 106 | 107 | use strict; 108 | use Getopt::Long; 109 | use File::Basename; 110 | 111 | my $help; 112 | my $version="getGFF\tv0.1.0"; 113 | my ($list, $gff); 114 | my $col=1; 115 | GetOptions( 116 | 'l|list:s'=>\$list, 117 | 'g|gff:s'=>\$gff, 118 | 'c|col:i'=>\$col, 119 | 'v|version'=>sub{print $version."\n"; exit;}, 120 | 'h|help'=>sub{system("perldoc $0 \| cat"); exit;}, 121 | ); 122 | print "\# $version\n"; 123 | 124 | if ($col==0){ warn "Column 0, does not compute! Start your counts from 1\nAssuming you meant Column 1\n"; $col=1} 125 | 126 | print $list; 127 | open(LIST, "<".$list)|| die $!; 128 | my %index; 129 | while(my $line=){ 130 | chomp $line; 131 | $index{uc($line)}++; 132 | } 133 | close LIST; 134 | 135 | print "\t.."; 136 | 137 | $col--; 138 | my $out=fileparse($list, ".list"); 139 | open(GFF, "<".$gff)|| die $!; 140 | open(OUT, ">".$out.".gff")|| die $!; 141 | while(my $line=){ 142 | chomp $line; 143 | my (@data)=split(/\t/, $line); 144 | my $contig=$data[$col]; 145 | print OUT $line."\n" if ($index{uc($contig)}); 146 | } 147 | close GFF; 148 | close OUT; 149 | print ".Done.\n" 150 | 151 | -------------------------------------------------------------------------------- /scripts/getGIAnnotation: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 
6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | # Writes the unique second '|'-delimited field of each input line to gi.list, hands gi.list to getGiInfo (database = $2, output anno.xml), then runs GI_info_XMLParser on anno.xml with gi.desc. Diagnostics go to stderr and the exit status is non-zero when the input is missing/empty or a step fails. 21 | # usage: getGIAnnotation <input_file> <database> 22 | # example: getGIAnnotation test.blastn nucleotide 23 | if [ -z "$1" ]; then echo "command failed: Give a file name with list" >&2; exit 1; fi 24 | if [ ! -s "$1" ]; then echo "$1 does not exist or is empty" >&2; exit 1; fi 25 | cut -d '|' -f 2 "$1" | sort -u > gi.list || exit 1 26 | 27 | getGiInfo -d "$2" -o anno.xml -l gi.list || exit 1 28 | 29 | GI_info_XMLParser anno.xml gi.desc 30 | -------------------------------------------------------------------------------- /scripts/getGISummary: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | use Bio::DB::EUtilities; 21 | use strict; 22 | die "Usage: getGISummary <id_list_file> <output_file>\n" unless @ARGV == 2; 23 | my @ids; 24 | open (IN, $ARGV[0]) || die "[error] $ARGV[0] : $!\n"; 25 | while (my $line=<IN>){ 26 | next if $line=~ m/^#/; 27 | chomp $line; 28 | $line=~ s/\r//; 29 | next unless $line; 30 | 31 | push(@ids, $line); 32 | } 33 | close IN; 34 | my $factory = Bio::DB::EUtilities->new(-eutil => 'esummary', 35 | -email => 'sunitj@umich.edu', 36 | -db => 'protein', 37 | -id => \@ids); 38 | 39 | open (OUT, ">".$ARGV[1]) || die "[error] $ARGV[1] : $!\n"; 40 | while (my $ds = $factory->next_DocSum) { 41 | my $id=$ds->get_id; 42 | print OUT $id."\t"; 43 | # flattened mode 44 | while (my $item = $ds->next_Item('flattened')) { 45 | # not all Items have content, so need to check... 46 | if ($item->get_content){ 47 | my $name= $item->get_name; 48 | my $content= $item->get_content; 49 | print OUT $name."\t".$content."\t"; # trailing tab keeps consecutive name/content pairs from running together 50 | } 51 | } 52 | print OUT "\n"; 53 | } 54 | close OUT; 55 | -------------------------------------------------------------------------------- /scripts/getMasterList: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | use strict; 21 | use Getopt::Long; 22 | 23 | my $ext="out"; 24 | my $out=$$.".list"; 25 | my $col=1; 26 | my $bs=0; 27 | GetOptions( 28 | 'e:s'=>\$ext, 29 | 'o:s'=>\$out, 30 | 'c:i'=>\$col, 31 | 's:f'=>\$bs, 32 | ); 33 | 34 | my @listOfFiles=glob("*.".$ext); 35 | print @listOfFiles." Filenames provided\n"; 36 | 37 | my $c= $col-1; 38 | my %masterList; 39 | open (OUT, ">".$out); 40 | foreach my $f(@listOfFiles){ 41 | my $fh; 42 | open($fh, $f) || die "[error] $f: $!\n"; 43 | while (my $line=<$fh>){ 44 | next if ($line=~ m/^#/); 45 | chomp $line; 46 | $line=~ s/\r//g; 47 | next unless $line; 48 | 49 | my @cols=split(/\t/, $line); 50 | print OUT $cols[$c]."\n" unless ($masterList{$cols[$c]}); 51 | $masterList{$cols[$c]}++; 52 | } 53 | close $fh; 54 | } 55 | close OUT; 56 | exit; 57 | -------------------------------------------------------------------------------- /scripts/getMyContigs: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 
19 | 20 | 21 | # USAGE: perl getMyContigs.pl <read_coverage_file> <contig_list> <output_file> 22 | # Writes to <output_file> the unique read names (columns 3+ of the tab-separated <read_coverage_file>) from every row whose first column matches a key taken from <contig_list> (the second '_'-separated field of each line). 23 | use strict; 24 | 25 | my $readCov=$ARGV[0]; 26 | my $list=$ARGV[1]; 27 | my $OUT=$ARGV[2]; 28 | 29 | die "Incorrect number of files input\nUSAGE: perl getMyContigs.pl <read_coverage_file> <contig_list> <output_file>\n" if (scalar(@ARGV) != 3); 30 | 31 | open(LIST, $list)|| die "[error] $list: $!\n"; 32 | my %LIST; 33 | while(my $line=<LIST>){ 34 | chomp $line; 35 | next unless $line; 36 | next if $line=~ /^#/; 37 | # NODE_14_length_2679_cov_8.406121 38 | my @headerParts=split(/\_/, $line); 39 | next unless defined $headerParts[1]; $LIST{$headerParts[1]}++; 40 | } 41 | close LIST; 42 | 43 | my %READS; 44 | open(READ, $readCov)|| die "[error] $readCov: $!\n"; 45 | while(my $line=<READ>){ 46 | next if $line=~ /^#/; 47 | chomp $line; 48 | next unless $line; 49 | 50 | my ($contigName, $size, @reads)=split(/\t/, $line); 51 | 52 | next unless $LIST{$contigName}; 53 | foreach my $r(@reads){ 54 | $READS{$r}++; 55 | } 56 | } 57 | close READ; 58 | undef %LIST; 59 | 60 | print "Total # Reads Mapped to this bin:".keys(%READS)."\n"; 61 | open(OUT, ">".$OUT)|| die $!; 62 | foreach my $r(keys %READS){ 63 | print OUT $r."\n"; 64 | } 65 | close OUT; 66 | -------------------------------------------------------------------------------- /scripts/kmerFreq: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | # Counts k-mers (-k, default 4) in a FASTA (or, with -fq, FASTQ) file; writes combined k-mer/reverse-complement totals to -o and, when -s is given, a per-sequence count table to that file. 21 | use strict; 22 | use Getopt::Long; 23 | 24 | ################################### 25 | ## Parameters 26 | ################################### 27 | 28 | my $seqFile; 29 | my $kmerFile=$$.".kmer.out"; 30 | my $kps; 31 | my $isFastq; 32 | my $k=4; 33 | 34 | GetOptions( 35 | 'i|in:s'=>\$seqFile, 36 | 'k|kmer:i'=>\$k, 37 | 'o|out:s'=>\$kmerFile, 38 | 's|each_seq:s'=>\$kps, 39 | 'fq|fastq'=>\$isFastq, 40 | ); 41 | 42 | ################################### 43 | ## Main 44 | ################################### 45 | # Records are split on the header character instead of newlines, so each read from SEQ is one whole record. NOTE(review): '@' can also occur inside FASTQ quality strings, which would split a record in the wrong place. 46 | $/= $isFastq ? "@" : ">"; 47 | my %kmers; 48 | my %kmersPerSeq; 49 | 50 | open (SEQ, $seqFile)|| die "$! : $seqFile\n"; 51 | while(my $line=<SEQ>){ 52 | next if ($line=~ /^#/); 53 | chomp $line; 54 | $line=~ s/ //; 55 | next unless $line; 56 | 57 | $isFastq ? &parseFastq($line) : &parseFasta($line); 58 | } 59 | close SEQ; 60 | 61 | open (SUMM, ">".$kmerFile) || die "$! : $kmerFile\n"; 62 | my %seen; 63 | my @kmerArray; 64 | while(my($kmer, $count)=each(%kmers)){ 65 | next if $seen{$kmer}; 66 | 67 | my $rc_kmer=&rev_comp($kmer); 68 | 69 | # if a distinct rev_comp exists, combine counts and mark with '*'; a k-mer that is its own reverse complement is counted once. 70 | my $totalCount= ($rc_kmer ne $kmer && $kmers{$rc_kmer}) ? ($kmers{$kmer} + $kmers{$rc_kmer})."\t\*" : $kmers{$kmer} ; 71 | 72 | # Print to Output file 73 | print SUMM $kmer."\t".$totalCount."\n"; 74 | push(@kmerArray, $kmer); 75 | $seen{$kmer}++; 76 | $seen{$rc_kmer}++; 77 | } 78 | close SUMM; 79 | undef %kmers; # free memory; 'unlink' here would try to delete files named after the hash contents 80 | undef %seen; 81 | 82 | if($kps){ 83 | open (OUT, ">".$kps) || die "$! : $kps\n"; 84 | print OUT "#SeqNames.\t"; 85 | foreach my $km(@kmerArray){ print OUT $km."\t"; } 86 | print OUT "\n"; 87 | 88 | foreach my $desc(keys %kmersPerSeq){ 89 | print OUT $desc."\t"; 90 | foreach my $km(@kmerArray){ 91 | my $rc_km=&rev_comp($km); 92 | my $totalCount= ($kmersPerSeq{$desc}{$km} || 0) + ($rc_km eq $km ? 0 : ($kmersPerSeq{$desc}{$rc_km} || 0)); 93 | $totalCount = $totalCount ? 
$totalCount : 0; 94 | print OUT $totalCount."\t"; 95 | } 96 | print OUT "\n"; 97 | } 98 | close OUT; 99 | } 100 | undef %kmersPerSeq; 101 | exit; 102 | 103 | ################################### 104 | ## Sub-Routines 105 | ################################### 106 | 107 | sub parseFastq{ 108 | my $line=shift; 109 | my($seqDesc,$seq,$qualDesc,$qual)=split(/\n/, $line); 110 | $seq=~ s/ //g; 111 | &getKmers($seq, $seqDesc); 112 | return; 113 | } 114 | 115 | sub parseFasta{ 116 | my $line=shift; 117 | my($seqDesc,@sequence)=split(/\n/, $line); 118 | my $seq=join("",@sequence); 119 | $seq=~ s/ //g; 120 | &getKmers($seq, $seqDesc); 121 | return; 122 | } 123 | 124 | sub getKmers{ 125 | my $seq = shift; 126 | my $desc=shift; 127 | $seq=uc($seq); 128 | my $windows=length($seq) - $k; 129 | 130 | for (my $pos=0; $pos <= $windows; $pos++){ 131 | my $kmer=substr $seq, $pos, $k; 132 | $kmers{$kmer}++; 133 | $kmersPerSeq{$desc}{$kmer}++; 134 | } 135 | return; 136 | } 137 | 138 | sub rev_comp{ 139 | my $seq=shift; 140 | $seq=uc($seq); 141 | $seq=reverse($seq); 142 | $seq=~ tr/ATGCN/TACGN/; 143 | return $seq; 144 | } 145 | -------------------------------------------------------------------------------- /scripts/length+GC: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 
16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | =head1 NAME 21 | 22 | length+GC - extract length and GC content from a fasta file 23 | 24 | 25 | =head1 SYNOPSIS 26 | 27 | length+GC.pl -f input.fasta [-gc] [-len 1] 28 | 29 | 30 | =head1 DESCRIPTION 31 | 32 | This program takes a fasta file (given with '-f') and prints, tab-separated, each sequence name and its length; if '-gc' is specified the %GC value is printed between the name and the length. 33 | 34 | 35 | =head1 OPTIONS 36 | 37 | =over 8 38 | 39 | =item B<-gc> 40 | 41 | Calculate GC(%) content. 42 | 43 | =item B<-len> 44 | 45 | Only report sequences at least this long [default: 1]; shorter ones are reported as warnings on STDERR. 46 | 47 | =back 48 | 49 | 50 | =head1 AUTHOR 51 | 52 | Sunit Jain 53 | 54 | 55 | =head1 SEE ALSO 56 | 57 | L, L 58 | 59 | =head2 Other local resources 60 | 61 | =over 62 | 63 | =item [1] 64 | 65 | L 66 | 67 | =item [2] 68 | 69 | L 70 | 71 | =back 72 | 73 | =head2 Web 74 | 75 | =over 76 | 77 | =item [3] 78 | 79 | L 80 | 81 | =item [4] 82 | 83 | L 84 | 85 | =item [5] 86 | 87 | L 88 | 89 | =back 90 | 91 | =cut 92 | 93 | 94 | use strict; 95 | use Getopt::Long; 96 | use FileHandle; 97 | 98 | my $calcGC; 99 | my $fasta; 100 | my $minLen=1; 101 | my $version="0.1.1"; 102 | 103 | GetOptions( 104 | "gc"=>\$calcGC, 105 | "f:s"=>\$fasta, 106 | "len:i"=>\$minLen, 107 | "v|version"=>sub{print $version."\n"; exit;}, 108 | "h|help"=>sub{system('perldoc', $0); exit;}, 109 | ); 110 | 111 | &help if ! 
$fasta; 112 | 113 | my $CONTIGS=FileHandle->new(); 114 | open ($CONTIGS, "<",$fasta) || die "Couldn't open $fasta\n"; 115 | $/= ">"; 116 | my (%sequences, @names); 117 | while (my $b = <$CONTIGS>) { 118 | chomp $b; 119 | next unless $b; 120 | my ($name, @sequence) = split (/\n/, $b); 121 | my $seq = join ("", @sequence); 122 | my $length = length($seq); 123 | if($length < $minLen){ 124 | print STDERR "[WARNING: Length_less_than_minimum]\t".$name."\t".$length."\n"; 125 | next; 126 | } 127 | 128 | unless ($calcGC){ 129 | print "$name\t$length\n" ; 130 | } 131 | else{ 132 | my ($g, $c) = (0, 0); 133 | $seq=uc($seq); 134 | $g = ($seq =~ tr/G//); # tr counts matches without modifying $seq 135 | $c = ($seq =~ tr/C//); 136 | 137 | my $GC = $length ? (($g+$c)/$length)*100 : 0; # empty sequences can get here when -len is 0 or less 138 | my $printGC = sprintf( "%.4f", $GC); 139 | print "$name\t$printGC\t$length\n"; 140 | } 141 | } 142 | close $CONTIGS; 143 | 144 | sub help{ 145 | system('perldoc', $0); 146 | exit; 147 | } 148 | 149 | exit; 150 | 151 | 152 | -------------------------------------------------------------------------------- /scripts/map_project_names: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 
19 | 20 | 21 | =head1 NAME 22 | 23 | map_project_names - Map IMG project names to your own. Creates Symbolic links with your project names to extracted IMG tar balls. 24 | 25 | 26 | =head1 SYNOPSIS 27 | 28 | B<map_project_names> [B<-map> names_map.txt] [B<-path2col1> DIR] [B<-outdir> DIR] 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | Map IMG project names to your own. Creates Symbolic links with your project names to extracted IMG tar balls. The map file (B<-map>, default names_map.txt) is tab-separated: IMG name, your name, description; only lines that start with a digit are used. B<-path2col1> (default ./) is the directory holding the IMG-named entries and B<-outdir> (default ./) is where the links are created. 34 | 35 | 36 | =head1 OPTIONS 37 | 38 | =over 8 39 | 40 | =item B<-version>, B<-v> I 41 | 42 | version of the current script 43 | 44 | =item B<-help>, B<-h> I 45 | 46 | This message. 47 | 48 | =back 49 | 50 | 51 | =head1 AUTHOR 52 | 53 | Sunit Jain, (Mon Feb 23 12:55:40 EST 2015) 54 | sunitj [AT] umich [DOT] edu 55 | 56 | 57 | =head1 SEE ALSO 58 | 59 | L, L 60 | 61 | =head2 Other local resources 62 | 63 | =over 64 | 65 | =item [1] 66 | 67 | L 68 | 69 | =item [2] 70 | 71 | L 72 | 73 | =back 74 | 75 | =head2 Web 76 | 77 | =over 78 | 79 | =item [3] 80 | 81 | L 82 | 83 | =item [4] 84 | 85 | L 86 | 87 | =item [5] 88 | 89 | L 90 | 91 | =back 92 | 93 | =cut 94 | 95 | use strict; 96 | use Getopt::Long; 97 | use FileHandle; 98 | use File::Basename; 99 | use File::Spec; 100 | my $help; 101 | my $version=fileparse($0)."\tv0.0.1b"; 102 | my $mapFile="names_map.txt"; 103 | my $path2col1="./"; 104 | my $outPath="./"; 105 | 106 | GetOptions( 107 | 'm|map:s'=>\$mapFile, 108 | 'p1|path2col1:s'=>\$path2col1, 109 | 'o|outdir:s'=>\$outPath, 110 | 'v|version'=>sub{print $version."\n"; exit;}, 111 | 'h|help'=>sub{system("perldoc $0 \| cat"); exit;}, 112 | ); 113 | print "\# $version\n"; 114 | # NOTE(review): the link target is stored exactly as built from -path2col1; a relative target is resolved from the link's own directory, so non-default -outdir with a relative -path2col1 may give dangling links. 115 | my %projects; 116 | my $MAP=FileHandle->new(); 117 | open( $MAP, "<", $mapFile) || die $!; 118 | while(my $line=<$MAP>){ 119 | chomp $line; 120 | next unless ($line=~/^\d+/); 121 | next unless $line; 122 | my($imgName, $sampleNum, $desc)=split(/\t/, $line); next unless (defined $sampleNum && length $sampleNum); 123 | my $oldPath=File::Spec->catdir($path2col1,$imgName); 124 | my $newPath=File::Spec->catdir($outPath,$sampleNum); 125 | symlink($oldPath, $newPath) || warn "[warn] could not link $newPath -> $oldPath: $!\n"; 126 | } 127 | close $MAP; 128 | 129 | 130 | 
-------------------------------------------------------------------------------- /scripts/mapper_getQueryList: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 
19 | 20 | 21 | =head1 NAME 22 | 23 | mapper_getQueryList - get subset of query list from mapper script 24 | 25 | =head1 SYNOPSIS 26 | 27 | B B<-log> I B<-list> I B<-out> I 28 | 29 | 30 | =head1 MOTIVATION 31 | 32 | Get a subset of query list from the mapper script 33 | 34 | 35 | =head1 AUTHOR 36 | 37 | Sunit Jain, July 2013 38 | 39 | 40 | =head1 SEE ALSO 41 | 42 | L, L 43 | 44 | =head2 Other local resources 45 | 46 | =over 47 | 48 | =item [1] 49 | 50 | L 51 | 52 | =item [2] 53 | 54 | L 55 | 56 | =back 57 | 58 | =head2 Web 59 | 60 | =over 61 | 62 | =item [3] 63 | 64 | L 65 | 66 | =item [4] 67 | 68 | L 69 | 70 | =item [5] 71 | 72 | L 73 | 74 | =back 75 | 76 | =cut 77 | 78 | use strict; 79 | use Getopt::Long; 80 | 81 | my ($logFile,$list); 82 | my $version="0.3.1"; 83 | my $compatible="0.3.0 +"; 84 | my $out=$$.".list"; 85 | 86 | GetOptions( 87 | 'log=s'=>\$logFile, 88 | 'list=s'=>\$list, 89 | 'o|out:s'=>\$out, 90 | 'v|version'=>sub{print $version."\n"."Compatible with mapper script version $compatible"; exit;}, 91 | 'h|help'=>sub{system('perldoc', $0); exit;}, 92 | ); 93 | 94 | my %index; 95 | open(LIST, "<".$list)|| die $!; 96 | while(my $line=){ 97 | next if ($line=~ /^#/); 98 | chomp($line); 99 | $line=~ s/\r//; 100 | next unless $line; 101 | 102 | $index{$line}++; 103 | } 104 | close LIST; 105 | 106 | open (LOG, "<".$logFile)|| die $!; 107 | open (OUT, ">".$out)|| die $!; 108 | while(my $line=){ 109 | next if ($line=~ /^#/); 110 | chomp($line); 111 | $line=~ s/\r//; 112 | next unless $line; 113 | 114 | my ($subj, @queries)=split(/\t/, $line); 115 | next unless $index{$subj}; 116 | 117 | print OUT "#".$subj."\n"; 118 | foreach my $q(@queries){ 119 | print OUT $q."\n"; 120 | } 121 | } 122 | close LOG; 123 | close OUT; 124 | -------------------------------------------------------------------------------- /scripts/match-dada2-mothur: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2020 
Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | """ 21 | Match final DADA2 sequences to sequences in mothur pre.cluster map files 22 | """ 23 | import argparse 24 | from pathlib import Path 25 | 26 | from Bio import pairwise2 27 | 28 | 29 | argp = argparse.ArgumentParser(description=__doc__) 30 | argp.add_argument( 31 | 'pre_cluster_map', 32 | metavar='PRECLUSTER_MAP', 33 | type=argparse.FileType(), 34 | help='map output file from mothur\'s per.cluster command', 35 | ) 36 | argp.add_argument( 37 | 'dada2_seqtab', 38 | metavar='DADA2_SEQTAB', 39 | type=argparse.FileType(), 40 | help='DADA2 sequence table, saved with write.table and tab delimited', 41 | ) 42 | argp.add_argument('--version', action='version', version='%(prog)s ' 43 | 'is part of geo-omics-scripts VERSION_PLACEHOLDER') 44 | args = argp.parse_args() 45 | 46 | # output file names 47 | map_o = Path(Path(args.pre_cluster_map.name).name) 48 | map_o = map_o.with_suffix('.dada2' + map_o.suffixes[-1]) 49 | fa_o = Path(Path(args.dada2_seqtab.name).name).with_suffix('.fa') 50 | 51 | # read map file 52 | map_data = [] 53 | map_header = args.pre_cluster_map.readline() 54 | for line in args.pre_cluster_map: 55 | line = line.strip() 56 | map_data.append(line.split('\t')) 57 | 58 | # read seqtab file 59 | 
seqtab = [] 60 | seqs = args.dada2_seqtab.readline() 61 | seqs = seqs.strip().split('\t') 62 | seqs = [i.replace('"', '') for i in seqs] 63 | for line in args.dada2_seqtab: 64 | line = line.strip() 65 | seqtab.append(line.split('\t')) 66 | 67 | # assign ids to sequences 68 | seqs = {'dada2_{}'.format(i): s for i, s in enumerate(seqs)} 69 | 70 | 71 | # implement sequence matching 72 | def match_seq(query_seq, seqs): 73 | """ 74 | Match a mothur-aligned query seq to those in seqs 75 | 76 | Return id of closest match 77 | """ 78 | # rm gaps 79 | q = query_seq.replace('-', '') 80 | high_score = None 81 | best_hit = None 82 | for id, seq in seqs.items(): 83 | score = pairwise2.align.globalms( 84 | q, seq, 85 | 1, -1, -1, -1, 86 | score_only=True 87 | ) 88 | if high_score is None or score > high_score: 89 | high_score = score 90 | best_hit = id 91 | 92 | return best_hit 93 | 94 | 95 | # write output map 96 | with map_o.open('w') as f: 97 | map_header = map_header.split('\t') 98 | map_header.insert(4, 'dada2') 99 | f.write('\t'.join(map_header)) 100 | for row in map_data: 101 | match = match_seq(row[4], seqs) 102 | if match is None: 103 | match = '' 104 | row.insert(4, match) 105 | f.write('\t'.join(row) + '\n') 106 | 107 | print('map output written to: {}'.format(map_o)) 108 | 109 | # write fasta file 110 | with fa_o.open('w') as f: 111 | for id, seq in seqs.items(): 112 | f.write('>{}\n{}\n'.format(id, seq)) 113 | 114 | print('Fasta output written to: {}'.format(fa_o)) 115 | -------------------------------------------------------------------------------- /scripts/matchQueryNames: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 
6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | =head1 NAME 22 | 23 | matchQueryNames - Do this. 24 | 25 | 26 | =head1 SYNOPSIS 27 | 28 | B 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | Do this. 34 | 35 | 36 | =head1 OPTIONS 37 | 38 | =over 8 39 | 40 | =item B<-version>, B<-v> I 41 | 42 | version of the current script 43 | 44 | =item B<-help>, B<-h> I 45 | 46 | This message. 
47 | 48 | =back 49 | 50 | 51 | =head1 AUTHOR 52 | 53 | Sunit Jain, (Fri Jul 18 14:46:20 EDT 2014) 54 | sunitj [AT] umich [DOT] edu 55 | 56 | 57 | =head1 SEE ALSO 58 | 59 | L, L 60 | 61 | =head2 Other local resources 62 | 63 | =over 64 | 65 | =item [1] 66 | 67 | L 68 | 69 | =item [2] 70 | 71 | L 72 | 73 | =back 74 | 75 | =head2 Web 76 | 77 | =over 78 | 79 | =item [3] 80 | 81 | L 82 | 83 | =item [4] 84 | 85 | L 86 | 87 | =item [5] 88 | 89 | L 90 | 91 | =back 92 | 93 | =cut 94 | 95 | use strict; 96 | use Getopt::Long; 97 | 98 | my ($meta1, $meta2, $out); 99 | my $help; 100 | my $version="matchQueryNames.pl\tv0.0.1b"; 101 | GetOptions( 102 | '1|meta1:s'=>\$meta1, 103 | '2|meta2:s'=>\$meta2, 104 | 'o|out:s'=>\$out, 105 | 'v|version'=>sub{print $version."\n"; exit;}, 106 | 'h|help'=>sub{system("perldoc $0 \| cat"); exit;}, 107 | ); 108 | print "\# $version\n"; 109 | my %metaDataIndex; 110 | print $meta1."\n"; 111 | open(META1, "<".$meta1)|| die $!; 112 | while(my $line=){ 113 | next if ($line=~/^#/); 114 | chomp $line; 115 | next unless $line; 116 | match("1",$line); 117 | } 118 | close META1; 119 | 120 | open(META2, "<".$meta2)|| die $!; 121 | while(my $line=){ 122 | next if ($line=~/^#/); 123 | chomp $line; 124 | next unless $line; 125 | match("2",$line); 126 | } 127 | close META2; 128 | 129 | open(OUT, ">".$out)||die $!; 130 | print OUT $meta1."\t".$meta2."\n"; 131 | my $gt_2=0; 132 | my %seen; 133 | foreach my $meta(keys %metaDataIndex){ 134 | if (@{$metaDataIndex{$meta}}>2){ 135 | $gt_2++; 136 | } 137 | elsif(@{$metaDataIndex{$meta}}==2){ 138 | my $line; 139 | foreach my $name(@{$metaDataIndex{$meta}}){ 140 | $line.=$name."\t"; 141 | } 142 | $line=~ s/\t$/\n/; 143 | next if $seen{$line}; 144 | print OUT $line; 145 | $seen{$line}++; 146 | } 147 | } 148 | close OUT; 149 | print "gt_2 = ".$gt_2."\n"; 150 | 151 | sub match{ 152 | my $prefix=shift; 153 | my $line=shift; 154 | 155 | my($alias, @metadata)=split(/\t/,$line); 156 | foreach my $m(@metadata){ 157 | 
push(@{$metaDataIndex{$m}},$prefix."_".$alias); 158 | } 159 | } 160 | 161 | -------------------------------------------------------------------------------- /scripts/nameClassFiles: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | =head1 NAME 22 | 23 | nameClassFiles - Rename class files according to a tab-separated list of old/original names to new/more sensible names. 24 | 25 | 26 | =head1 SYNOPSIS 27 | 28 | B B<-tsv> old_and_new_filenames.tsv B<-ext> fasta 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | Rename class files according to a tab-separated list of old/original names to new/more sensible names. 34 | 35 | 36 | =head1 OPTIONS 37 | 38 | =over 8 39 | 40 | =item B<-tsv> I 41 | 42 | col1=old name; col2=new name 43 | 44 | =item B<-out> I 45 | 46 | Output Folder [default: "Renamed"] 47 | 48 | =item B<-ext> I 49 | 50 | extensions for the old and new names if the tsv doesn't already have them. 51 | 52 | =item B<-version>, B<-v> I 53 | 54 | version of the current script 55 | 56 | =item B<-help>, B<-h> I 57 | 58 | This message. 
use strict;
use Getopt::Long;
use File::Spec;
use File::Copy "cp";

# nameClassFiles: copy files into an output folder under new names.
#
# The TSV lists "old name <TAB> new name" per line.  The extension given
# with -ext (default "fasta") is appended to both names; pass -ext "" when
# the names in the TSV already carry their extension.

my ($tsv);
my $out="Renamed";
my $ext="fasta";
my $version="nameClassFiles.pl\tv0.0.2b";
GetOptions(
	'list|tsv:s'=>\$tsv,
	'out:s'=>\$out,
	'ext:s'=>\$ext,
	'v|version'=>sub{print $version."\n"; exit;},
	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
);
print "\# $version\n";

unless (defined $tsv && length $tsv){
	die "[ERROR] No name list given; use -tsv <file> (see -h)\n";
}

unless (-d $out){
	mkdir($out, 0755) || die "[ERROR] Cannot create output folder $out: $!\n";
}

my %tracker;	# old file name => new file name
open(my $TSV, "<", $tsv) || die "[ERROR] Cannot read $tsv: $!\n";
while(my $line=<$TSV>){
	next if ($line=~ /^#/);
	chomp $line;
	$line=~ s/\r$//;	# tolerate lists saved with Windows line endings
	next unless length $line;

	my($old, $new)=split(/\t/, $line);
	# A row without both columns would otherwise become a copy to a file
	# named just ".<ext>".
	unless (defined $old && length $old && defined $new && length $new){
		warn "[WARNING] Skipping line $. of $tsv: expected old name, TAB, new name\n";
		next;
	}
	if ($ext){
		$old.=".".$ext;
		$new.=".".$ext;
	}
	$tracker{$old}=$new;
}
close $TSV;

my $failed=0;
foreach my $file(sort keys %tracker){
	my $new=File::Spec->catfile( $out, $tracker{$file} );
	print "Creating:\t$new\n";
	# File::Copy::cp returns false on failure; report it rather than
	# silently producing an incomplete output folder.
	unless (cp($file, $new)){
		warn "[ERROR] Could not copy $file to $new: $!\n";
		$failed++;
	}
}
exit($failed ? 1 : 0);
# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

# Paired-end assembly pipeline: velveth -> velvetg -> oases.
#
# Arguments (exactly three, in this order):
#   $1      k-mer length
#   $2, $3  FASTQ files handed to "velveth -shortPaired"
#
# Writes assembly_paired_<kmer>/, assembly_paired_<kmer>.log and
# usageStats_K<kmer>.tsv in the current directory.  Refuses to run when the
# assembly or bank directory for this k-mer already exists.  Exits non-zero
# on a usage error or when any assembly stage fails.

#velveth assembly_61 61 -fastq -short derep_trimmed_day_fwd.fastq
#velvetg assembly_61 -read_trkg yes
#oases ~/Velvet_Assembly &

# Validate the command line before touching anything else.
if [ $# -ne 3 ]; then
	echo "USAGE: $0 <kmer> <fastq file 1> <fastq file 2>" >&2
	echo "Yes, it has to be in that exact order!" >&2
	exit 1
fi

module load AMOS/3.1.0-rc1
module load velvet/1.1.07-MAX99-OPENMP
module load oases/0.2.01

KMER=$1
shift
FASTQ=("$@")	# kept as an array so file names with spaces survive

INTERVAL=120 # Change this number to modify the time interval (in seconds) of the usageStats script.

echo "K-mer Length= $KMER"
echo "FastQ Files= ${FASTQ[*]}"

OUTDIR="assembly_paired_$KMER"
if [ -d "$OUTDIR" ]; then
	echo "$OUTDIR already exists!" >&2
	exit 1
fi

BANK="bank_paired_$KMER"
if [ -d "$BANK" ]; then
	echo "$BANK already exists!" >&2
	exit 1
fi
LOG="$OUTDIR.log"
STATS="usageStats_K$KMER.tsv"

# NOTE(review): the monitor is started in the background and never waited
# for or stopped here; presumably its -e flag makes it end by itself --
# confirm against usageStats.pl.
perl /geomicro/data1/COMMON/scripts/usageStats.pl -i "$INTERVAL" -o "$STATS" -e &

# Abort with a pointer to the log when an assembly stage fails, so a broken
# velveth run is not silently fed into velvetg/oases.
stage_failed() {
	echo "$1 failed, see $LOG" >&2
	exit 1
}

echo "************************************* VELVETH *******************************************************" > "$LOG"
echo >> "$LOG"
velveth "$OUTDIR" "$KMER" -fastq -shortPaired "${FASTQ[@]}" >> "$LOG" || stage_failed velveth
echo >> "$LOG"
echo "************************************* VELVETG *******************************************************" >> "$LOG"
echo >> "$LOG"
velvetg "$OUTDIR" -read_trkg yes >> "$LOG" || stage_failed velvetg
echo >> "$LOG"
echo "************************************* OASES **************************************************" >> "$LOG"
echo >> "$LOG"
oases "$OUTDIR" >> "$LOG" || stage_failed oases
echo >> "$LOG"

exit 0
19 | 20 | from omics.__main__ import main 21 | 22 | 23 | main() 24 | -------------------------------------------------------------------------------- /scripts/omics-container: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | echo "omics-container is deprecated, use comics instead" 21 | exit 1 22 | -------------------------------------------------------------------------------- /scripts/omics-init: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | from omics import init 21 | 22 | 23 | init.main() 24 | -------------------------------------------------------------------------------- /scripts/omics-prep: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | from omics import prep 21 | 22 | 23 | prep.main() 24 | -------------------------------------------------------------------------------- /scripts/omics-qc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2014, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 
11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | from omics import qc 21 | 22 | 23 | qc.main() 24 | -------------------------------------------------------------------------------- /scripts/omics-qc-check: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | from omics import qc_check 21 | 22 | 23 | qc_check.main() 24 | -------------------------------------------------------------------------------- /scripts/omics-run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 
# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

# Deprecated entry point: tell the caller which command replaces it and
# fail, so scripts still invoking omics-run stop instead of continuing.
printf '%s\n' "omics-run is deprecated, use comics instead"
exit 1
use strict;
use Bio::SearchIO;
use Getopt::Long;

=head1 NAME

parseBlastXML - parse and print BlastXML results


=head1 SYNOPSIS

B<parseBlastXML> B<-b> I<blast_output.xml> [B<-o> I<output.tsv>] [B<-s> I<min_score>]


=head1 DESCRIPTION

parse BlastXML results and print out Query, Subj, %id, evalue, bitScore. To get more contact me.


=head1 OPTIONAL

	-o	output file name;	default processID.tsv,
	-s	minimum bit score;	default 0

=head4 NOTE:

To use this script Bioperl MUST be installed.
For our lab, here is how you can access Bioperl for this script:

	1) Make sure you're on Cayman.
	2) type "/opt/package/perl5/5.12.2/bin/perl" instead of just "perl" while running the script. example:

	/opt/package/perl5/5.12.2/bin/perl parseBlastXML.pl -b blast_output.xml


=head1 AUTHOR

Sunit Jain, July 2011
sunitj [AT] umich [DOT] edu

=cut

my $blastXml;
my $tab=$$.".tsv";	# default output name is <process id>.tsv
my $bs=0;		# minimum score an HSP needs to be reported
GetOptions(
	'b:s'=>\$blastXml,
	'o:s'=>\$tab,
	's:f'=>\$bs,
);

# The input file is mandatory; check it before the output file is created
# so that a typo does not leave an empty .tsv behind.
unless (defined $blastXml && length $blastXml){
	die "[ERROR] No BLAST XML file given; use -b <blast_output.xml>\n";
}
chomp($blastXml);
die "[ERROR] Cannot read $blastXml\n" unless (-r $blastXml);

open(my $OUT, ">", $tab) || die "[ERROR] Cannot write $tab: $!\n";
print {$OUT} "#File:$blastXml\tMin_BitScore:$bs\n";
print {$OUT} "#Query\tHit\t\%ID\tEvalue\tBitScore\n";

my $in = Bio::SearchIO->new(-format => 'blastxml', -file=> $blastXml);
while( my $result = $in->next_result ) {
	## $result is a Bio::Search::Result::ResultI compliant object
	while( my $hit = $result->next_hit ) {
		## $hit is a Bio::Search::Hit::HitI compliant object
		while( my $hsp = $hit->next_hsp ) {
			## $hsp is a Bio::Search::HSP::HSPI compliant object
			# NOTE(review): the column is labelled BitScore but the value
			# comes from ->score; HSP objects also offer ->bits -- confirm
			# which one is intended before relying on the threshold.
			if($hsp->score >= $bs){
				print {$OUT} $result->query_name,
					"\t", $hit->name,
					"\t", $hsp->percent_identity,
					"\t", $hsp->evalue,
					"\t", $hsp->score,"\n";
			}
		}
	}
}
close($OUT) || die "[ERROR] Could not finish writing $tab: $!\n";
#!/bin/bash

# Copyright 2019 Regents of The University of Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

#
#
# Download RefSeq rna sequences and create blast db
#
# Set REFSEQ_RELEASE to fetch a specific (archived) release; otherwise the
# current release number is looked up and a "latest" symlink is refreshed.
# VERBOSITY, WORK_DIR, MKDIR, RM and LN are used below but not set in this
# script; presumably the sourced liba.sh provides them.
#
set -eE
# shellcheck disable=SC2034
{
USAGE="[OPTIONS]..."
HELP="download archaeal and bacterial NCBI RefSeq RNA sequences and build BLAST DBs"
ARG_HELP="
"
CHECK_PROGS="makeblastdb"
}
# shellcheck disable=SC1090
. "$(dirname "$0")/../share/geo-omics-scripts/liba.sh" || { echo "Failed to source script library" >&2; exit 1; }

# A failed download inside "curl | gunzip" or "curl | grep | sort | while"
# must abort the run instead of producing a truncated database.
set -o pipefail

URL=ftp://ftp.ncbi.nih.gov/refseq/release

# Kept as an array so the option list survives quoting.
if [[ ${VERBOSITY:-0} -ge 2 ]]; then
    curl_cmd=(curl --fail)
else
    curl_cmd=(curl --fail --silent --show-error)
fi

if [[ -z ${REFSEQ_RELEASE:-} ]]; then
    echo -n "Getting current RefSeq release number... "
    refseq_release=$("${curl_cmd[@]}" "$URL/RELEASE_NUMBER")
    archive_part=
    echo "$refseq_release"
else
    refseq_release=$REFSEQ_RELEASE
    # assume given release number is from archive, not the current one
    archive_part=archive/
fi
# Anchored: anything that merely contains a digit (e.g. an error page) is
# not a release number.
if ! [[ $refseq_release =~ ^[0-9]+$ ]]; then
    echo "Failed to get RefSeq release number, got: ${refseq_release}" >&2
    exit 1
fi

out_dir="$WORK_DIR/refseq$refseq_release"
$MKDIR "$out_dir" || { echo "Directory exists already: $out_dir" >&2; exit 1; }

log=$out_dir/refseq-rna.log

files_installed=$URL/release-catalog/${archive_part}release${refseq_release}.files.installed
# List each archaea/bacteria RNA chunk once, ordered by kingdom and then by
# chunk number (numeric, so chunk 2 precedes chunk 10).
"${curl_cmd[@]}" "$files_installed" |
    grep -o -E '(archaea|bacteria)\.[0-9]+\.rna\.fna\.gz' |
    sort -u -t. -k1,1 -k2,2n |
while read -r i; do
    echo -n "Getting $i... "
    kingdom=${i%%.*}
    url=$URL/$kingdom/$i
    out=$out_dir/$kingdom.rna.fasta
    "${curl_cmd[@]}" "$url" | gunzip -c >> "$out"
    echo "Appending $url to $out" >> "$log"
    echo "done"
done

for i in archaea bacteria; do
    echo -n "Making blast db for $i... "
    makeblastdb -in "$out_dir/$i.rna.fasta" -dbtype nucl >> "$log"
    echo "done"
done

echo -n "Finishing... "
# no need to ever touch these files again
chmod -w "$out_dir"/*

if [[ -z ${REFSEQ_RELEASE:-} ]]; then
    # NOTE(review): the link is created in the current directory while the
    # data lives under $WORK_DIR -- confirm the two are meant to coincide.
    $RM -f latest && $LN -s "$(basename "$out_dir")" latest
fi
echo "done"
use strict;

# removeBlastSubj: split tabular BLAST output by subject name.
#
#   $ARGV[0]  list of subject names, one per line ('#' lines are ignored)
#   $ARGV[1]  tab-separated BLAST output; column 2 is the subject
#
# Rows whose subject is on the list go to <pid>.QueriesFromList.out, all
# other rows to <pid>.QueriesFromListRemoved.out.  Names are compared
# case-insensitively and with spaces removed.

unless (@ARGV >= 2){
	die "Usage: $0 <subject_list> <blast_tabular_output>\n";
}
my $in=$ARGV[0];
my $blastOut= $ARGV[1];
my $out= $$.".QueriesFromListRemoved.out";
my $list= $$.".QueriesFromList.out";

my %exclude;	# normalised subject name => number of times listed
open(my $LIST, "<", $in) || die "[ERROR] Cannot read $in: $!\n";
while (my $line=<$LIST>){
	next if ($line=~ m/^#/);
	chomp ($line);
	next unless length ($line);
	$line=~ s/ //g;
	$line=~ s/\r//g;
	next unless length ($line);
	$line=lc($line);
	$exclude{$line}++;
}
close $LIST;

# Number of distinct names on the list.
print scalar(keys %exclude)."\n";

open(my $BOUT, "<", $blastOut) || die "[ERROR] Cannot read $blastOut: $!\n";
open(my $OUT, ">", $out) || die "[ERROR] Cannot write $out: $!\n";
open(my $OUT2, ">", $list) || die "[ERROR] Cannot write $list: $!\n";
my $count=0;	# rows whose subject was found on the list
while (my $line=<$BOUT>){
	next if ($line=~ m/^#/);
	chomp ($line);
	next unless length ($line);

	my ($query, $subj)=split(/\t/, $line);
	$subj="" unless defined $subj;	# row without a subject column
	$subj=~ s/ //g;
	$subj=lc($subj);

	if ($exclude{$subj}){
		$count++;
		print {$OUT2} $line."\n";
	}
	else{
		print {$OUT} $line."\n";
	}
}
print "Matches Found:".$count."\n";
close $BOUT;
close($OUT) || die "[ERROR] Could not finish writing $out: $!\n";
close($OUT2) || die "[ERROR] Could not finish writing $list: $!\n";
#!/bin/bash

# Copyright 2015, 2019 Regents of The University of Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

#######################################
# Replace spaces with underscores in the names of all files and
# directories below a directory.
# Arguments: $1 - directory to process (default: current directory);
#                 the directory itself is never renamed
# Outputs:   one "Renaming ..." line per entry on stdout, skips on stderr
# Returns:   0 if everything was renamed, 1 if any entry was skipped or
#            could not be moved
#######################################
rename_spaces() {
	local root=${1:-.}
	local path dir base target
	local status=0

	# -depth lists a directory's contents before the directory itself, so
	# children are renamed while their parent's path is still valid.
	# NUL-delimited output keeps names with newlines or backslashes intact.
	while IFS= read -r -d '' path; do
		dir=${path%/*}
		base=${path##*/}
		# Only the last component changes; parents get their own turn.
		target="$dir/${base// /_}"
		if [[ -e "$target" ]]; then
			echo "Skipping '$path': '$target' already exists" >&2
			status=1
			continue
		fi
		echo "Renaming '$path' to '$target'"
		mv -- "$path" "$target" || status=1
	done < <(find "$root" -mindepth 1 -depth -name '* *' -print0)

	return "$status"
}

rename_spaces "${1:-.}"
6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | =head1 NAME 22 | 23 | reverse_complement - Do this. 24 | 25 | 26 | =head1 SYNOPSIS 27 | 28 | B B<-fasta> input.fasta B<-out> output.fasta 29 | 30 | 31 | =head1 DESCRIPTION 32 | 33 | Do this. 34 | 35 | 36 | =head1 OPTIONS 37 | 38 | =over 8 39 | 40 | =item B<-fasta>, B<-f> I 41 | 42 | Fasta file 43 | 44 | =item B<-out>, B<-o> I 45 | 46 | Output file name 47 | 48 | =item B<-version>, B<-v> I 49 | 50 | version of the current script 51 | 52 | =item B<-help>, B<-h> I 53 | 54 | This message. 
use strict;
use Getopt::Long;

# reverse_complement: write the reverse complement of every sequence in a
# FASTA file.  Headers are copied unchanged; each sequence is written
# upper-case on a single line.

my $version="reverse_complement.pl\tv0.0.1b";
my ($fasta, $out);
GetOptions(
	'f|fasta:s'=>\$fasta,
	'o|out:s'=>\$out,
	'v|version'=>sub{print $version."\n"; exit;},
	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
);
print "\# $version\n";

unless (defined $fasta && length $fasta && defined $out && length $out){
	die "[ERROR] Both -fasta <input> and -out <output> are required; see -h\n";
}

open(my $FASTA, "<", $fasta) || die "[ERROR] Cannot read $fasta: $!\n";
open(my $OUT, ">", $out) || die "[ERROR] Cannot write $out: $!\n";
{
	# Read one FASTA record per iteration; the separator change is
	# confined to this block.
	local $/=">";
	while(my $record=<$FASTA>){
		chomp $record;	# removes the trailing ">" of the next record
		# Skips the empty chunk before the first ">" and blank records,
		# which would otherwise be written as a header-less entry.
		next unless ($record=~ /\S/);

		my($header, @seq)=split(/\n/, $record);
		$header=~ s/\r$//;	# Windows line endings
		my $s=join("",@seq);
		my $rcSeq=reverseComplement($s);

		print {$OUT} ">".$header."\n".$rcSeq."\n";
	}
}
close $FASTA;
close($OUT) || die "[ERROR] Could not finish writing $out: $!\n";

# Return the upper-case reverse complement of a nucleotide string.
# Whitespace (including stray carriage returns) is removed first.  Besides
# A/C/G/T the IUPAC ambiguity codes are complemented and U is read as T;
# symbols without a distinct complement (N, S, W, gaps) stay as they are.
sub reverseComplement{
	my $seq=shift;
	$seq=~ s/\s+//g;
	my $rSeq=uc(reverse($seq));
	$rSeq=~ tr/ACGTURYKMBDHV/TGCAAYRMKVHDB/;
	return $rSeq;
}
# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

###########################################
# This script loads the CARD reference data
# and prepares a directory from within which
# you can run `rgi main`.
# The commands were adapted from the README
# of the RGI software.
###########################################
set -euo pipefail

trap 'echo "error at line $LINENO, exit status $?" >&2' ERR
card=./card.json

# Load CARD reference data

if [[ ! -e $card ]]; then
    # Explicit archive name: wget would otherwise save as "data", or as
    # "data.1" when a file of that name is already present.
    wget -O card_data.tar.bz2 https://card.mcmaster.ca/latest/data
    tar -xvf card_data.tar.bz2 "$card"
    rm card_data.tar.bz2
fi

rgi load --card_json "$card" --local
rgi card_annotation -i "$card" > card_annotation.log 2>&1

# Work out the database version from the annotation file just written,
# card_database_v<version>.fasta.  Globbing avoids parsing ls output, and a
# companion card_database_v<version>_all.fasta (NOTE(review): presumably
# written by some RGI releases -- confirm) is ignored so it cannot end up
# in the version string.  With several versions present the one sorting
# last by name is used.
version=
shopt -s nullglob
annotation_files=(card_database_v*.fasta)
shopt -u nullglob
for annotation in "${annotation_files[@]}"; do
    if [[ $annotation == *_all.fasta ]]; then
        continue
    fi
    version=${annotation#card_database_v}
    version=${version%.fasta}
done
if [[ -z $version ]]; then
    echo "[ERROR] no card_database_v*.fasta found, see card_annotation.log" >&2
    exit 1
fi
echo "[INFO] version parsed: $version"
rgi load -i "$card" --card_annotation card_database_v"$version".fasta --local

wget -O wildcard_data.tar.bz2 https://card.mcmaster.ca/latest/variants
mkdir -p wildcard
tar -xvf wildcard_data.tar.bz2 -C wildcard
rm wildcard_data.tar.bz2
gunzip wildcard/*.gz

rgi wildcard_annotation -i wildcard --card_json "$card" -v "$version" > wildcard_annotation.log 2>&1
rgi load --wildcard_annotation wildcard_database_v"$version".fasta --wildcard_index wildcard/index-for-model-sequences.txt --card_annotation card_database_v"$version".fasta --local

# check db version
rgi database --version --local
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | 21 | =head1 NAME 22 | 23 | sangerSeqParser - sanger seq parser 24 | 25 | =head1 SYNOPSIS 26 | 27 | B B<-p> I 28 | 29 | 30 | =head1 OPTIONS 31 | 32 | =over 8 33 | 34 | =item B<-o> I 35 | 36 | output file name; default= processID.fasta 37 | 38 | =item B<-e> I 39 | 40 | file extension to look for in folder; default= fasta 41 | 42 | =item B<-h> 43 | 44 | this page. 
use strict;
use Getopt::Long;
use File::Basename;
use File::Spec;
use File::Glob qw(bsd_glob);
use Cwd qw(getcwd);

# sangerSeqParser: merge per-read sequence files into one multi-FASTA.
#
# Every file "<path>/*.<ext>" becomes one record: the file name is used as
# the header and all non-empty lines of the file are joined into the
# sequence.

my $path; # Folder path
my $ext="fasta";
my $out;

GetOptions(
	'p:s'=>\$path,
	'o:s'=>\$out,
	'e:s'=>\$ext,
	'h|help'=>sub{system('perldoc', $0); exit;},
);

$path= getcwd() if !$path;
$out=$$.".".$ext if !$out;

# bsd_glob treats the pattern as a single path, so folders with spaces in
# their name work; the <...> glob operator would split on whitespace.
my @files=bsd_glob(File::Spec->catfile($path, "*.".$ext));
my $outAbs=File::Spec->rel2abs($out);

open(my $FASTA, ">", $out) || die "[ERROR] Cannot write $out: $!\n";

foreach my $f(@files){
	# Never read the output file back in as an input (possible when -o
	# names an existing file in the input folder with the same extension).
	next if (File::Spec->rel2abs($f) eq $outAbs);

	open(my $fhIN, "<", $f) || die "[ERROR] Cannot read $f: $!\n";
	my @sequence;
	while(my $line=<$fhIN>){
		chomp $line;
		$line=~ s/\r//;
		next unless length $line;
		push(@sequence, $line);
	}
	close($fhIN);
	my $seq= join("", @sequence);

	my $nuHead=fileparse($f);	# file name without its directory

	print {$FASTA} ">".$nuHead."\n".$seq."\n";
}
close($FASTA) || die "[ERROR] Could not finish writing $out: $!\n";
11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | ln -s /opt/packages/MetaPathways/1.0/blastDB 21 | ln -s /opt/packages/MetaPathways/1.0/executables 22 | cp /geomicro/data1/COMMON/src/MetaPathways/setup/template_* . 23 | -------------------------------------------------------------------------------- /scripts/shared-filter-abundance: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2020 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | """ 21 | Remove OTUs with small relative abundance from a mothur shared file. 22 | 23 | For each sample, for OTUs less abundant than the threashold, the count is set 24 | to zero for that sample. OTUs that consequently are all zero in the whole data 25 | set will be removed completely. 
26 | """ 27 | import argparse 28 | from pathlib import Path 29 | 30 | from omics.shared import MothurShared 31 | 32 | 33 | DEFAULT_FILTER_CUTOFF = 1000 34 | 35 | 36 | def per_sample_cut_off(row): 37 | """ 38 | Set read counts below cut-off to zero 39 | 40 | To be passed to DataFrame.apply 41 | """ 42 | cutoff = size[row.name] / args.filter_cut_off 43 | row[row < cutoff] = 0 44 | sh.counts.loc[row.name] = row 45 | 46 | 47 | def is_all_zero(col): 48 | """ 49 | Return name of an "all zero count" OTU 50 | 51 | To be passed to DataFrame.apply 52 | """ 53 | for i in col: 54 | if i > 0: 55 | return None 56 | return col.name 57 | 58 | 59 | argp = argparse.ArgumentParser(description=__doc__) 60 | argp.add_argument( 61 | 'shared_file', 62 | type=argparse.FileType(), 63 | help='Input data, a mothur shared file.' 64 | ) 65 | argp.add_argument( 66 | '-t', '--threads', 67 | type=int, 68 | default=1, 69 | help='Number of threads to use for parallizable steps', 70 | ) 71 | argp.add_argument( 72 | '-f', '--filter-cut-off', 73 | type=int, 74 | default=DEFAULT_FILTER_CUTOFF, 75 | help='Set OTUs count with relative abundance of less than one in this many' 76 | 'sequences in a sample to zero. The default is {0}, i.e. counts of ' 77 | 'less than 1/{0} of the sample size are set to zero.' 
78 | ''.format(DEFAULT_FILTER_CUTOFF), 79 | ) 80 | argp.add_argument('--version', action='version', version='%(prog)s ' 81 | 'is part of geo-omics-scripts VERSION_PLACEHOLDER') 82 | args = argp.parse_args() 83 | 84 | sh = MothurShared(args.shared_file, threads=args.threads) 85 | old_sizes = sh.sample_sizes.copy() 86 | size = dict(zip(sh.samples, sh.sample_sizes)) 87 | 88 | sh.counts.apply(per_sample_cut_off, axis=1) 89 | zero_otus = [i for i in sh.counts.apply(is_all_zero, axis=0) if i is not None] 90 | sh.remove_otus(zero_otus) 91 | 92 | diffs = old_sizes - sh.sample_sizes 93 | print('reads removed from dataset: {} ({:%})' 94 | ''.format(diffs.sum(), diffs.sum() / old_sizes.sum())) 95 | max_sample = diffs.idxmax() 96 | print('max removed in a sample: {} {:%}' 97 | ''.format(max_sample, diffs[max_sample] / old_sizes[max_sample])) 98 | print('Low-abundance OTUs removed: {} ({:%})' 99 | ''.format( 100 | len(zero_otus), 101 | len(zero_otus) / (len(zero_otus) + len(sh.otus)) 102 | )) 103 | 104 | outfile = Path(Path(args.shared_file.name).name).with_suffix( 105 | '.f{}.shared'.format(args.filter_cut_off) 106 | ) 107 | sh.save(outfile) 108 | -------------------------------------------------------------------------------- /scripts/shared-set-accessions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2020 Regents of The University of Michigan. 4 | 5 | # This file is part of geo-omics-scripts. 6 | 7 | # Geo-omics-scripts is free software: you can redistribute it and/or 8 | # modify it under the terms of the GNU General Public License as published 9 | # by the Free Software Foundation, either version 3 of the License, or (at 10 | # your option) any later version. 11 | 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 15 | # General Public License for more details. 16 | 17 | # You should have received a copy of the GNU General Public License along 18 | # with Geo-omics-scripts. If not, see . 19 | 20 | """ 21 | Set new OTU numbering scheme 22 | 23 | This can be used in the ASV-producing pipeline. An existing ASV->sequence 24 | assignment can be supplied. Any new sequence will be assigned a new ASV 25 | number. 26 | 27 | This script is part of the Schmidt Lab Mothur pipeline. 28 | """ 29 | import argparse 30 | from pathlib import Path 31 | 32 | from omics.shared import MothurShared 33 | 34 | DEFAULT_PREFIX = 'ASV' 35 | 36 | 37 | argp = argparse.ArgumentParser(description=__doc__) 38 | argp.add_argument( 39 | 'shared', 40 | type=argparse.FileType(), 41 | help='Mothur shared file', 42 | ) 43 | argp.add_argument( 44 | '-p', '--prefix', 45 | default=DEFAULT_PREFIX, 46 | help='Numbering scheme prefix, default is ' + DEFAULT_PREFIX + '. This ' 47 | 'option has no effect if the --with-map option is used', 48 | ) 49 | argp.add_argument( 50 | '-z', '--leading-zeros', 51 | action='store_true', 52 | help='Add leading zeros to small numbers', 53 | ) 54 | argp.add_argument( 55 | '-s', '--start-with', 56 | type=int, 57 | default=1, 58 | help='Accession number to start with. 
This should be one larger than the ' 59 | 'largest existing ASV number', 60 | ) 61 | argp.add_argument( 62 | '--with-map', 63 | type=argparse.FileType(), 64 | help='Use the mapping in given two-column tab-separated text file' 65 | ) 66 | argp.add_argument( 67 | '--save-map', 68 | type=argparse.FileType('w'), 69 | help='Save old->new OTU accession mapping in two-column tab-separated text' 70 | ' file under given name', 71 | ) 72 | argp.add_argument( 73 | '-o', '--output', 74 | default=None, 75 | help='Name of output shared file, by default a name will be generated ' 76 | 'based on the input file name', 77 | ) 78 | argp.add_argument( 79 | '-t', '--threads', 80 | type=int, 81 | default=1, 82 | help='Number of threads', 83 | ) 84 | argp.add_argument('--version', action='version', version='%(prog)s ' 85 | 'is part of geo-omics-scripts VERSION_PLACEHOLDER') 86 | args = argp.parse_args() 87 | if args.output is None: 88 | output = Path(Path(args.shared.name).name).with_suffix('.accs.shared') 89 | else: 90 | output = args.output 91 | 92 | if args.with_map is None: 93 | acc_map = None 94 | else: 95 | acc_map = {} 96 | for line in args.with_map: 97 | old, new = line.strip().split('\t') 98 | acc_map[old] = new 99 | 100 | sh = MothurShared(args.shared, threads=args.threads) 101 | acc_map = sh.set_accessions( 102 | with_map=acc_map, 103 | prefix=args.prefix, 104 | leading_zeros=args.leading_zeros, 105 | first=args.start_with, 106 | ) 107 | if args.save_map is not None: 108 | for k, v in acc_map.items(): 109 | args.save_map.write('{}\t{}\n'.format(k, v)) 110 | args.save_map.close() 111 | sh.info('Accession map saved as', args.save_map.name) 112 | sh.save(str(output)) 113 | sh.info('New shared file saved as:', output) 114 | -------------------------------------------------------------------------------- /scripts/silva-db: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019, 2022 Regents of The University of 
# Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

#
#
# Download SILVA SSU/LSU rRNA data and create blast db
#
# Creates release_<N> below $WORK_DIR, downloads and unpacks the SSU and LSU
# reference fasta files of the current SILVA release, builds one nucleotide
# BLAST db per file and points the symlink "latest" at the new directory.
#
set -eE
# shellcheck disable=SC2034
{
    USAGE="[OPTIONS]..."
    HELP="download SILVA SSU/LSU rRNA sequences and build BLAST DBs"
    ARG_HELP="
"
    CHECK_PROGS="makeblastdb"
}
# VERBOSITY, WORK_DIR, MKDIR, GUNZIP, RM and LN used below are not set in
# this script; presumably they are provided by liba.sh -- confirm.
# shellcheck disable=SC1090
. "$(dirname "$0")/../share/geo-omics-scripts/liba.sh" || {
    echo "Failed to source script library" >&2
    exit 1
}

URL=ftp://ftp.arb-silva.de

# curl and its flags are kept in an array so no word splitting is needed
CURL=(curl)
if [ "$VERBOSITY" -lt 2 ]; then
    CURL+=(--silent)
fi

printf 'Getting current SILVA db release number... '
# slash after url makes curl list directory in complex format, -> indicates a symlink
release=$("${CURL[@]}" "$URL/" | grep -o "current.*" | sed -r 's/current -> release_//' | tr -d '\r')
# The match is anchored: an unanchored [0-9]+ would accept any listing residue
# that merely contains a digit and fail later with confusing download errors.
if ! [[ $release =~ ^[0-9]+([_.][0-9]+)*$ ]]; then
    echo
    echo "Failed to get current SILVA db release number, got: ${release}" >&2
    exit 1
fi
# for point releases, replace _ by .
release=${release/_/.}
echo "$release"

out_dir="$WORK_DIR/release_$release"
# $MKDIR is left unquoted on purpose; presumably it may hold a command plus
# options -- confirm against liba.sh
# shellcheck disable=SC2086
$MKDIR "$out_dir" || {
    echo "Directory exists already: $out_dir" >&2
    exit 1
}

log=$out_dir/silva-db.log

(
    cd "$out_dir"

    printf 'Downloading SILVA SSU/LSU db...'
    # {SSU,LSU} inside the quotes is expanded by curl's own URL globbing
    "${CURL[@]}" -O "$URL/current/Exports/SILVA_${release}_{SSU,LSU}Ref_tax_silva.fasta.gz"
    echo "done"

    printf 'Decompressing... '
    # here the braces are expanded by the shell
    # shellcheck disable=SC2086
    $GUNZIP -- SILVA_"${release}"_{SSU,LSU}Ref_tax_silva.fasta.gz
    echo "done"
)

for i in SSU LSU; do
    printf 'Making %s BLAST db... ' "$i"
    makeblastdb -in "$out_dir/SILVA_${release}_${i}Ref_tax_silva.fasta" -dbtype nucl >> "$log"
    echo "done"
done

printf 'Finishing... '
# no need to ever touch these files again
chmod -R -w "$out_dir"

# The link target is relative, so the link must be created next to the
# release directory; creating it in an unrelated cwd would leave it dangling.
(
    cd "$(dirname "$out_dir")"
    # shellcheck disable=SC2086
    $RM -f latest && $LN -s "$(basename "$out_dir")" latest
)
echo "done"
--------------------------------------------------------------------------------
/scripts/tally:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

use strict;
use Getopt::Long;

# tally: for every entry of a master list print the value recorded for the
# top hit of that query in a tabular blast output, or 0 if there is none.

my $in;
my $master;
my $out=$$.".out";
my $bs=0;
my $printValues; # which column? (counted from 1)
my $fasta;
GetOptions(
    'i:s'=>\$in,
    'm:s'=>\$master,
    'o:s'=>\$out,
    's:f'=>\$bs,
    'f|fasta:s'=>\$fasta,
    'v|values|value:i'=>\$printValues,
);

die "[Error] no input file given (-i)\n" unless defined $in && length $in;
die "[Error] no master list given (-m)\n" unless defined $master && length $master;

# Optional: sequence length by fasta header, used to recognize self hits.
my %seqLen;
if($fasta){
    # read one ">"-delimited record at a time; restored at the end of the block
    local $/ = ">";
    open(my $fastaFh, '<', $fasta) || die "[Error] $fasta: $!\n";
    while(my $record=<$fastaFh>){
        chomp $record;
        next unless $record;
        my($header, @s)=split(/\n/, $record);
        my $seq=join("",@s);
        $header=~ s/^>//;
        $seqLen{$header}=length($seq);
    }
    close($fastaFh);
}

# Convert the 1-based column number into a 0-based index.
# NOTE(review): without -v (undef) or with a bare -v (0) this yields -1, which
# selects the LAST column below; the counting branch is only reached for
# "-v 1".  This looks unintended but tallyWrap and existing results rely on
# it, so it is kept as is -- confirm before changing.
$printValues--;
open(my $inFh, '<', $in) || die "[Error] $in: $!\n";
my %seen;
while (my $line=<$inFh>){
    next if ($line=~ m/^#/);
    chomp $line;
    # strip carriage returns first so a CR-only line is skipped as empty
    $line=~ s/\r//g;
    next unless $line;

    my @lineParts=split(/\t/, $line);

    if(($fasta) && ($seqLen{$lineParts[0]})){
        # Alternative to previous condition; don't need it if query 100% identical with same start and stop positions.
        my($subjStart,$subjStop)=sort{$a <=> $b} ($lineParts[8],$lineParts[9]);
        next if(($lineParts[6]==$lineParts[8]) && ($lineParts[7]==$lineParts[9]) && ($lineParts[2]==100) && ($subjStop==$seqLen{$lineParts[0]}));
    }
    else{
        next if ($lineParts[0] eq $lineParts[1]); # Don't want query and subj to be the same
    }

    # Only need the top hit; exists() so that a stored value of 0 also counts
    # as "already seen".
    next if(exists $seen{$lineParts[0]});
    next if ($lineParts[-1] < $bs); # Don't need anything with a bitscore less than user provided BS.

    if(! $printValues){
        $seen{$lineParts[0]}++;
    }
    else{
        $seen{$lineParts[0]}=$lineParts[$printValues];
    }
}
close $inFh;

open(my $masterFh, '<', $master) || die "[Error] $master: $!\n";
open(my $outFh, '>', $out) || die "[Error] $out: $!\n";
while (my $line=<$masterFh>){
    next if ($line=~ m/^#/);
    chomp $line;
    $line=~ s/\r//g;
    next unless $line;

    if ($seen{$line}){
        print {$outFh} $line."\t".$seen{$line}."\n";
    }
    else{
        print {$outFh} $line."\t0\n";
    }
}
close $masterFh;
close($outFh) || die "[Error] $out: $!\n";
exit;
--------------------------------------------------------------------------------
/scripts/tally-weave:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

use strict;
use Getopt::Long;
use File::Basename;

# tally-weave: combine all *.<ext> two-column files of the current directory
# into one table with a column per file and a final "DB-presence" column that
# counts the files with a non-zero value for the row.

my $ext="tally";
my $out;
GetOptions(
    'e:s'=>\$ext,
    'o:s'=>\$out,
);

# without -o the open below would fail and every print would be lost silently
die "[Error] no output file given (-o)\n" unless defined $out && length $out;

my @DBs;
my @files=glob("*.".$ext);
open(my $outFh, '>', $out) || die "[Error] $out: $!\n";
print {$outFh} "#Transcripts\t";
my %master;
print @files." Files will be tallied...!\n";
foreach my $f(@files){
    my $dbName=basename($f,"\.$ext"); #split(/\_/, $f);
    push(@DBs, $dbName);
    print {$outFh} $dbName."\t";
    open(my $fh, '<', $f) || die "[error] $f: $! \n";
    while (my $line=<$fh>){
        next if ($line=~ m/^#/);
        chomp $line;
        $line=~ s/\r//g;
        next unless $line;

        my @cols=split(/\t/, $line);
        $master{$cols[0]}{$dbName}=$cols[1];
    }
    close $fh;
}
print {$outFh} "DB-presence\n";

# sorted, so that the row order is reproducible between runs
foreach my $key(sort keys %master){
    print {$outFh} $key."\t";
    my $total=0;
    foreach my $db(@DBs){
        my $v;
        if($master{$key}{$db}){$v = $master{$key}{$db}}
        else{$v=0}
        print {$outFh} $v."\t";
        $total++ if($v != 0);
    }
    print {$outFh} $total."\n";
}
close($outFh) || die "[Error] $out: $!\n";
--------------------------------------------------------------------------------
/scripts/tallyWrap:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.


=head1 NAME

tallyWrap - count hits above given bit score

=head1 SYNOPSIS

In multiple blast outputs, count the number of times a query gets a hit above a certain bit score:

B<tallyWrap> -ext blastp -m masterList_output -t combinedTally_output -s 40

In multiple tab delimited files, for each value in the first column, combine the values of the last column from each file into a combined tabular file:

B<tallyWrap> -ext blastp -m masterList_output -t combinedTally_output -values


=head1 DEPENDENCIES

    getMasterList
    tally
    tally-weave


=head1 SEE ALSO

L, L

=head2 Other local resources

=over

=item [1]

L

=item [2]

L

=back

=head2 Web

=over

=item [3]

L

=item [4]

L

=item [5]

L

=back

=cut


use strict;
use Getopt::Long;
use File::Spec::Functions;
use File::Which;

my $masterList;
my $ext="blastn";
my $combinedTally;
my $bs=0;
my $printValue; # column number (start counting from 1)
my $fasta; # query fasta
GetOptions(
    'ext:s'=>\$ext,
    'm:s'=>\$masterList,
    't:s'=>\$combinedTally,
    's:f'=>\$bs,
    'value|values:i'=>\$printValue,
    'f|fasta:s'=>\$fasta,
);

die "[Error] no master list file name given (-m)\n"
    unless defined $masterList && length $masterList;
die "[Error] no combined tally file name given (-t)\n"
    unless defined $combinedTally && length $combinedTally;

# Run a command given as a list (no shell involved, so file names with blanks
# or shell metacharacters are passed on unchanged).  A failure is reported but,
# as before, does not stop the remaining steps.
sub run_cmd {
    my @cmd = @_;
    system(@cmd) == 0
        or warn "[Warning] command failed (status ".($? >> 8)."): @cmd\n";
}

#Check for dependencies
# Prefer an executable in the current directory, else search the PATH.
my @deps = ('getMasterList', 'tally', 'tally-weave');
foreach my $dep (@deps) {
    my $local = catfile('.', $dep);
    if (-x $local) {
        $dep = $local;
    } else {
        # look up into a separate variable so the name is still available for
        # the error message; $dep aliases the array element
        my $found = which($dep) or die "Could not find dependency: $dep\n";
        $dep = $found;
    }
}
my ($master_list_script, $tally_script, $tally_weave_script) = @deps;

my @files=glob("*.$ext");
print "Creating MasterList\n";
run_cmd($master_list_script, '-o', $masterList, '-s', $bs, ($ext ? ('-e', $ext) : ()));
foreach my $f(@files){
    print "\tTally: $f\n";
    # strip only the extension, so "a.1.$ext" and "a.2.$ext" do not both end
    # up as "a.tally"
    (my $name = $f) =~ s/\.\Q$ext\E$//;
    my $tallyFile=$name.".tally";
    run_cmd(
        $tally_script, '-m', $masterList, '-i', $f, '-o', $tallyFile, '-s', $bs,
        (defined $printValue ? ('-values', $printValue) : ()),
        ($fasta ? ('-fasta', $fasta) : ()),
    );
}
print "Weaving all tally files...\n";
run_cmd($tally_weave_script, '-o', $combinedTally);
exit;
--------------------------------------------------------------------------------
/scripts/tinySeq2fasta.xslt:
--------------------------------------------------------------------------------
1 | 2 | 3 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
--------------------------------------------------------------------------------
/scripts/tinySeq2table.xslt:
--------------------------------------------------------------------------------
1 | 2 | 3 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
--------------------------------------------------------------------------------
/scripts/twitterscript.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | Tweets by @umich_geomicro 7 | 8 | ]]> 9 | 10 | 11 |
--------------------------------------------------------------------------------
/test/run:
--------------------------------------------------------------------------------
#!/bin/bash

# Copyright 2019 Regents of The University of Michigan.

# This file is part of geo-omics-scripts.

# Geo-omics-scripts is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# Geo-omics-scripts is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License along
# with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.

# Run the omics SOP steps (prep, qc, assemble, mapping, binning), or a
# contiguous range of them, on the bundled test data in a fresh temporary
# working directory.

set -ue
# NOTE(review): the placeholder names in the usage string were lost from the
# text this was restored from; <threads>, <first> and <last> are
# reconstructions -- confirm.
# shellcheck disable=SC2034
USAGE="[--help|-h|-t <threads>] [-- [<first>]-[<last>]]"
# shellcheck disable=SC2034
SHORT_OPTIONS=t:
# shellcheck disable=SC2034
LONG_OPTIONS=threads:
# shellcheck disable=SC2034
HELP="Run omics SOP tests"

# Option handler for the script specific options.  It is not called in this
# file; presumably liba.sh calls it while parsing the command line and takes
# the return value as the number of arguments consumed -- confirm.
handle_options () {
    if [ "$#" -gt 0 ]; then
        case "$1" in
            -t|--threads)
                THREADS=$2
                return 2;;
        esac
    else
        return 0
    fi
}

# default values
STEPS=(prep qc assemble mapping binning)
THREADS=32
DATA_DIR=data.01
# honour a TMPDIR set by the caller, fall back to /tmp
TMPDIR=${TMPDIR:-/tmp}

# shellcheck disable=SC1090
. "$(dirname "$0")/../share/geo-omics-scripts/liba.sh" || {
    echo "Failed to source script library" >&2
    exit 1
}

# set script name here to prefix script output more meaningfully
# shellcheck disable=2034
SCRIPT_NAME="omics test"

# The optional argument is a single step or a range "first-last"; either end
# of the range may be left empty.
if [ "$#" -gt 0 ]; then
    if [[ $1 == *-* ]]; then
        # first and second "-"-separated field
        first=${1%%-*}
        rest=${1#*-}
        last=${rest%%-*}
    else
        first=$1
        last=$1
    fi
fi

# The default: run all steps
first=${first:-${STEPS[0]}}
last=${last:-${STEPS[-1]}}

# collect the steps from $first up to and including $last
pick_step=false
last_ok=false
steps=()
for i in "${STEPS[@]}"; do
    if [ "$i" == "$first" ]; then
        pick_step=true
    fi
    if $pick_step; then
        steps+=("$i")
    fi
    if [ "$i" == "$last" ]; then
        last_ok=true
        break
    fi
done

[ ${#steps[@]} == 0 ] && abort "Unknown first step: $first"
$last_ok || abort "Unknown last step: $last"

# Test input files; read NUL-delimited so that paths containing blanks stay
# intact, sorted for a reproducible order.
data_dir="$(realpath "$(dirname "$0")")/$DATA_DIR"
[ -d "$data_dir" ] || abort "Test data directory not found: $data_dir"
data=()
while IFS= read -r -d '' f; do
    data+=("$f")
done < <(find "$data_dir" -type f -name "*.fastq.gz" -print0 | sort -z)

# The 66* globs below match the per-sample directories; presumably these are
# created by the prep step from the test data -- confirm.
do_prep () {
    omics prep --cpus "$THREADS" "${data[@]}"
}

do_qc () {
    omics qc -t "$THREADS" 66*
}

do_assemble () {
    omics assemble --cpus "$THREADS" --skip-phylosift --megahit 66*
}

do_mapping () {
    omics mapping --cpus "$THREADS" --chop --index-only
    # each sample is mapped in a subshell, so the cd does not leak
    for i in 66*; do (
        cd "$i"
        omics mapping --cpus "$THREADS" --index-dir ../bowtie2-index \
            -a ../assembly.chop.fa \
            -f fwd.good.fastq -r rev.good.fastq
    ) done
}

do_binning () {
    omics binning --cpus "$THREADS" --assembly assembly.chop.fa 66*
}

# Run the selected steps in order.
do_steps () {
    for i in "${steps[@]}"; do
        case $i in
            prep) do_prep;;
            qc) do_qc;;
            assemble) do_assemble;;
            mapping) do_mapping;;
            binning) do_binning;;
            *) abort "runtime error: illegal step: $i";;
        esac
    done
}

tmpdir=$(mktemp -d --tmpdir="$TMPDIR" omics_test.XXXXXXX)
info "Created working directory: $tmpdir"
cd "$tmpdir"
time do_steps
info "Ran $first to $last, results in $tmpdir"

--------------------------------------------------------------------------------