├── .gitignore
├── COPYING
├── Makefile
├── README.md
├── TruSeq3-PE-2+omics.fa
├── bash-completion
    └── omics
├── docs
    ├── Makefile
    ├── _static
    │   └── css
    │   │   └── custom.css
    ├── assemble.txt
    ├── binning.txt
    ├── bins2fasta.txt
    ├── chop-contigs.txt
    ├── conf.py
    ├── container.txt
    ├── illumina-reads-processing.txt
    ├── index.txt
    ├── init.txt
    ├── mapping.txt
    ├── merge-coverage.txt
    ├── omics.txt
    ├── prep.txt
    ├── qc-check.txt
    ├── qc-sample.txt
    ├── qc.txt
    ├── run.txt
    ├── separate-interleaved.txt
    ├── template.txt
    └── unchop-contigs.txt
├── help2rst
├── lib
    ├── Makefile
    ├── liba.sh
    └── omics
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── _version.py
    │   ├── bin_coverage.py
    │   ├── bin_stats.py
    │   ├── bins2fasta.py
    │   ├── checkm.py
    │   ├── db
    │       ├── __init__.py
    │       ├── apps.py
    │       ├── manage.py
    │       ├── management
    │       │   ├── __init__.py
    │       │   └── commands
    │       │   │   └── __init__.py
    │       └── models.py
    │   ├── derep.py
    │   ├── fastq2fasta.py
    │   ├── init.py
    │   ├── interleave.py
    │   ├── prep.py
    │   ├── qc.py
    │   ├── qc_check.py
    │   ├── read_counts.py
    │   ├── shared.py
    │   ├── unchop_bins.py
    │   └── utils.py
├── localenv
├── modulefiles
    ├── flux.omics
    │   ├── 1
    │   ├── 2
    │   └── .version
    ├── install
    └── omics
    │   ├── 1
    │   └── .version
├── phylosiftrc
├── scripts
    ├── COPYRIGHT.tetramer_freqs_esom
    ├── CRISPR_spacer_extractor
    ├── ESOM_binning_results_parser
    ├── Ebot.Output.Extract.Gi.Title.Rev3
    ├── GI_info_XMLParser
    ├── Makefile
    ├── Metabat_to_anvio_parser
    ├── U2T
    ├── VizBin_parser
    ├── addFileName2header
    ├── addInfo2lrn
    ├── ani
    ├── antiSmash_summary
    ├── assemble.pl
    ├── assemblyModules
    ├── asv-map-update
    ├── aview
    ├── bamTools
    ├── basicHF
    ├── batchBlast
    ├── binTablesForIMG
    ├── bins2fasta
    ├── blast2citation
    ├── blastDensityPlot
    ├── calcN50
    ├── changeClasses
    ├── changePGDBattribs
    ├── chop-contigs
    ├── chopper
    ├── clusterDensity
    ├── comics
    ├── consolidateJGIdata
    ├── contigMetadata
    ├── countInstances
    ├── coveragePerBin
    ├── coveragePerScaffold
    ├── createFastq
    ├── createNodes
    ├── createPhgDB
    ├── curateDB
    ├── dada2shared
    ├── derep+alias
    ├── derep_ClusterMap
    ├── derep_getReadAbundance
    ├── dereplicate
    ├── do2folder
    ├── do2list
    ├── downsample
    ├── embl2picture
    ├── esomCodonMod
    ├── esomTrain
    ├── esomWrapper
    ├── expandGFF
    ├── extractContigReads
    ├── extractEuks
    ├── extractGenbankMetadata
    ├── extractGenomes
    ├── extractSeqs
    ├── extractSubSeq
    ├── extractTranslationsFromGbk
    ├── extract_Blast_Hits_Of_Interest
    ├── fileChopper
    ├── findStretchesOfNs
    ├── firefox_already_running
    ├── fixpod
    ├── fixpod2
    ├── fixpod3
    ├── fixpod4
    ├── fixpod5
    ├── fixpod6
    ├── folderLevelSize
    ├── fragRec
    ├── gbk2fna
    ├── gcSkew
    ├── genomeCheck
    ├── genomicFluidity
    ├── getBwaMappedReadList
    ├── getClassFasta
    ├── getCol
    ├── getFamilyP
    ├── getFastaFromAccNos
    ├── getGFF
    ├── getGI
    ├── getGIAnnotation
    ├── getGISummary
    ├── getGeneClusters
    ├── getGiInfo
    ├── getLineage
    ├── getMasterList
    ├── getMyContigs
    ├── getRandomData
    ├── getSciNames
    ├── gff2fasta
    ├── gff2neo
    ├── gff2tbl
    ├── iClust
    ├── img_Bin_Classifier
    ├── inflate
    ├── interleave
    ├── itemize
    ├── kmerFreq
    ├── legacy_consolidateJGIdata
    ├── length+GC
    ├── limit2Length
    ├── mVelvetPipe_paired
    ├── mVelvetPipe_singles
    ├── makeAnnotationFile
    ├── map_project_names
    ├── mapper
    ├── mapper_getQueryList
    ├── match-dada2-mothur
    ├── matchQueryNames
    ├── measureCompleteness
    ├── merge-coverage
    ├── merge-covs
    ├── mockest
    ├── nameClassFiles
    ├── nsmpReport
    ├── oasesPaired_pipe
    ├── omics
    ├── omics-assemble
    ├── omics-binning
    ├── omics-container
    ├── omics-init
    ├── omics-mapping
    ├── omics-prep
    ├── omics-qc
    ├── omics-qc-check
    ├── omics-qc-sample
    ├── omics-run
    ├── parallel_antiSmash
    ├── parallel_getGenomesFromTaxa
    ├── parseBlastXML
    ├── parseFastq
    ├── parseTinySeqXML.xslt
    ├── patchBlastLineage
    ├── plot-blast-frag-cov
    ├── plot-coverage
    ├── plot-megahit-log
    ├── plot-shared-otu-counts
    ├── plot_alignment
    ├── postBlast
    ├── ppt_getGI
    ├── ppt_getXML
    ├── refseq-rna
    ├── removeBlastSubj
    ├── removeCommentLines
    ├── remove_space_from_filenames
    ├── renameHeaders
    ├── reverse_complement
    ├── rgi-coverage
    ├── rgi-setup
    ├── sangerSeqParser
    ├── separate-interleaved
    ├── setup_metapathways
    ├── shared-filter-abundance
    ├── shared-get
    ├── shared-jaccard
    ├── shared-merge-otus
    ├── shared-set-accessions
    ├── shared-unique-prevalence
    ├── shared2fasta
    ├── silva-align
    ├── silva-db
    ├── silvaTaxonAppend
    ├── slideshow.xml
    ├── summarize_antiSmash
    ├── tally
    ├── tally-weave
    ├── tallyWrap
    ├── taxonDist
    ├── test_fragRec
    ├── tetramer_freqs_esom
    ├── tinySeq2fasta.xslt
    ├── tinySeq2table.xslt
    ├── toMultiGBK
    ├── toPhylipAndBack
    ├── top5
    ├── track-mothur-counts
    ├── triage
    ├── twitterscript.xml
    ├── uClustHomology
    ├── unchop-contigs
    └── usageStats
└── test
    └── run


/.gitignore:
--------------------------------------------------------------------------------
 1 | *~
 2 | .*sw[op]
 3 | *.pyc
 4 | 
 5 | # make-generated files
 6 | scripts/*.1
 7 | geo-omics-scripts*.tar.gz
 8 | geo-omics-scripts*/*
 9 | docs/_build
10 | 
11 | # contains hard-link to liba.sh, to allow running scripts in dev environment
12 | share/
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Michigan Geomicrobiology Lab
 2 | 
 3 | Welcome to the GitHub Repo for some general purpose NGS, Data analysis and
 4 | mining scripts used in the lab. Some scripts implement our short-read QC,
 5 | assembly, binning, etc. pipeline and depend on the presence of a number of
 6 | third-party software.  The rest are Bash scripts or in core
 7 | [Perl](http://www.perl.org/ "Perl Home") or [Python](https://www.python.org/
 8 | "Python Home"). This means, if you have Perl or Python 3 installed, you won't
 9 | need anything else to work with these scripts.
10 | 
11 | Since these scripts are actively being used by the Lab, you can expect full
12 | support for any [issues](https://github.com/Geo-omics/scripts/issues "Report an
13 | issue"). Please do let us know if you find any bugs or easier/quicker/more
14 | elegant solutions.
15 | 
16 | 
17 | ## Language and OS Dependencies
18 | 
 19 | The scripts should work with various flavors of Linux and other unix-like
20 | environments.  Here is a list of easy to install languages that you'll need:
21 | 
22 | * Perl version 5.10 +
23 | * Python version 3.5 +
24 | * R version 3 +
25 | 
26 | ## Contact
27 | 
28 | Please send questions or comments to <geo-omics-scripts@umich.edu>.
29 | 
30 | ## Principal Investigator
31 | 
32 | [Gregory J. Dick](https://sites.lsa.umich.edu/geomicro/ "Geomicrobiology Lab Homepage"), gdick [AT] umich [DOT] edu
33 | 
34 | 
35 | ## License
36 | 
37 | Geo-omics-scripts is free software: you can redistribute it and/or modify it
38 | under the terms of the GNU General Public License as published by the Free
39 | Software Foundation, either version 3 of the License, or (at your option) any
40 | later version.
41 | 
42 | 
43 | ## Disclaimer
44 | 
45 | **Geo-omics scripts are distributed in the hope that they will be useful, but
46 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
47 | FITNESS FOR A PARTICULAR PURPOSE.**
48 | 


--------------------------------------------------------------------------------
/TruSeq3-PE-2+omics.fa:
--------------------------------------------------------------------------------
 1 | >PrefixPE/1
 2 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT
 3 | >PrefixPE/2
 4 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
 5 | >PE1
 6 | TACACTCTTTCCCTACACGACGCTCTTCCGATCT
 7 | >PE1_rc
 8 | AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA
 9 | >PE2
10 | GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
11 | >PE2_rc
12 | AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC
13 | >TruSeq_Adapter_Index_end
14 | CGTATGCCGTCTTCTGCTTG
15 | >TruSeq_Adapter_Index_end_rc
16 | CAAGCAGAAGACGGCATACG
17 | >Illumina_Paired_End_Sequencing_Primer_2
18 | CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
19 | >Illumina_Paired_End_Sequencing_Primer_2_rc
20 | AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCG
21 | >Illumina_Paired_End_Adapter_2
22 | GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
23 | >Illumina_RNA_PCR_Primer_rc
24 | TCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT
25 | 


--------------------------------------------------------------------------------
/bash-completion/omics:
--------------------------------------------------------------------------------
 1 | # Copyright 2019 Regents of The University of Michigan.
 2 | 
 3 | # This file is part of geo-omics-scripts.
 4 | 
 5 | # Geo-omics-scripts is free software: you can redistribute it and/or
 6 | # modify it under the terms of the GNU General Public License as published
 7 | # by the Free Software Foundation, either version 3 of the License, or (at
 8 | # your option) any later version.
 9 | 
10 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
11 | # WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | # General Public License for more details.
14 | 
15 | # You should have received a copy of the GNU General Public License along
16 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
17 | 
# Programmable completion function for the omics command.
#
# Runs the command line being completed (COMP_WORDS) with
# OMICS_AUTO_COMPLETE set to the index of the current word (COMP_CWORD)
# and offers the whitespace-separated words it prints as candidates.
# File name completion is added when the command fails, prints nothing,
# or prints FILE_COMPLETION as its last word.
#
# Globals:   COMP_WORDS, COMP_CWORD, OMICS_AUTO_COMPLETE_DEBUG (read)
#            COMPREPLY (written)
_omics_completion()
{
    # suppress stderr while running the command unless we are debugging
    local hide_stderr=true
    if [[ -n "${OMICS_AUTO_COMPLETE_DEBUG:-}" ]]; then
        hide_stderr=false
    fi

    # keep the saved descriptor number local so it does not leak into the
    # interactive shell that calls this function
    local saved_stderr
    if $hide_stderr; then
        exec {saved_stderr}>&2 2>/dev/null
    fi

    # call the command being completed and remember whether it succeeded
    local output status=0
    output=$(OMICS_AUTO_COMPLETE="$COMP_CWORD" "${COMP_WORDS[@]}") || status=$?

    # split the output on whitespace; using read instead of an unquoted
    # expansion keeps words like '*' from being glob-expanded
    local -a reply=()
    IFS=$' \t\n' read -r -d '' -a reply <<<"$output" || true

    # fall back to file completion unless we got a normal, non-empty reply
    local do_file_completion=true
    if (( status == 0 && ${#reply[@]} > 0 )); then
        if [[ ${reply[-1]} == FILE_COMPLETION ]]; then
            # file completion requested, the marker must be the last word
            unset 'reply[-1]'
        else
            do_file_completion=false
        fi
    fi

    # restore stderr as needed (moves the saved descriptor back to fd 2)
    if $hide_stderr; then
        exec 2>&"${saved_stderr}"-
    fi

    if $do_file_completion; then
        # compgen prints one candidate per line; read them line-wise so
        # that file names containing spaces stay intact
        local -a files=()
        mapfile -t files < <(compgen -f -- "${COMP_WORDS[COMP_CWORD]}")
        reply+=("${files[@]}")
    fi

    COMPREPLY=("${reply[@]}")
}
complete -F _omics_completion omics
54 | 


--------------------------------------------------------------------------------
/docs/_static/css/custom.css:
--------------------------------------------------------------------------------
1 | /* noredcode: hack to avoid red inline code or literals
2 |  * Sphinx should allow one to amend themes somehow but who knows,
3 |  * so append this file to theme.css of the rtd-theme
4 |  */
5 | code.samp span.pre, code.literal span.pre {color: #444;}
6 | 


--------------------------------------------------------------------------------
/docs/assemble.txt:
--------------------------------------------------------------------------------
  1 | .. program:: omics assemble
  2 | 
  3 | ============================================
  4 | assemble - assemble metagenomes with IDBA_UD
  5 | ============================================
  6 | 
  7 | Synopsis
  8 | ========
  9 | 
 10 | :program:`omics assemble` [OPTIONS]... [SAMPLE_DIR]...
 11 | 
 12 | 
 13 | Description
 14 | ===========
 15 | 
 16 | This script implements the assembly step of the Geomicro Illumina Reads
 17 | processing Pipeline.  It uses :command:`megahit` or :command:`idba_ud` for the
 18 | assembly and parameters for k-mer range and step size can be passed along.  The
 19 | script will check the quality of the assembly with QUAST, attempt to classify
 20 | contigs, and run PhyloSift on the assembly.
 21 | 
 22 | 
 23 | Options
 24 | =======
 25 | 
 26 | .. option:: --assembly-only
 27 | 
 28 |     Stop after finishing the assembly
 29 | 
 30 | .. option:: --contigs FILE
 31 | 
  32 |     Skip the assembly step but continue downstream analysis steps with the
 33 |     provided assembly (can be contigs or scaffold stage)
 34 | 
 35 | .. option:: --cpus N
 36 | 
 37 |     Specify how many CPUs to use for parallel execution
 38 | 
 39 | .. option:: --force
 40 | 
  41 |     Overwrite existing results; by default the script exits with an error
  42 |     message when the output file exists.
 43 | 
 44 | .. option:: --idba-ud
 45 | 
 46 |     Use the IDBA_UD assembler instead of the default MEGAHIT
 47 | 
  48 | .. option:: --mink=N
 49 | 
 50 |     parameter for minimum k-mer size
 51 | 
 52 | .. option:: --maxk=N
 53 | 
 54 |     parameter for maximum k-mer size
 55 | 
 56 | .. option:: --step=N
 57 | 
 58 |     parameter for step size for increasing k-mer values
 59 | 
 60 | .. option:: --level LEVEL
 61 | 
 62 |     Specifies the assembly level to use for downstream analysis.  Can be either
 63 |     scaffold (the default) or contig.
 64 | 
 65 | .. option:: --megahit
 66 | 
 67 |     Use the MEGAHIT assembler. This is the default.
 68 | 
 69 | .. option:: --out PATH
 70 | 
 71 |     The directory to which output is saved.  The default value is :file:`ASSEMBLY`
 72 | 
 73 | .. option:: --phylosiftrc FILE
 74 | 
  75 |     Allows specifying a custom PhyloSift configuration file.
 76 | 
 77 | .. option:: --reads FILE
 78 | 
 79 |     Interleaved reads file, by default this is :file:`dt_int.fasta` which is the output of :program:`omics qc`
 80 | 
 81 | .. option:: --skip-blast
 82 | 
 83 |     Skip classification steps, by default run blast
 84 | 
 85 | .. option:: --skip-phylosift
 86 | 
 87 |     Skip PhyloSift run, by default PhyloSift is run
 88 | 
 89 | .. option:: --skip-quast
 90 | 
 91 |     Skip QUAST analysis, by default QUAST is run
 92 | 
 93 | .. option::  --working-dir=DIR
 94 | 
 95 |     Directory under which output is stored. By default this is the current
 96 |     directory.
 97 | 
 98 | .. option:: -h, --help
 99 | 
100 |     Print help.
101 | 
102 | .. option:: --no-color
103 | 
104 |     Disable colorful output.
105 | 
106 | .. option:: -v, --verbosity=N
107 | 
108 |     Use one or multiple ``-v`` to increase verbosity of output or set a
109 |     level of verbosity with ``--verbosity=N``.  By default the verbosity
110 |     level is 1.  Setting verbosity to 0 silences the program.  A level of
111 |     3 prints debugging info.
112 | 
113 | 
114 | Exit Status
115 | ===========
116 | 
117 | Exits with non-zero upon encountering an error.
118 | 
119 | .. only:: man
120 | 
121 |     See Also
122 |     ========
123 | 
124 |     :manpage:`omics-prep(1)`, :manpage:`omics-qc(1)`, :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)`
125 | 
126 | 


--------------------------------------------------------------------------------
/docs/binning.txt:
--------------------------------------------------------------------------------
 1 | .. program:: omics binning
 2 | 
 3 | ====================================
 4 | binning - bin metagenomic assemblies
 5 | ====================================
 6 | 
 7 | Synopsis
 8 | ========
 9 | 
10 | :program:`omics binning` [OPTIONS]...
11 | 
12 | 
13 | Description
14 | ===========
15 | 
16 | The :program:`omics binning` script implements the binning step for the Geomicro
17 | Illumina Reads Pipeline.
18 | 
19 | Options
20 | =======
21 | 
22 | .. option:: -a, --assembly FILE
23 | 
24 |     Fasta-formatted file containing the assembled contigs, by default this is
25 |     :file:`contigs.fa`
26 | 
27 | .. option:: -c, --coverage-file FILE
28 | 
29 |     Merged/shared per-sample-contig mean coverage file. If this option is not
30 |     present, then this file will be compiled from the files found with the
31 |     --coverage-path argument.
32 | 
33 | .. option:: --coverage-path PATH
34 | 
35 |     Path to coverage files relative to each sample directory; the default is
36 |     :file:`MAPPING/assembly.chop.genomeCovBed.tsv`.  These files are made by the
37 |     mapping script and correspond to the :file:`asm_pair-smds.bam` files made by
 38 |     CONCOCT's :program:`map-bowtie2-markduplicates.sh`.  This option is
 39 |     incompatible with the -c option.
40 | 
41 | .. option:: --force
42 | 
43 |     Overwrite existing data
44 | 
45 | .. option:: -o, --out-dir PATH
46 | 
47 |     Path to output directory, by default this is :file:`BINNING`
48 | 
49 | .. option::  --working-dir=DIR
50 | 
51 |     Directory under which output is stored. By default this is the current
52 |     directory.
53 | 
54 | .. option:: -h, --help
55 | 
56 |     Print help.
57 | 
58 | .. option:: --no-color
59 | 
60 |     Disable colorful output.
61 | 
62 | .. option:: -v, --verbosity=N
63 | 
64 |     Use one or multiple ``-v`` to increase verbosity of output or set a
65 |     level of verbosity with ``--verbosity=N``.  By default the verbosity
66 |     level is 1.  Setting verbosity to 0 silences the program.  A level of
67 |     3 prints debugging info.
68 | 
69 | 
70 | Exit Status
71 | ===========
72 | 
73 | Exits with non-zero upon encountering an error.
74 | 
75 | .. only:: man
76 | 
77 |     See Also
78 |     ========
79 | 
80 |     :manpage:`omics-prep(1)`, :manpage:`omics-qc(1)`, :manpage:`omics-assemble(1)`,
81 |     :manpage:`omics-mapping(1)`, :manpage:`omics(7)`,
82 |     :manpage:`illumina-reads-processing(7)`
83 | 
84 | 


--------------------------------------------------------------------------------
/docs/bins2fasta.txt:
--------------------------------------------------------------------------------
 1 | .. program:: bins2fasta
 2 | 
 3 | ===========================================
 4 | bins2fasta - generate fasta files from bins
 5 | ===========================================
 6 | 
 7 | .. argparse::
 8 |     :module: omics.bins2fasta
 9 |     :func: get_argp
10 |     :prog: bins2fasta
11 |     :nodefault:
12 |     :manpage:
13 | 
14 | 
15 | Exit Status
16 | ===========
17 | 
18 | Exits with non-zero upon encountering an error.
19 | 
20 | .. only:: man
21 | 
22 |     See Also
23 |     ========
24 | 
 25 |     :manpage:`omics-binning(1)`, :manpage:`omics(7)`,
26 |     :manpage:`illumina-reads-processing(7)`
27 | 
28 | 


--------------------------------------------------------------------------------
/docs/chop-contigs.txt:
--------------------------------------------------------------------------------
 1 | .. program:: chop-contigs
 2 | 
 3 | ====================================
 4 | chop-contigs
 5 | ====================================
 6 | 
 7 | Synopsis
 8 | ========
 9 | 
10 | :program:`chop-contigs` [OPTIONS]... [-i <input-file>]
11 | 
12 | 
13 | Description
14 | ===========
15 | 
 16 | Chop up an assembly's contigs into fixed-size chunks.
17 | 
18 | 
19 | Options
20 | =======
21 | .. option:: -h, --help
22 | 
23 |     show this help message and exit
24 | 
25 | .. option:: --chunk-size SIZE
26 | 
27 |     Size of chunk into which contigs are divided. Default is 10000
28 | 
29 | .. option:: -i, --input FILE
30 | 
31 |     input, fasta-formatted file with contigs, if not given stdin is used.
32 | 
33 | .. option:: -o, --output FILE
34 | 
35 |     Output file
36 | 
37 | .. option:: --wrap
38 | 
 39 |     Wrap output sequences to lines of length 60.
40 | 
41 | .. option:: --no-dot-zero
42 | 
 43 |     Do not add a .0 to a fasta header of a short sequence that did not need to
 44 |     be chopped up. This option makes the output compatible with CONCOCT's
45 |     cut_up_fasta.py script.
46 | 
47 | .. option:: --no-truncate-headers
48 | 
49 |     Do not further manipulate fasta headers beyond adding the chop numbers.  By
50 |     default, the header is truncated at the first whitespace character,
51 |     assuming this still uniquely identifies the contig.  This default behaviour
52 |     is needed for MEGAHIT assemblies and does no harm to IDBA assemblies.
53 | 
54 | .. option:: --debug
55 | 
56 |     Print stack trace on errors.
57 | 
58 | 
59 | Exit Status
60 | ===========
61 | 
62 | Exits with non-zero upon encountering an error.
63 | 
64 | .. only:: man
65 | 
66 |     See Also
67 |     ========
68 | 
69 |     :manpage:`omics-binning(1)`,
70 |     :manpage:`omics-mapping(1)`, :manpage:`omics(7)`,
71 |     :manpage:`illumina-reads-processing(7)`
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/docs/container.txt:
--------------------------------------------------------------------------------
 1 | .. program:: omics container
 2 | 
 3 | ================================================
 4 | container -- start a singularity container
 5 | ================================================
 6 | 
 7 | Synopsis
 8 | ========
 9 | 
10 | :program:`omics container` [OPTIONS]...
11 | 
12 | 
13 | Description
14 | ===========
15 | 
16 | Start a shell in a singularity container environment (typically on Flux HPC) with all
17 | omics scripts and software dependencies available.
18 | 
19 | 
20 | Options
21 | =======
22 | 
23 | .. option:: -i, --container-image PATH
24 | 
25 |     Path to singularity container image.  A sensible default is chosen if this
26 |     option is not provided.
27 | 
28 | .. option:: -k, --keep-modules-loaded
29 | 
30 |     Do not purge environment modules, by default all modules get purged.
31 | 
32 | 
33 | Exit Status
34 | ===========
35 | 
36 | Exits with non-zero upon encountering an error.
37 | 
38 | .. only:: man
39 | 
40 |     See Also
41 |     ========
42 | 
43 |     :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)`
44 | 


--------------------------------------------------------------------------------
/docs/index.txt:
--------------------------------------------------------------------------------
 1 | .. geo-omics-scripts documentation master file, created by
 2 |    sphinx-quickstart on Thu Mar 16 15:39:32 2017.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Welcome to geo-omics-scripts's documentation!
 7 | =============================================
 8 | 
 9 | Contents:
10 | 
11 | .. toctree::
12 |     :maxdepth: 2
13 |     :glob:
14 | 
15 |     *
16 | 
17 | Indices and tables
18 | ==================
19 | 
20 | * :ref:`genindex`
21 | * :ref:`modindex`
22 | * :ref:`search`
23 | 
24 | 


--------------------------------------------------------------------------------
/docs/init.txt:
--------------------------------------------------------------------------------
 1 | .. program:: omics init
 2 | 
 3 | =========================================
 4 | init - initialize omics project directory
 5 | =========================================
 6 | 
 7 | .. argparse::
 8 |     :module: omics.init
 9 |     :func: get_argp
10 |     :prog: omics init
11 |     :nodefault:
12 |     :manpage:
13 | 
14 | 
15 | Exit Status
16 | ===========
17 | 
18 | Exits with non-zero upon encountering an error.
19 | 
20 | .. only:: man
21 | 
22 |     See Also
23 |     ========
24 | 
25 |     :manpage:`omics-prep(1)`, :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)`
26 | 


--------------------------------------------------------------------------------
/docs/mapping.txt:
--------------------------------------------------------------------------------
  1 | .. program:: omics mapping
  2 | 
  3 | ============================================
  4 | mapping - mapping reads to assembly
  5 | ============================================
  6 | 
  7 | Synopsis
  8 | ========
  9 | 
 10 | :program:`omics mapping` [OPTIONS...] [SAMPLE_DIRS...]
 11 | 
 12 | 
 13 | Description
 14 | ===========
 15 | 
 16 | This script implements the mapping step of the Geomicro Illumina Reads Processing
  17 | Pipeline.
 18 | 
 19 | 
 20 | Options
 21 | =======
 22 | 
 23 | .. option:: -a, --assembly FILE
 24 | 
 25 |     Specify assembly i.e. a fasta file of contigs or scaffolds, by default
 26 |     this is :file:`assembly.fa`
 27 | 
 28 | .. option:: -c, --chop
 29 | 
 30 |     Chop up contigs of assembly before mapping.  This should be done before
 31 |     binning with CONCOCT.
 32 | 
 33 | .. option:: --chunk-size N
 34 | 
 35 |     Size of chunks for chopping, the default is $DEFAULT_CHUNK_SIZE,
 36 |     implies ``--chop``.
 37 | 
 38 | .. option:: -f, --fwd-reads FILE
 39 | 
 40 |     Fastq file with either forward reads or single-end reads, by default this
 41 |     is :file:`fwd.fastq`
 42 | 
 43 | .. option:: -r, --rev-reads FILE
 44 | 
 45 |     Fastq file with reverse reads, by default this is :file:`rev.fastq`
 46 | 
 47 | .. option:: -i, --int-reads FILE
 48 | 
 49 |     Fastq file with paired-end, interleaved reads, can also be single-paired
  50 |     reads.  The default is :file:`dt_int.fastq`
 51 | 
 52 | .. option::  -o, --out-dir DIR
 53 | 
 54 |     Name of output directory, default is :file:`MAPPING` in the sample
 55 |     directory
 56 | 
 57 | .. option:: --no-indexing
 58 | 
  59 |     Do not index the assembly but use the existing index
 60 | 
 61 | .. option:: --cpus N
 62 | 
 63 |     Specify how many CPUs to use for parallel execution
 64 | 
 65 | .. option:: --force
 66 | 
 67 |     Overwrite any previous results.
 68 | 
 69 | .. option::  --working-dir=DIR
 70 | 
 71 |     Directory under which output is stored. By default this is the current
 72 |     directory.
 73 | 
 74 | .. option:: -h, --help
 75 | 
 76 |     Print help.
 77 | 
 78 | .. option:: --no-color
 79 | 
 80 |     Disable colorful output.
 81 | 
 82 | .. option:: -v, --verbosity=N
 83 | 
 84 |     Use one or multiple ``-v`` to increase verbosity of output or set a
 85 |     level of verbosity with ``--verbosity=N``.  By default the verbosity
 86 |     level is 1.  Setting verbosity to 0 silences the program.  A level of
 87 |     3 prints debugging info.
 88 | 
 89 | 
 90 | Exit Status
 91 | ===========
 92 | 
 93 | Exits with non-zero upon encountering an error.
 94 | 
 95 | .. only:: man
 96 | 
 97 |     See Also
 98 |     ========
 99 | 
100 |     :manpage:`omics-prep(1)`, :manpage:`omics-qc(1)`, :manpage:`omics(7)`,
101 |     :manpage:`illumina-reads-processing(7)`, :manpage:`omics-assemble(1)`
102 | 
103 | 


--------------------------------------------------------------------------------
/docs/merge-coverage.txt:
--------------------------------------------------------------------------------
 1 | .. program:: merge-coverage
 2 | 
 3 | ====================================
 4 | merge-coverage
 5 | ====================================
 6 | 
 7 | Synopsis
 8 | ========
 9 | 
10 | :program:`merge-coverage` [OPTIONS]... <input file>...
11 | 
12 | 
13 | Description
14 | ===========
15 | 
16 | Calculates the per-sample per-contig mean coverage from the per-sample coverage
 17 | tables made with :program:`omics mapping` into one file suitable as input file for
18 | :program:`concoct`.
19 | 
20 | 
21 | Options
22 | =======
23 | .. option:: -h, --help
24 | 
25 |     show help message and exit
26 | 
27 | .. option:: -a, --assembly FILE
28 | 
29 |     The assembly file. This is to be compatible with the CONCOCT workflow. If
30 |     used then contigs not covered by any sample will appear in the output with
 31 |     zeros (unlike the output of bedtools' :program:`genomeCoverageBed`.)
32 | 
33 | .. option:: -o, --out FILE
34 | 
35 |     Output file. By default stdout is used.
36 | 
37 | .. option:: --debug
38 | 
39 |     Print stack trace on errors.
40 | 
41 | .. option:: --length
42 | 
43 |     Insert column with contig length. The default is not to insert lengths.
44 | 
 45 | .. option:: -v, --verbose
46 | 
47 |     Report progress to stderr.
48 | 
49 | 
50 | Exit Status
51 | ===========
52 | 
53 | Exits with non-zero upon encountering an error.
54 | 
55 | .. only:: man
56 | 
57 |     See Also
58 |     ========
59 | 
60 |     :manpage:`omics-binning(1)`, :manpage:`omics-mapping(1)`,
61 |     :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)`
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/docs/prep.txt:
--------------------------------------------------------------------------------
 1 | .. program:: omics prep
 2 | 
 3 | ============================================================
 4 | prep - prepare compressed fastq files for further processing
 5 | ============================================================
 6 | 
 7 | Synopsis
 8 | ========
 9 | 
10 | :program:`omics prep` [OPTIONS]... [READS]...
11 | 
12 | 
13 | Description
14 | ===========
15 | 
 16 | Several tools of the Geomicro Illumina Reads Processing Pipeline assume
 17 | that raw or intermediate data is available in files following certain naming
 18 | conventions and formats and directory layout.  To get started the script
 19 | :program:`omics prep` will help with following these conventions.
20 | 
21 | 
22 | Options
23 | =======
24 | 
25 | .. option:: -f, --force
26 | 
27 |     Allow overwriting existing files.
28 | 
29 | .. option:: --keep-lanes-separate
30 | 
31 |     Keep data from different lanes separate. The default is to collect reads
32 |     originating from the same physical sample if sequencing was done using
33 |     several lanes.
34 | 
35 | .. option:: --suffix LIST
36 | 
 37 |     Comma-separated list of valid file suffixes used for raw reads. This is
38 |     used to find files when a directory is given as positional argument. By
39 |     default .fastq and .fastq.gz files are considered.
40 | 
41 | .. option:: -t N, --threads N, --cpus N
42 | 
43 |     Number of threads / CPUs to employ
44 | 
45 | .. option:: -h, --help
46 | 
47 |     Print help.
48 | 
49 | .. option:: -v, --verbose
50 | 
51 |     Use one or multiple ``-v`` to increase verbosity of output.
52 | 
53 | Exit Status
54 | ===========
55 | 
56 | Exits with non-zero upon encountering an error.
57 | 
58 | .. only:: man
59 | 
60 |     See Also
61 |     ========
62 | 
63 |     :manpage:`omics-qc(1)`, :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)`
64 | 


--------------------------------------------------------------------------------
/docs/qc-check.txt:
--------------------------------------------------------------------------------
 1 | .. program:: omics qc-check
 2 | 
 3 | ===================================================
 4 | qc-check - Quickly check results of quality control
 5 | ===================================================
 6 | 
 7 | .. argparse::
 8 |     :module: omics.qc_check
 9 |     :func: get_argp
10 |     :prog: omics qc-check
11 |     :nodefault:
12 |     :manpage:
13 | 
14 | 
15 | Exit Status
16 | ===========
17 | 
18 | Exits with non-zero upon encountering an error.
19 | 
20 | .. only:: man
21 | 
22 |     See Also
23 |     ========
24 | 
 25 |     :manpage:`omics-qc(1)`, :manpage:`omics(7)`,
26 |     :manpage:`illumina-reads-processing(7)`
27 | 
28 | 


--------------------------------------------------------------------------------
/docs/qc-sample.txt:
--------------------------------------------------------------------------------
  1 | .. program:: omics qc-sample
  2 | 
  3 | ==========================================================
  4 | qc-sample - quality control for metagenomic Illumina reads
  5 | ==========================================================
  6 | 
  7 | Synopsis
  8 | ========
  9 | 
 10 | :program:`omics qc-sample` [OPTIONS]...
 11 | 
 12 | 
 13 | Description
 14 | ===========
 15 | 
 16 | The :program:`omics qc-sample` script takes a pair of fastq-formatted files
 17 | (forward and reverse reads, data from a single sample) and runs them past
 18 | quality assessment (FastQC), dereplication, adapter removal (Scythe), and
 19 | quality-trimming (Sickle) steps and then prepares a FASTA-formatted interleaved
 20 | reads file that can be used as input for the IDBA assembler.  A second run of
 21 | FastQC allows a before-after comparison to see if these steps led to an
 22 | improvement in the quality of the data.
 23 | 
 24 | 
 25 | positional arguments
 26 | ====================
 27 | 
 28 | .. option:: samples
 29 | 
 30 |     List of directories, one per sample that contain the sample's reads. The
 31 |     default is to take the current directory and process a single sample. The
 32 |     names of the reads files must be fwd.fastq and rev.fastq, currently this
 33 |     can not be set manually. Use the omics-qc-sample script directly to
 34 |     specify filenames, omics-qc is just a wrapper after all.
 35 | 
 36 | 
 37 | Options
 38 | =======
 39 | 
 40 | .. option:: -f, --fwd=FILE
 41 | 
 42 |     fastq-formatted file with forward reads, by default this is ``fwd.fastq``
 43 |     as saved by the :program:`prep` script
 44 | 
 45 | .. option:: -r, --rev=FILE
 46 | 
 47 |     fastq-formatted file with reverse reads, by default this is ``rev.fastq``
 48 |     as saved by the :program:`prep` script
 49 | 
 50 | .. option:: --clean-only
 51 | 
 52 |     Remove all files made by a previous run of :program:`qc` and exit.
 53 | 
 54 | .. option:: -a, --adapters=FILE
 55 | 
 56 |     Specify the adapters file used in the adapter trimming step.  By default
 57 |     the Illumina adapter file TruSeq3-PE-2.fa as distributed by the Trimmomatic
 58 |     project will be used.
 59 | 
 60 | .. option:: --keep-all
 61 | 
 62 |     Keep all intermediate files, by default some not-so-important intermediate
 63 |     results will be deleted to save disk space
 64 | 
 65 | .. option:: --less-mem
 66 | 
 67 |     This option will reduce the dominating memory requirements for the
 68 |     de-replication step by half, typically, and double the computation time.
 69 | 
 70 | .. option:: --no-dereplicate
 71 | 
 72 |     Option to skip the de-replication step
 73 | 
 74 | .. option:: --no-fasta-interleave
 75 | 
 76 |     Skip building the interleaved fasta file, interleaved fastq files will
 77 |     still be built.
 78 | 
 79 | .. option:: -S, --scythe-sickle
 80 | 
 81 |     Use scythe + sickle instead of (the default) Trimmomatic
 82 | 
 83 | .. option::  --working-dir=DIR
 84 | 
 85 |     Directory under which output is stored. By default this is the current
 86 |     directory.
 87 | 
 88 | .. option:: -h, --help
 89 | 
 90 |     Print help.
 91 | 
 92 | .. option:: --no-color
 93 | 
 94 |     Disable colorful output.
 95 | 
 96 | .. option:: -v, --verbosity=N
 97 | 
 98 |     Use one or multiple ``-v`` to increase verbosity of output or set a
 99 |     level of verbosity with ``--verbosity=N``.  By default the verbosity
100 |     level is 1.  Setting verbosity to 0 silences the program.  A level of
101 |     3 prints debugging info.
102 | 
103 | 
104 | Exit Status
105 | ===========
106 | 
107 | Exits with non-zero upon encountering an error.
108 | 
109 | .. only:: man
110 | 
111 |     See Also
112 |     ========
113 | 
114 |     :manpage:`omics-prep(1)`, :manpage:`omics-qc(1)`, :manpage:`omics(7)`,
115 |     :manpage:`illumina-reads-processing(7)`
116 | 
117 | 


--------------------------------------------------------------------------------
/docs/qc.txt:
--------------------------------------------------------------------------------
 1 | .. program:: omics qc
 2 | 
 3 | ===================================================
 4 | qc - quality control for metagenomic Illumina reads
 5 | ===================================================
 6 | 
 7 | Synopsis
 8 | ========
 9 | 
10 | :program:`omics qc` [OPTIONS]... [SAMPLES]...
11 | 
12 | 
13 | Description
14 | ===========
15 | 
16 | The :program:`omics qc` script takes raw reads from multiple samples and runs
17 | them past quality assessment (FastQC), dereplication, adapter removal and
18 | quality-trimming with Trimmomatic (or alternatively with Scythe and Sickle)
19 | steps and then prepares a FASTA-formatted interleaved reads file that can be
20 | used as input for the assembler.  A second run of FastQC allows a before-after
21 | comparison to see if these steps led to an improvement in the quality of the
22 | data.
23 | 
24 | 
25 | positional arguments
26 | ====================
27 | 
28 | .. option:: samples
29 | 
30 |     List of directories, one per sample that contain the sample's reads. The
31 |     default is to take the current directory and process a single sample. The
32 |     names of the reads files must be fwd.fastq and rev.fastq, currently this
33 |     can not be set manually. Use the omics-qc-sample script directly to
34 |     specify filenames, omics-qc is just a wrapper after all.
35 | 
36 | 
37 | Options
38 | =======
39 | 
40 | .. option:: --clean-only
41 | 
42 |     Remove all files made by a previous run of :program:`qc` and exit.
43 | 
44 | .. option:: -a, --adapters=FILE
45 | 
46 |     Specify the adapters file used in the adapter trimming step.  By default
47 |     the Illumina adapter file TruSeq3-PE-2.fa as distributed by the Trimmomatic
48 |     project will be used.
49 | 
50 | .. option:: --keep-all
51 | 
52 |     Keep all intermediate files, by default some not-so-important intermediate
53 |     results will be deleted to save disk space
54 | 
55 | .. option:: --no-dereplicate
56 | 
57 |     Option to skip the de-replication step
58 | 
59 | .. option:: --no-fasta-interleave
60 | 
61 |     Skip building the interleaved fasta file, interleaved fastq files will
62 |     still be built.
63 | 
64 | .. option:: -S, --scythe-sickle
65 | 
66 |     Use scythe + sickle instead of (the default) Trimmomatic
67 | 
68 | .. option:: --cpus N, --threads N, -t N
69 |     Number of threads / CPUs to employ
70 | 
71 | .. option:: -h, --help
72 | 
73 |     Print help.
74 | 
75 | .. option:: -v, --verbose
76 | 
77 |     Use one or multiple ``-v`` to increase verbosity of output.
78 | 
79 | Exit Status
80 | ===========
81 | 
82 | Exits with non-zero upon encountering an error.
83 | 
84 | .. only:: man
85 | 
86 |     See Also
87 |     ========
88 | 
89 |     :manpage:`omics-prep(1)`, :manpage:`omics-qc-sample(1)`,
90 |     :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)`
91 | 
92 | 


--------------------------------------------------------------------------------
/docs/run.txt:
--------------------------------------------------------------------------------
 1 | .. program:: omics run
 2 | 
 3 | ========================================
 4 | run - run command inside omics container
 5 | ========================================
 6 | 
 7 | 
 8 | Synopsis
 9 | ========
10 | 
11 | :program:`omics run` [OPTIONS...] [``--``] COMMAND...
12 | 
13 | 
14 | Description
15 | ===========
16 | 
17 | This is a wrapper around :command:`singularity run`.  It runs the given command
18 | inside the omics container environment.  If :program:`omics run` is called from
19 | a shell then command may need to be protected from the shell with single or
20 | double quotes as needed.  The container will provide a clean environment, so if
21 | environment variables are needed inside, e.g. ``SOMEVAR=foobar``, then set instead
22 | ``SINGULARITYENV_SOMEVAR=foobar``, and the variable will be set with the
23 | ``SINGULARITYENV_`` prefix stripped.
24 | 
25 | Options
26 | =======
27 | 
28 | Any options given to this script must be separated from the COMMAND
29 | by a double dash, otherwise the COMMAND will be interpreted as
30 | options with unintended consequences likely.
31 | 
32 | .. option:: -i PATH, --container-image PATH
33 | 
34 |     Full path to singularity container image.  A default is provided if this
35 |     option is missing.
36 | 
37 | .. option:: -s option, --singularity option
38 | 
39 |     Options passed on to :command:`singularity`.  For instance, to additionally mount a
40 |     path ``/some/path``, add ``--singularity "-B /some/path"`` and the option
41 |     ``-B /some/path`` will be appended to the call to :command:`singularity run`.  The
42 |     empty space between the option and its parameter must be protected by
43 |     quotes to prevent premature word splitting by the shell.  Run
44 |     :command:`singularity run --help` to see what options are supported.
45 | 
46 | .. option:: --working-dir PATH
47 | 
48 |     Set the working directory for the command
49 | 
50 | .. option:: -h, --help
51 | 
52 |     Print help.
53 | 
54 | .. option:: --no-color
55 | 
56 |     Disable colorful terminal output
57 | 
58 | .. option:: -v, --verbosity N
59 | 
60 |     Set verbosity level or use one or multiple :option:`-v` to increase verbosity of output.
61 | 
62 | 
63 | Exit Status
64 | ===========
65 | 
66 | Exits with non-zero upon encountering an error.
67 | 
68 | .. only:: man
69 | 
70 |     See Also
71 |     ========
72 | 
73 |     :manpage:`omics-prep(1)`, :manpage:`omics(7)`, :manpage:`illumina-reads-processing(7)`, :manpage:`singularity(1)`
74 | 


--------------------------------------------------------------------------------
/docs/separate-interleaved.txt:
--------------------------------------------------------------------------------
 1 | .. program:: separate-interleaved
 2 | 
 3 | ====================
 4 | separate-interleaved
 5 | ====================
 6 | 
 7 | Synopsis
 8 | ========
 9 | 
10 | :program:`separate-interleaved` [:option:`-v`] [:option:`-f` FILE] [:option:`-r` FILE] <fastq-file>
11 | 
12 | 
13 | Description
14 | ===========
15 | 
16 | Separate interleaved reads fastq file into forward and reverse files.
17 | 
18 | Separate interleaved-reads fasta/q file into forward and reverse files. Input
19 | file must be in FASTQ or FASTA format. Sequence and quality score must be on a
20 | single line each, separated by a '+', read headers must start with '@' or '>'.
21 | The script will auto-detect the file format based on the first header. It is
22 | not checked if two reads are actually paired-end reads, however an error will
23 | be raised if the input file contains an odd number of sequences.
24 |  
25 | 
26 | 
27 | Options
28 | =======
29 | .. option:: -h, --help
30 | 
31 |     show this help message and exit
32 | 
33 | .. option:: -f FILE, --fwd FILE, --forward-out FILE
34 | 
35 |     Name of forward output file. A value is derived from
36 |     the input filename by default.
37 | 
38 | .. option:: -r FILE, --rev FILE, --reverse-out FILE
39 | 
40 |     Name of reverse output file. A value is derived from
41 |     the input filename by default.
42 | 
43 | .. option:: -v, --verbose
44 | 
45 |     Print more informative output
46 | 
47 | 
48 | Exit Status
49 | ===========
50 | 
51 | Exits with non-zero upon encountering an error.
52 | 
53 | .. only:: man
54 | 
55 |     See Also
56 |     ========
57 | 
58 |     :manpage:`omics(7)`,
59 |     :manpage:`illumina-reads-processing(7)`
60 | 
61 | 
62 | 


--------------------------------------------------------------------------------
/docs/template.txt:
--------------------------------------------------------------------------------
 1 | .. program:: $program
 2 | 
 3 | $header_line
 4 | $program - $short_description
 5 | $header_line
 6 | 
 7 | Synopsis
 8 | ========
 9 | 
10 | :program:`$program` $usage_args
11 | 
12 | 
13 | Description
14 | ===========
15 | 
16 | $long_description
17 | 
18 | 
19 | Options
20 | =======
21 | 
22 | $positional_args
23 | 
24 | $optional_args
25 | 
26 | 
27 | Exit Status
28 | ===========
29 | 
30 | Exits with non-zero upon encountering an error.
31 | 
32 | .. only:: man
33 | 
34 |     See Also
35 |     ========
36 | 
37 |     :manpage:`omics(7)`,
38 |     :manpage:`illumina-reads-processing(7)`
39 | 


--------------------------------------------------------------------------------
/docs/unchop-contigs.txt:
--------------------------------------------------------------------------------
 1 | .. program:: unchop-contigs
 2 | 
 3 | =================================================================
 4 | unchop-contigs - Stitch together chopped up contigs after binning
 5 | =================================================================
 6 | 
 7 | Synopsis
 8 | ========
 9 | 
10 | :program:`unchop-contigs` [-h] [-i [BACKUP_SUFFIX] | -o OUT_DIR] [-v] [input [input ...]]
11 | 
12 | 
13 | Description
14 | ===========
15 | 
16 | The CONCOCT binner recommends to chop long contigs into even length chunks to
17 | reduce bias related to varying contigs sizes.  This script glues them back
18 | together for downstream analysis of bins.
19 | 
20 | It is assumed that there is one fasta file per bin and that the fasta headers
21 | consist of the original contig id followed by a dot and a decimal chunk number.
22 | For example if a bin has three contig chunks named::
23 | 
24 |     k141_531759.0
25 |     k141_531759.1
26 |     k141_531759.2
27 | 
28 | they will be replaced by a single contig called::
29 | 
30 |     k141_531759.0-2
31 | 
32 | Contigs that do not have chunk information will be left alone.  However contigs
33 | will be sorted by contig id and numerical chunk number.  A consequence is that
34 | applying unchop-contigs a second time may change the order of some contigs.
35 | 
36 | 
37 | Options
38 | =======
39 | 
40 | .. option:: input
41 | 
42 |     List of directories or fasta files. The default is to take the
43 |     current directory.
44 | 
45 | .. option:: -h, --help
46 | 
47 |     show this help message and exit
48 | 
49 | .. option:: -i [BACKUP_SUFFIX], --in-place [BACKUP_SUFFIX]
50 | 
51 |     Replace input file. If provided, a backup of each file is made using
52 |     the provided suffix.
53 | 
54 | .. option:: -o OUT_DIR, --out-dir OUT_DIR
55 | 
56 |     Output directory. The default is the current directory.
57 | 
58 | .. option:: -v, --verbose
59 | 
60 |     Print diagnostic output.
61 | 
62 | 
63 | Exit Status
64 | ===========
65 | 
66 | Exits with non-zero upon encountering an error.
67 | 
68 | .. only:: man
69 | 
70 |     See Also
71 |     ========
72 | 
73 |     :manpage:`omics(7)`,
74 |     :manpage:`illumina-reads-processing(7)`
75 | 
76 | 


--------------------------------------------------------------------------------
/lib/Makefile:
--------------------------------------------------------------------------------
 1 | all:
 2 | 
 3 | # get all python files; POSIX find needs the explicit "." start directory
 4 | py_files = $(shell find . -name "*.py")
 5 | 
 6 | lib_files = liba.sh
 7 | 
 8 | EXTRA_DIST = Makefile
 9 | 
10 | install: installdir = $(DESTDIR)$(datadir)/$(package_name)
11 | install: install-py
12 | 	$(info Installing lib files ...)
13 | 	mkdir -p -- "$(installdir)"
14 | 	$(INSTALL_DATA) -t "$(installdir)" $(lib_files)
15 | 
16 | install-py: installdir = $(DESTDIR)$(prefix)/lib/python3.5/site-packages
17 | install-py:
18 | 	$(info Installing python packages ...)
19 | 	for i in $(py_files); do \
20 | 	    $(INSTALL_DATA) -D "$$i" "$(installdir)/$$i" || exit 1; \
21 | 	done
22 | 
23 | distdir:
24 | 	$(info Copying lib files ...)
25 | 	mkdir -p -- "../$(dist_dir)/lib"
26 | 	cp -a $(lib_files) $(EXTRA_DIST) "../$(dist_dir)/lib/"
27 | 	# copy each python package individually, stop at the first failure
28 | 	for i in $(shell find . -name __init__.py -printf "%h "); do \
29 | 	    mkdir -p "../$(dist_dir)/lib/$$i" && \
30 | 	    cp -p "$$i"/*.py "../$(dist_dir)/lib/$$i/" || exit 1; \
31 | 	done
32 | 


--------------------------------------------------------------------------------
/lib/omics/__main__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2019 Regents of The University of Michigan.
 2 | 
 3 | # This file is part of geo-omics-scripts.
 4 | 
 5 | # Geo-omics-scripts is free software: you can redistribute it and/or
 6 | # modify it under the terms of the GNU General Public License as published
 7 | # by the Free Software Foundation, either version 3 of the License, or (at
 8 | # your option) any later version.
 9 | 
10 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
11 | # WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | # General Public License for more details.
14 | 
15 | # You should have received a copy of the GNU General Public License along
16 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | """
19 | Invoke omics scripts and sub-commands
20 | """
21 | 
22 | from pathlib import Path
23 | import os
24 | 
25 | from . import process_command_line, get_main_arg_parser, get_available_commands
26 | from . import launch_cmd_as_sub_module
27 | 
28 | 
29 | def main():
30 |     argp = get_main_arg_parser(description=__doc__)
31 |     args = argp.parse_args()
32 |     args.script_dir = Path(args.script_dir)
33 |     if not args.script_dir.is_dir():
34 |         argp.error('Not a directory: {}'.format(args.script_dir))
35 | 
36 |     if args.command:
37 |         import_err = launch_cmd_as_sub_module(args, argp)
38 | 
39 |         # try calling as shellscript
40 |         cmd = args.command[0]
41 |         cmd_opts = args.command[1:]
42 |         cmdline = process_command_line(
43 |             cmd,
44 |             cmd_opts,
45 |             script_dir=args.script_dir
46 |         )
47 |         if args.dry_run:
48 |             print(*cmdline)
49 |         else:
50 |             try:
51 |                 p = os.execv(cmdline[0], cmdline)
52 |             except FileNotFoundError as e:
53 |                 if args.traceback:
54 |                     raise
55 |                 else:
56 |                     msg1 = '\n  {}: {}'.format(import_err.__class__.__name__,
57 |                                                import_err)
58 |                     msg2 = '  {}: {}'.format(e.__class__.__name__, e)
59 |                     msg3 = '  ==> Not a valid omics command: {}'.format(cmd)
60 |                     argp.error('\n'.join([msg1, msg2, msg3]))
61 |             except Exception as e2:
62 |                 if args.traceback:
63 |                     raise
64 |                 else:
65 |                     argp.error('Command "{}" failed: {}: {}'
66 |                                ''.format(cmdline, e2.__class__.__name__, e2))
67 |             else:
68 |                 argp.exit(status=p.returncode)
69 |     else:
70 |         subcmds = get_available_commands()
71 |         if subcmds:
72 |             print('Available commands:')
73 |             for i in subcmds:
74 |                 print('  ', i)
75 |             print('Type `omics -h` or `omics <cmd> -h` to get help.')
76 |         else:
77 |             argp.print_help()
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     main()
82 | 


--------------------------------------------------------------------------------
/lib/omics/_version.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2019 Regents of The University of Michigan.
 2 | 
 3 | # This file is part of geo-omics-scripts.
 4 | 
 5 | # Geo-omics-scripts is free software: you can redistribute it and/or
 6 | # modify it under the terms of the GNU General Public License as published
 7 | # by the Free Software Foundation, either version 3 of the License, or (at
 8 | # your option) any later version.
 9 | 
10 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
11 | # WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | # General Public License for more details.
14 | 
15 | # You should have received a copy of the GNU General Public License along
16 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | import os.path
19 | import subprocess
20 | 
21 | # Set to real version when distributed outside of git vcs
22 | VERSION = None
23 | 
24 | 
25 | def get_version(version=VERSION, raise_on_error=False):
26 |     """
27 |     Get the version string
28 | 
29 |     Get the hard-coded version if possible, then fall back to ask git.  If that
30 |     fails raise an exeception or return an 'unknown' depending on the
31 |     raise_on_error flag.
32 |     """
33 |     if version is not None:
34 |         return version
35 | 
36 |     try:
37 |         p = subprocess.run(
38 |             ['git', 'describe'],
39 |             cwd=os.path.dirname(__file__),
40 |             stdout=subprocess.PIPE,
41 |             stderr=subprocess.PIPE,
42 |             check=raise_on_error,
43 |         )
44 |     except Exception as e:
45 |         out = e.stdout.decode()
46 |         err = e.stderr.decode()
47 |         raise RuntimeError(
48 |             'Failed to get version info from git: {}: {}\n{}{}'
49 |             ''.format(e.__class__.__name__, e, out, err))
50 |     else:
51 |         version = p.stdout.decode().strip()
52 |         # version should be like 1.0.134-42-gd3adb33f
53 |         # make this a PEP440 local version like 1.0.134+42-gd3adb33f
54 |         version = version.replace('-', '+', 1)
55 |         if version:
56 |             return version
57 |     return 'unknown'
58 | 


--------------------------------------------------------------------------------
/lib/omics/checkm.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2019 Regents of The University of Michigan.
  2 | 
  3 | # This file is part of geo-omics-scripts.
  4 | 
  5 | # Geo-omics-scripts is free software: you can redistribute it and/or
  6 | # modify it under the terms of the GNU General Public License as published
  7 | # by the Free Software Foundation, either version 3 of the License, or (at
  8 | # your option) any later version.
  9 | 
 10 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 11 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 | # General Public License for more details.
 14 | 
 15 | # You should have received a copy of the GNU General Public License along
 16 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 17 | 
 18 | """
 19 | Utilities to work with CheckM output
 20 | """
 21 | 
 22 | import argparse
 23 | from collections import OrderedDict
 24 | import sys
 25 | 
 26 | from . import OmicsArgParser
 27 | 
 28 | ACTIONS = ['convert']
 29 | DEFAULT_ACTION = ACTIONS[0]
 30 | 
 31 | 
 32 | def load_tsv(file):
 33 |     """
 34 |     Load data from CheckM-style tsv output file
 35 | 
 36 |     :param file: Filehandle
 37 |     :return: List of OrderedDicts, one per rows
 38 |     """
 39 |     cols = None
 40 | 
 41 |     data = []
 42 |     for line in file:
 43 |         line = line.split('\t')
 44 |         dat = line[1].strip().lstrip('{').rstrip('}')
 45 |         dat = dat.split(', ')
 46 |         dat = map(lambda x: x.split(': '), dat)
 47 | 
 48 |         dd = OrderedDict()
 49 |         dd['bin'] = line[0]
 50 | 
 51 |         for k, v in dat:
 52 |             dd[k.strip("'")] = v
 53 | 
 54 |         if cols is None:
 55 |             cols = dd.keys()
 56 |         else:
 57 |             if cols != dd.keys():
 58 |                 raise RuntimeError('data keys, inconsistency: {} vs. {}'
 59 |                                    ''.format(cols, dd.keys()))
 60 | 
 61 |         for k, v in dd.items():
 62 |             try:
 63 |                 v = int(v)
 64 |             except ValueError:
 65 |                 try:
 66 |                     v = float(v)
 67 |                 except ValueError:
 68 |                     pass
 69 |             dd[k] = v
 70 | 
 71 |         data.append(dd)
 72 |     return data
 73 | 
 74 | 
 75 | def main(argv=None, namespace=None):
 76 |     prog = __loader__.name.replace('.', ' ')
 77 |     argp = OmicsArgParser(prog=prog, description=__doc__, threads=False)
 78 |     argp.add_argument(
 79 |         '-c', '--convert',
 80 |         action='store_true',
 81 |         help='Convert input to real tab-separated table',
 82 |     )
 83 |     argp.add_argument(
 84 |         'inputfile',
 85 |         metavar='FILE',
 86 |         type=argparse.FileType(),
 87 |         default=sys.stdin,
 88 |         help='Input file, usually a .tsv file written by CheckM',
 89 |     )
 90 |     args = argp.parse_args(args=argv, namespace=namespace)
 91 | 
 92 |     if args.convert:
 93 |         data = load_tsv(args.inputfile)
 94 |         print(*data[0].keys(), sep='\t')
 95 |         for row in data:
 96 |             print(*row.values(), sep='\t')
 97 |     else:
 98 |         argp.error('no action specified, e.g. --convert')
 99 | 
100 | 
101 | if __name__ == '__main__':
102 |     main()
103 | 


--------------------------------------------------------------------------------
/lib/omics/db/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the omics database module
3 | 
4 | This is a 'stand-alone' Django ORM and db backend for the geo-omics-scripts.
5 | """
6 | from .manage import main, setup
7 | 


--------------------------------------------------------------------------------
/lib/omics/db/apps.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2019 Regents of The University of Michigan.
 2 | 
 3 | # This file is part of geo-omics-scripts.
 4 | 
 5 | # Geo-omics-scripts is free software: you can redistribute it and/or
 6 | # modify it under the terms of the GNU General Public License as published
 7 | # by the Free Software Foundation, either version 3 of the License, or (at
 8 | # your option) any later version.
 9 | 
10 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
11 | # WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | # General Public License for more details.
14 | 
15 | # You should have received a copy of the GNU General Public License along
16 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | from django.apps import AppConfig
19 | 
20 | class OmicsDBConfig(AppConfig):  # Django app configuration for omics.db
21 |     name = 'omics.db'  # python package of this app (lib/omics/db)
22 |     label = 'omics_db'  # explicit app label, set instead of Django's default
23 |     verbose_name = 'geo-omics-scripts data base'  # human-readable name
24 | 


--------------------------------------------------------------------------------
/lib/omics/db/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Geo-omics/scripts/5f20a3418096a11604880b9c38565c9f4c9546eb/lib/omics/db/management/__init__.py


--------------------------------------------------------------------------------
/lib/omics/db/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Geo-omics/scripts/5f20a3418096a11604880b9c38565c9f4c9546eb/lib/omics/db/management/commands/__init__.py


--------------------------------------------------------------------------------
/lib/omics/db/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 | 


--------------------------------------------------------------------------------
/lib/omics/fastq2fasta.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2019 Regents of The University of Michigan.
 2 | 
 3 | # This file is part of geo-omics-scripts.
 4 | 
 5 | # Geo-omics-scripts is free software: you can redistribute it and/or
 6 | # modify it under the terms of the GNU General Public License as published
 7 | # by the Free Software Foundation, either version 3 of the License, or (at
 8 | # your option) any later version.
 9 | 
10 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
11 | # WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | # General Public License for more details.
14 | 
15 | # You should have received a copy of the GNU General Public License along
16 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | """
19 | Convert fastq into fasta
20 | """
21 | 
22 | import argparse
23 | import sys
24 | 
25 | from . import OmicsArgParser
26 | 
27 | 
28 | def convert(data, output, check=True):
29 |     """
30 |     Convert data from FASTQ into FASTA format
31 | 
32 |     :param data: File-like object with input data
33 |     :param output: File-like object for output
34 |     """
35 |     state = 0
36 |     for line in data:
37 |         if check:
38 |             if state == 0 and not line.startswith('@'):
39 |                 raise RuntimeError('Input not in FASTQ format? Expected line '
40 |                                    'to start with @: {}'.format(line))
41 |             elif state == 2 and not line == '+\n':
42 |                 raise RuntimeError('Input not in FASTQ format? Expected + '
43 |                                    'separator line: {}'.format(line))
44 | 
45 |         if state == 0:
46 |             output.write('>' + line[1:])
47 |         elif state == 1:
48 |             output.write(line)
49 | 
50 |         state = (state + 1) % 4
51 | 
52 |     if state != 0:
53 |         raise RuntimeError('Input not in FASTQ format? Expected total number '
54 |                            'of lines to be multiple of 4, last line: {}'
55 |                            ''.format(line))
56 | 
57 | 
58 | def main(argv=None, namespace=None):
59 |     argp = OmicsArgParser(
60 |         prog=__loader__.name.replace('.', ' '),
61 |         description=__doc__,
62 |         project_home=False,
63 |         threads=False,
64 |     )
65 |     argp.add_argument(
66 |         'inputfile',
67 |         metavar='FILE',
68 |         nargs='?',
69 |         type=argparse.FileType('r'),
70 |         default=sys.stdin,
71 |         help='Fastq file to be converted, by default data is read from stdin.'
72 |     )
73 |     argp.add_argument(
74 |         '-o', '--output',
75 |         metavar='FILE',
76 |         nargs='?',
77 |         type=argparse.FileType('w'),
78 |         default=sys.stdout,
79 |         help='Name of output filie.  Write to stdout by default.'
80 |     )
81 |     argp.add_argument(
82 |         '--force', '-f',
83 |         action='store_true',
84 |         help='Overwrite existing files',
85 |     )
86 |     argp.add_argument(
87 |         '--no-check',
88 |         action='store_false',
89 |         dest='check',
90 |         help='Skip sanity check on input data.  By default it is checked that '
91 |              'the input is indeed in fastq format.',
92 |     )
93 |     args = argp.parse_args(args=argv, namespace=namespace)
94 |     convert(args.inputfile, args.output, check=args.check)
95 | 
96 | 
97 | if __name__ == '__main__':
98 |     main()
99 | 


--------------------------------------------------------------------------------
/lib/omics/utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2019 Regents of The University of Michigan.
  2 | 
  3 | # This file is part of geo-omics-scripts.
  4 | 
  5 | # Geo-omics-scripts is free software: you can redistribute it and/or
  6 | # modify it under the terms of the GNU General Public License as published
  7 | # by the Free Software Foundation, either version 3 of the License, or (at
  8 | # your option) any later version.
  9 | 
 10 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 11 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13 | # General Public License for more details.
 14 | 
 15 | # You should have received a copy of the GNU General Public License along
 16 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 17 | 
 18 | """
 19 | Omics utilities collection
 20 | """
 21 | 
 22 | from collections import defaultdict
 23 | from matplotlib.pyplot import subplots
 24 | from pathlib import Path
 25 | 
 26 | 
 27 | def load_read_coordinates(reads_file, file_format='fastq'):
 28 |     """
 29 |     Load read coordinates into dict of lists of points
 30 | 
 31 |     :param reads_file: File object or str or Path of inout file
 32 |     :param str file_format: Either 'fasta' or 'fastq'
 33 |     """
 34 |     if file_format == 'fastq':
 35 |         mark = '@'
 36 |     elif file_format == 'fasta':
 37 |         mark = '>'
 38 |     else:
 39 |         raise ValueError('Illegal file format specified: {}'
 40 |                          ''.format(file_format))
 41 | 
 42 |     # handle input parameter type
 43 |     keep_open = False
 44 |     if isinstance(reads_file, str):
 45 |         reads_file = open(reads_file)
 46 |     elif isinstance(reads_file, Path):
 47 |         reads_file = reads_file.open()
 48 |     else:
 49 |         keep_open = True
 50 | 
 51 |     try:
 52 |         data = defaultdict(list)
 53 |         for line in reads_file:
 54 |             if line.startswith(mark):
 55 |                 try:
 56 |                     point = line.split(':')[4:7]
 57 |                 except Exception:
 58 |                     raise RuntimeError('Failed parsing sequence header '
 59 |                                        '(split by :): {}'.format(line))
 60 |                 point[2] = point[2].split()[0]
 61 | 
 62 |                 try:
 63 |                     data[int(point[0])].append((int(point[1]), int(point[2])))
 64 |                 except Exception:
 65 |                     raise RuntimeError('Failed parsing sequence header '
 66 |                                        '(int conversion): {}'.format(line))
 67 |     except Exception:
 68 |         raise
 69 |     else:
 70 |         return data
 71 |     finally:
 72 |         if not keep_open:
 73 |             reads_file.close()
 74 | 
 75 | 
 76 | def scatter(points):
 77 |     """
 78 |     Diplay scatterplot
 79 |     """
 80 |     fig, ax = subplots()
 81 |     ax.scatter(
 82 |         [i[0] for i in points],
 83 |         [i[1] for i in points],
 84 |         marker='.',
 85 |     )
 86 |     fig.show()
 87 | 
 88 | 
 89 | def hist(data):
 90 |     """
 91 |     Diplay histogram
 92 |     """
 93 |     import matplotlib.pyplot as plt
 94 |     fig = plt.figure()
 95 |     ax = fig.add_subplot(111)
 96 |     ax.hist(
 97 |         data,
 98 |         bins='auto',
 99 |     )
100 |     plt.show()
101 |     plt.close()
102 | 


--------------------------------------------------------------------------------
/localenv:
--------------------------------------------------------------------------------
 1 | # To use this repo as a local installation run
 2 | #
 3 | #   $ source localenv
 4 | #
 5 | # from a bash prompt
 6 | 
 7 | base=$(readlink -f "$(dirname "${BASH_SOURCE[0]}")")
 8 | export PATH="$base/scripts:$PATH"
 9 | export PYTHONPATH="$base/lib:$PYTHONPATH"
10 | mkdir -p -- "$base/share/geo-omics-scripts"
11 | ln -f -s -t "$base/share/geo-omics-scripts/" ../../lib/liba.sh
12 | ln -f -s -t "$base/share/geo-omics-scripts/" ../../TruSeq3-PE-2+omics.fa
13 | ln -f -s -t "$base/share/geo-omics-scripts/" ../../phylosiftrc
14 | source "$base/bash-completion/omics"
15 | 


--------------------------------------------------------------------------------
/modulefiles/flux.omics/.version:
--------------------------------------------------------------------------------
#%Module1.0

# Version of this module that is used when none is requested explicitly.
# NOTE(review): a version "2" modulefile sits next to this file; confirm
# that "1" is still the intended default.
set ModulesVersion "1"
4 | 


--------------------------------------------------------------------------------
/modulefiles/flux.omics/1:
--------------------------------------------------------------------------------
#%Module1.0
#
# to be installed as /dept/geology/geomicro/data9/flux/modulefiles/geomicro/omics/1
#
# this module's maintainer's email: heinro@umich.edu
#
# On load this module pulls in the tool modules listed below, then appends
# the omics installation under OMICS_ROOT to PATH, MANPATH and PYTHONPATH
# and sets PYTHONUSERBASE to it.
#

# Help text, written to stderr
proc ModulesHelp { } {
    puts stderr "Load this module to use the 'omics scripts."
}

module-whatis "All-in-one omics module"

# The dependencies are only requested while this module is being loaded
if { [module-info mode load] } {
    # prerequisites for quast module
    module load boost
    # gsl library needed by concoct
    module load gsl
    # prerequisites for quast, concoct module
    module load python-anaconda2/latest
    # prerequisite for bedtools2 module
    module load samtools
    # Load standard software packages
    # (required for geo-omics scripts)
    module load bedtools2
    module load bwa
    module load fastqc
    module load geomicro/idba
    module load geomicro/scythe
    module load ncbi-blast
    module load phylosift
    module load python-anaconda3
    module load quast
    module load sickle
    module load megahit
}

# Root of the omics installation
set OMICS_ROOT /dept/geology/geomicro/data9/flux/apps/omics_root

# Appended, so entries already present in these variables come first
append-path PATH $OMICS_ROOT/bin
append-path MANPATH $OMICS_ROOT/share/man
# NOTE(review): the site-packages path is tied to python3.5; confirm it
# matches the interpreter provided by the python-anaconda3 module.
append-path PYTHONPATH $OMICS_ROOT/lib/python3.5/site-packages

setenv PYTHONUSERBASE $OMICS_ROOT
44 | 


--------------------------------------------------------------------------------
/modulefiles/flux.omics/2:
--------------------------------------------------------------------------------
#%Module1.0
#
# to be installed as /dept/geology/geomicro/data9/flux/modulefiles/geomicro/omics/2
#
# this module's maintainer's email: heinro@umich.edu
#
# This version only extends PATH and MANPATH with the comics installation;
# it loads no other modules.
#

# Help text, written to stderr
proc ModulesHelp { } {
    puts stderr "Module to enable the 'comics' command to enter the 'omics container"
}

module-whatis "Allows access to the 'omics container"

# Root of the comics installation
set COMICS_ROOT /dept/geology/geomicro/data9/flux/apps/comics

# Prepended, so these directories are searched before existing entries
prepend-path PATH $COMICS_ROOT/bin
prepend-path MANPATH $COMICS_ROOT/share/man
17 | 


--------------------------------------------------------------------------------
/modulefiles/install:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # --- CAUTION ---
 4 | #
 5 | # This script will install module files to production !
 6 | #
 7 | # --- CAUTION ---
 8 | 
 9 | set -e
10 | 
11 | # this will fail, no root rights after all
12 | # contact Mike Messina
13 | #scp -p omics/1 cayman:/usr/share/Modules/modulefiles/omics/1
14 | #scp -p omics/1 vondamm:/usr/share/Modules/modulefiles/omics/1
15 | 
16 | # should work as heinro user
17 | scp -p flux.omics/1 guaymas.earth.lsa.umich.edu:/gmb/data9/flux/modulefiles/geomicro/omics/1
18 | 


--------------------------------------------------------------------------------
/modulefiles/omics/.version:
--------------------------------------------------------------------------------
#%Module1.0

# Version of this module that is used when none is requested explicitly
set ModulesVersion "1"
4 | 


--------------------------------------------------------------------------------
/modulefiles/omics/1:
--------------------------------------------------------------------------------
#%Module1.0
#
# to be installed as /usr/share/Modules/modulefiles/omics/1 on vondamm, cayman
#
# this module's maintainer's email: heinro@umich.edu
#
# On load this module pulls in the tool modules listed below, then appends
# the omics installation under OMICS_ROOT to PATH, MANPATH and PYTHONPATH
# and sets PYTHONUSERBASE to it.
#

# Help text, written to stderr
proc ModulesHelp { } {
    puts stderr "Load this module to use the (geo-)omics scripts."
}

module-whatis "All-in-one omics module"

# The dependencies are only requested while this module is being loaded
if { [module-info mode load] } {
    # Load standard software packages
    module load AnacondaPython3
    module load AnacondaPython
    module load Scythe
    module load blast
    module load PhyloSift
    module load idba
    module load QUAST
    module load bwa
    module load samtools
    module load bedtools
    module load megahit
}

# Root of the omics installation
set OMICS_ROOT /geomicro/data9/flux/apps/omics_root

# Appended, so entries already present in these variables come first
append-path PATH $OMICS_ROOT/bin
append-path MANPATH $OMICS_ROOT/share/man
# NOTE(review): the site-packages path is tied to python3.5; confirm it
# matches the interpreter provided by the AnacondaPython3 module.
append-path PYTHONPATH $OMICS_ROOT/lib/python3.5/site-packages

setenv PYTHONUSERBASE $OMICS_ROOT
36 | 


--------------------------------------------------------------------------------
/phylosiftrc:
--------------------------------------------------------------------------------
 1 | # PhyloSift run control file
 2 | #
 3 | # see also:
 4 | # https://phylosift.wordpress.com/tutorials/running-phylosift/phylosift-run-control-file/
 5 | #
 6 | 
 7 | use Env qw($OMICS_REFERENCE_DATA);
 8 | 
 9 | my $common = "data9/flux/reference-data/phylosift";
10 | my @ref_alternatives = (
11 |     "$OMICS_REFERENCE_DATA/phylosift",
12 |     "/geomicro/$common",
13 |     "/gmb/$common",
14 |     "/dept/geology/geomicro/$common",
15 | );
16 | 
17 | foreach (@ref_alternatives) {
18 |     $ref_data_path = $_ if (-d $_);
19 | }
20 | print "[phylosiftrc] using reference data from: $ref_data_path\n";
21 | 
22 | $marker_path = "$ref_data_path";
23 | $ncbi_path = "$ref_data_path";
24 | 
25 | # prevent this:
26 | #     Error: requested HMM banded DP mx of 4749.29 Mb > 2500.00 Mb limit.
27 | #     Increase limit with --mxsize or tau with --tau.
28 | $cm_align_long_mxsize = "10000";
29 | $cm_align_short_mxsize = "10000";
30 | 
31 | # Use last-align bundled binaries if available
32 | 
33 | $lastdb        = "/usr/lib/phylosift/lastdb";
34 | $lastal        = "/usr/lib/phylosift/lastal";
35 | 
36 | -e $lastdb or $lastdb="";
37 | -e $lastal or $lastal="";
38 | 


--------------------------------------------------------------------------------
/scripts/COPYRIGHT.tetramer_freqs_esom:
--------------------------------------------------------------------------------
 1 | For tetramer_freqs_esom:
 2 | 
 3 | ###############################################################################
 4 | Copyright (C) 2007 Anders Andersson (anders.andersson@scilifelab.se)
 5 | 
 6 | This program is free software; you can redistribute it and/or
 7 | modify it under the terms of the GNU General Public License
 8 | as published by the Free Software Foundation; either version 2
 9 | of the License, or (at your option) any later version.
10 | 
11 | This program is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 | GNU General Public License for more details.
15 | 
16 | You should have received a copy of the GNU General Public License
17 | along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | 
19 | Anders Andersson
20 | Assistant Professor
21 | SciLifeLab
22 | School of Biotechnology
23 | KTH Royal Institute of Technology
24 | Stockholm, Sweden
25 | Email: anders.andersson@scilifelab.se
26 | ###############################################################################
27 | 


--------------------------------------------------------------------------------
/scripts/ESOM_binning_results_parser:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019 Derek Smith
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# pipefail: a failing stage (e.g. no *.conf file to read) must fail the script
set -eo pipefail

# Build ESOM_binning_results.txt from all *.conf files in the current
# directory.  Everything runs as a single pipeline so no fixed-name
# intermediate files are written to (or left behind in) the directory:
#   1. concatenate the input files
#   2. drop comment lines
#   3. swap the first two columns, tab-separated
#   4. prefix the second column with "Bin_"
#   5. turn underscores between two digits into dots ...
#   6. ... but restore the underscore after a "k141" prefix
cat ./*.conf \
  | sed '/^#/ d' \
  | awk 'BEGIN{OFS="\t"}{print $2,$1}' \
  | awk 'BEGIN{OFS="\t"}{$2="Bin_"$2; print}' \
  | perl -pe 's/(?<=\d)_(?=\d)/./g' \
  | sed 's/k141\./k141_/g' \
  > ESOM_binning_results.txt
31 | 


--------------------------------------------------------------------------------
/scripts/Ebot.Output.Extract.Gi.Title.Rev3:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | """
 21 | This python script will take the output from ebot obtain summaries and
 22 | extract the title line and the gi line.
 23 | """
 24 | 
 25 | from sys import argv
 26 | import re
 27 | 
 28 | 
 29 | def main():
 30 |     ebotFile = open(argv[1], 'r')
 31 | 
 32 |     currentLine = ""
 33 |     gi = ""
 34 |     title = ""
 35 |     journal = ""
 36 |     titleTrue = False
 37 |     titleCounter = 0
 38 |     JournalCounter = 0
 39 |     currentLine = ""
 40 |     firstEntry = True
 41 |     sequence = False
 42 |     fasta = ""
 43 | 
 44 |     for line in ebotFile:
 45 |         if line.startswith("VERSION") and not titleTrue:
 46 |             if not firstEntry and journal != "Unpublished" \
 47 |                     and title != "Direct Submission ":
 48 |                 # print "a." + fasta + ".b"
 49 |                 print(gi + "\t" + title + "\t" + journal + "\t" + fasta)
 50 |                 # print "\t" + "fail"
 51 |                 # print "\n" + journal
 52 |                 title = ""
 53 |                 currentLine = ""
 54 |                 fasta = ""
 55 |                 # exit()
 56 |             else:
 57 |                 title = ""
 58 |                 currentLine = ""
 59 |                 fasta = ""
 60 |                 # exit()
 61 |             line = line.split()
 62 |             gi = line[2].replace("GI:", "")
 63 |             titleCounter = 0
 64 |             JournalCounter = 0
 65 |             firstEntry = False
 66 | 
 67 |         elif line.startswith("  TITLE") and titleCounter == 0:
 68 |             title = line
 69 |             title = title.replace("  TITLE     ", "")
 70 |             title = title.replace("\n", " ")
 71 |             title = title.strip("\n")
 72 |             titleTrue = True
 73 |             titleCounter += 1
 74 |             firstEntry = False
 75 | 
 76 |         elif line.find("  JOURNAL") != -1 and titleTrue:
 77 |             titleTrue = False
 78 |             journal = line
 79 |             journal = journal.replace("  JOURNAL   ", "")
 80 |             journal = journal.strip("\n")
 81 |             # print "a." + journal + ".b"
 82 | 
 83 |         elif titleTrue:
 84 |             currentLine = line.replace("\n", "")
 85 |             title += currentLine.replace("            ", "")
 86 |             title += " "
 87 |             title = title.strip("\n")
 88 | 
 89 |         elif line.startswith("ORIGIN"):
 90 |             sequence = True
 91 | 
 92 |         elif sequence and line.find("//") == -1:
 93 |             splitLine = line.split(" ")
 94 |             for element in splitLine:
 95 |                 if re.match("\D", element):
 96 |                     # print element
 97 |                     fasta += element.strip("\t\n")
 98 | 
 99 |         elif line.find("//") != -1:
100 |             sequence = False
101 | 
102 |     # print gi + "\t" + title
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     main()
107 | 


--------------------------------------------------------------------------------
/scripts/GI_info_XMLParser:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | use strict;
22 | 
# Usage: GI_info_XMLParser <input XML> <output file>
#
# Writes one line per document summary: the content of the <ID> element and
# of the "Title" item, separated by a tab.  The line is terminated when the
# closing </DocSum> tag is seen.

my $in= $ARGV[0];
my $out= $ARGV[1];

die "Usage: $0 <input XML> <output file>\n"
	unless defined $in && defined $out;

# "-" keeps the meaning it had with two-argument open: stdin / stdout
my ($summ, $outfh);
if ($in eq '-') {
	$summ = \*STDIN;
}
else {
	open($summ, '<', $in) || die "$in: $!\n";
}
if ($out eq '-') {
	$outfh = \*STDOUT;
}
else {
	open($outfh, '>', $out) || die "$out: $!\n";
}

while(my $line=<$summ>){
	if($line=~ m/<ID>(\d*)<\/ID>/i){
		print {$outfh} $1."\t";
	}
	elsif($line=~ m/<Item Name\=\"Title\" Type\=\"String\">([\w\W]*)<\/Item>/i){
		print {$outfh} $1;
	}
	elsif($line=~ m/<\/DocSum>/i){
		print {$outfh} "\n";
	}
}

close($summ);
# closing flushes the buffer, so write errors show up here
close($outfh) || die "$out: $!\n";
44 | 


--------------------------------------------------------------------------------
/scripts/Metabat_to_anvio_parser:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019 Derek Smith
 4 | # Copyright 2019 Regents of The University of Michigan.
 5 | 
 6 | # This file is part of geo-omics-scripts.
 7 | 
 8 | # Geo-omics-scripts is free software: you can redistribute it and/or
 9 | # modify it under the terms of the GNU General Public License as published
10 | # by the Free Software Foundation, either version 3 of the License, or (at
11 | # your option) any later version.
12 | 
13 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
14 | # WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 | # General Public License for more details.
17 | 
18 | # You should have received a copy of the GNU General Public License along
19 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
20 | 
21 | #############################################################################################
22 | #
# Parse metabat output into a format that can be imported as a collection into the anvio profile
24 | #
25 | ############################################################################################
26 | 
set -eu

[[ "$#" -gt 0 ]] || { echo "Arguments required: Metabat output files" >&2; exit 1; }
# For each input file the bin ID is derived from the file name (without the
# .fa suffix, all dots replaced by underscores to make anvio happy) and
# appended as a tab-separated column to every line.  The result for all
# files goes into a single binning results file for anvio.
# (Assumes that filenames (if coming from different directories) don't collide)
for i in "$@"; do
    binid=$(basename -s .fa -- "$i")
    # replace every dot, not just the first one
    binid=${binid//./_}
    # The bin ID is passed via the environment so that it is taken
    # literally, whatever characters it contains.  Reading via stdin keeps
    # awk from interpreting a file name containing "=" as an assignment.
    BINID="$binid" awk '{ print $0 "\t" ENVIRON["BINID"] }' < "$i"
done > Metabat_binning_results.txt
39 | 


--------------------------------------------------------------------------------
/scripts/U2T:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | use strict;
22 | use Getopt::Long;
23 | 
24 | =head1 NAME
25 | 
26 | U2T - Converts U -> T and removes gaps
27 | 
28 | 
29 | =head1 SYNOPSIS
30 | 
B<U2T> B<-in> I<input fasta file> B<-out> I<output fasta file>
32 | 
33 | 
34 | =head1 SEE ALSO
35 | 
36 | L<omics(1)>, L<illumina-reads-processing(7)>
37 | 
38 | =head2 Other local resources
39 | 
40 | =over
41 | 
42 | =item [1]
43 | 
44 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
45 | 
46 | =item [2]
47 | 
48 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
49 | 
50 | =back
51 | 
52 | =head2 Web
53 | 
54 | =over
55 | 
56 | =item [3]
57 | 
58 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
59 | 
60 | =item [4]
61 | 
62 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
63 | 
64 | =item [5]
65 | 
66 | L<Github repository|https://github.com/Geo-omics/scripts>
67 | 
68 | =back
69 | 
70 | =cut
71 | 
my $seqFile;
my $out;

GetOptions(
	"in:s" => \$seqFile,
	"out:s"	=>	\$out,
	"h|help"	=>	sub {system('perldoc', $0); exit;},
);

die "Usage: $0 -in <input fasta file> -out <output fasta file>\n"
	unless defined $seqFile && length $seqFile
		&& defined $out && length $out;

# Read the input one fasta record at a time: with ">" as the input record
# separator each read returns everything up to the next header.
$/=">";
open (my $seq_fh, '<', $seqFile) || die "Couldn't open $seqFile\n";
open (my $out_fh, '>', $out) || die "Couldn't open $out for writing: $!\n";
while (my $line = <$seq_fh>) {
	next if $line=~ m/^#/;
	# remove the trailing ">" that belongs to the next record
	chomp $line;
	$line=~ s/\r//;
	# the first read yields an empty record (text before the first ">")
	next unless $line;

	# first line is the header, the rest is the sequence
	my($seqDesc, @sequence)=split(/\n/, $line);
	my $seq=join("", @sequence);

	# U -> T, for lower case sequence data as well
	$seq=~ tr/Uu/Tt/;
	# remove gaps and any whitespace
	$seq=~ s/[\.\-\s]//g;
	print {$out_fh} ">". $seqDesc."\n".$seq."\n";
}
close $seq_fh;
# closing flushes the buffer, so write errors show up here
close($out_fh) || die "Error writing $out: $!\n";
99 | 


--------------------------------------------------------------------------------
/scripts/VizBin_parser:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019 Derek Smith
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | #################################################################################
21 | #
22 | # DJS 5 September 2018
23 | #
24 | # Run this shell script in a directory containing bin fasta files to get a
25 | # summary tab-delimited file to import the bins as a collection in ANVIO.  It
26 | # was originally written for VizBin collections, but will work for any group of
27 | # Bin fastas generated from any binning program
28 | #
29 | #################################################################################
30 | 
set -eu

#######################################
# Write VizBin_binning_results.txt for the bin fasta files (*.fa) in the
# current directory: one line per fasta header holding the header text
# (without the leading ">") and the bin name (file name without the .fa
# suffix), separated by a tab.
# Outputs: VizBin_binning_results.txt, a warning on stderr if there is no
#          *.fa file
#######################################
main() {
  local fasta bin_name
  local found=0

  # Everything is written straight to the result file: no intermediate
  # *.list files, so unrelated *.list files in the directory stay untouched.
  for fasta in *.fa; do
    # skips the literal pattern if nothing matches
    [[ -f "$fasta" ]] || continue
    found=1
    # strip the suffix from the file name only, contig names are left alone
    bin_name=${fasta%.fa}
    # the bin name goes via the environment so it is taken literally
    BIN_NAME="$bin_name" \
      awk '/^>/ { print substr($0, 2) "\t" ENVIRON["BIN_NAME"] }' < "$fasta"
  done > VizBin_binning_results.txt

  if [[ "$found" -eq 0 ]]; then
    echo "VizBin_parser: no *.fa files found in $PWD" >&2
  fi
}

main "$@"
49 | 


--------------------------------------------------------------------------------
/scripts/antiSmash_summary:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
set -e

#######################################
# Print summary counts for antiSMASH results in the current directory.
# Outputs: three lines on stdout; structures.list with the names of the
#          top-level directories that have entries under structures/
#######################################
main() {
  local path

  # lines that are not ">" headers
  echo "GeneClusters: $(grep -vc "^>" Overview.geneclusters.txt)"
  echo "smcogs: $(grep -c "^>>" Overview.smcogs.txt)"

  # Let the shell expand the glob instead of parsing ls output; the part
  # before the first "/" is the top-level directory.
  for path in */structures/*; do
    [[ -e "$path" ]] || continue
    printf '%s\n' "${path%%/*}"
  done | sort -u > structures.list
  # read via stdin so wc prints the count only, without the file name
  echo "Structures: $(wc -l < structures.list)"
}

main "$@"
28 | 


--------------------------------------------------------------------------------
/scripts/assemblyModules:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | # Copyright 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
# Load the environment modules used for assembly.
# NOTE(review): when this file is executed as a script the modules are only
# loaded in the script's own shell process, which ends right afterwards;
# presumably it is meant to be sourced -- confirm.
module load AMOS/3.1.0
module load velvet/1.1.07-MAX99-OPENMP
module load MetaVelvet/1.0.01

#meta-velvetg
26 | 
27 | 


--------------------------------------------------------------------------------
/scripts/asv-map-update:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2020 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | Update fasta and map files with existing ASV accessions
21 | 
22 | Takes a reference fasta file of known ASV sequences, and from a Mothur run a
23 | sequence id->OTU map and fasta file and replaces the OTUs in the map with ASVs.
24 | The output is a two-column table, mapping mothur OTUs to ASVs from the
25 | reference.
26 | 
27 | All fasta input must be 1-line per sequence.
28 | 
29 | This script is part of the Schmidt Lab 16S mothur pipeline.
30 | """
31 | import argparse
32 | import sys
33 | 
def _read_fasta(handle):
    """
    Generate (id, sequence) tuples from a 1-line-per-sequence fasta file

    The id is the first word of the header.  Blank lines are skipped.

    :param handle: Iterable over the lines of the fasta file
    :raises ValueError: if a sequence line is not directly preceded by a
                        header or if a header has no id
    """
    cur = None
    for lineno, line in enumerate(handle, start=1):
        line = line.strip()
        if not line:
            continue
        if line.startswith('>'):
            words = line.lstrip('>').split()
            if not words:
                raise ValueError(
                    f'{getattr(handle, "name", "fasta")}, line {lineno}: '
                    f'header without sequence id'
                )
            cur = words[0]
        elif cur is None:
            raise ValueError(
                f'{getattr(handle, "name", "fasta")}, line {lineno}: '
                f'sequence without header, input must have one line per '
                f'sequence'
            )
        else:
            yield cur, line
            # a header is good for a single sequence line only
            cur = None


def main(argv=None):
    """
    Parse the command line and write the OTU to ASV mapping

    :param argv: List of command line arguments, None makes argparse use
                 the process' command line
    """
    argp = argparse.ArgumentParser(description=__doc__)
    argp.add_argument(
        'asv_fa',
        metavar='asv-fasta',
        type=argparse.FileType(),
        help='Fasta file sequences with ASV accessions',
    )
    argp.add_argument(
        'map',
        metavar='seq-otu-map',
        type=argparse.FileType(),
        help='Mothur Sequence ID to OTU mapping, a two-column table, '
             'usually with suffix asv0.precluster.denovo.uchime.pick.map',
    )
    argp.add_argument(
        'fasta',
        type=argparse.FileType(),
        help='Mothur fasta file.  Fasta headers should contain sequence IDs.',
    )
    argp.add_argument(
        '-o', '--output',
        default=sys.stdout,
        type=argparse.FileType('w'),
        help='A two-column output mapping OTUs to ASVs, this file can then '
             'be used with the --map option for the shared-set-accessions script',
    )
    argp.add_argument('--version', action='version', version='%(prog)s '
                      'is part of geo-omics-scripts VERSION_PLACEHOLDER')
    args = argp.parse_args(argv)

    # Load ASV fasta: sequence -> ASV accession
    with args.asv_fa:
        asvs = {seq: asv for asv, seq in _read_fasta(args.asv_fa)}

    # Load mothur fasta: sequence id -> sequence with alignment removed
    with args.fasta:
        seqs = {
            seqid: seq.replace('-', '').strip('.')
            for seqid, seq in _read_fasta(args.fasta)
        }

    otu2asv = {}
    with args.map:
        for line in args.map:
            line = line.strip()
            if not line:
                continue
            seqid, otu = line.split('\t')
            try:
                otu2asv[otu] = asvs[seqs[seqid]]
            except KeyError:
                # unknown sequence id or sequence not among the ASVs
                continue

    for otu, asv in sorted(otu2asv.items()):
        args.output.write(f'{otu}\t{asv}\n')

    args.output.close()


if __name__ == '__main__':
    main()
99 | 


--------------------------------------------------------------------------------
/scripts/bins2fasta:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Command-line shim: the actual implementation lives in the omics package,
# this script only hands control to its main() entry point.
from omics import bins2fasta as _bins2fasta


_bins2fasta.main()
24 | 


--------------------------------------------------------------------------------
/scripts/calcN50:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2017, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
use strict;

# Usage: calcN50 <fasta>     (use "-" to read the fasta from STDIN)
#
# Prints, tab-separated, the number of sequences in the fasta file followed
# by N50/L50 and N95/L95.  Sequences are sorted longest first; Nx is the
# length of the sequence at which the running total of lengths first reaches
# x% of the total length, Lx is the number of sequences needed to get there.

my $fasta=$ARGV[0];
die "Usage: $0 <fasta>\n" unless defined $fasta;

## Read Fasta File and collect the length of every sequence ##
my $totalLength=0;
my $totalContigs=0;
my @allLen;

my $FASTA;
if ($fasta eq "-"){
	$FASTA=\*STDIN;
}
else{
	open($FASTA, "<", $fasta) || die "[ERROR] $fasta: $!\n";
}
{
	# Read record-wise: one record is everything between two ">" characters.
	# "local" restores the normal line separator when the block ends.
	local $/=">";
	while(my $line=<$FASTA>){
		chomp $line;
		next unless $line;

		# The first line of a record is the header, the rest is sequence.
		my ($header, @sequence)=split(/\n/, $line);
		my $length=length(join("", @sequence));

		push (@allLen, $length);
		$totalLength += $length;
		$totalContigs++;
	}
}
close($FASTA);

print "Total_Contigs:\t$totalContigs\n";

## Compute N50, L50, N95 and L95 ##
my @sortedLen = sort {$b <=> $a} @allLen;
my $cumLen=0;
my $numContig=0;
my ($n50, $l50, $n95, $l95);
foreach my $len(@sortedLen){
	$cumLen+=$len;
	$numContig++;

	# Remember only the first sequence that crosses the 50% mark.
	if(!defined $n50 && $cumLen >= $totalLength * 0.50){
		($n50, $l50)=($len, $numContig);
	}
	# The 95% mark can never be reached before the 50% mark, so the scan
	# is complete once it has been crossed.
	if($cumLen >= $totalLength * 0.95){
		($n95, $l95)=($len, $numContig);
		last;
	}
}

# Nothing is reported for an input without sequences.
if(defined $n50){
	print "N50:\t$n50\n";
	print "L50:\t$l50\n";
}
if(defined $n95){
	print "N95:\t$n95\n";
	print "L95:\t$l95\n";
}
63 | 


--------------------------------------------------------------------------------
/scripts/clusterDensity:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | clusterDensity - Do this.
 24 | 
 25 | 
 26 | =head1 SYNOPSIS
 27 | 
 28 | B<clusterDensity>
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | Do this.
 34 | 
 35 | 
 36 | =head1 OPTIONS
 37 | 
 38 | =over 8
 39 | 
 40 | =item B<-version>, B<-v> I<BOOLEAN>
 41 | 
 42 | version of the current script
 43 | 
 44 | =item B<-help>, B<-h> I<BOOLEAN>
 45 | 
 46 | This message.
 47 | 
 48 | =back
 49 | 
 50 | 
 51 | =head1 AUTHOR
 52 | 
 53 | Sunit Jain, (Mon Mar  2 16:41:57 EST 2015)
 54 | sunitj [AT] umich [DOT] edu
 55 | 
 56 | 
 57 | =head1 SEE ALSO
 58 | 
 59 | L<omics(1)>, L<illumina-reads-processing(7)>
 60 | 
 61 | =head2 Other local resources
 62 | 
 63 | =over
 64 | 
 65 | =item [1]
 66 | 
 67 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 68 | 
 69 | =item [2]
 70 | 
 71 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 72 | 
 73 | =back
 74 | 
 75 | =head2 Web
 76 | 
 77 | =over
 78 | 
 79 | =item [3]
 80 | 
 81 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 82 | 
 83 | =item [4]
 84 | 
 85 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 86 | 
 87 | =item [5]
 88 | 
 89 | L<Github repository|https://github.com/Geo-omics/scripts>
 90 | 
 91 | =back
 92 | 
 93 | =cut
 94 | 
 95 | use strict;
 96 | use Getopt::Long;
 97 | use FileHandle;
 98 | use File::Basename;
 99 | 
100 | my $help;
101 | my $version=fileparse($0)."\tv0.0.1b";
102 | my $clustFile="results.txt";
103 | GetOptions(
104 |         'c|clusters:s'=>\$clustFile,
105 | 	'v|version'=>sub{print $version."\n"; exit;},
106 | 	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
107 | );
108 | print "\# $version\n";
109 | 
110 | my $FILE=FileHandle->new();
111 | open( $FILE, "<", $clustFile) || die $!;
112 | while(my $line=<$FILE>){
113 |     chomp $line;
114 |     next unless $line;
115 |     
116 |     my($thresh, $fMeasure, $combinedClust)=split(/\t/, $line);
117 |     my @clustComma=split(/\,/,$combinedClust);
118 |     my @clustSemiColon=split(/\;/, $combinedClust);
119 |     my $totalClusters=scalar(@clustSemiColon);
120 |     my $totalNodes=scalar(@clustComma);
121 |     print $thresh."\t".$totalNodes."\t".$totalClusters."\t".($totalNodes/$totalClusters)."\n";
122 | }
123 | close $FILE;
124 | 
125 | 
126 | 


--------------------------------------------------------------------------------
/scripts/countInstances:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | countInstances - count the number of times a value is seen in a column
 24 | 
 25 | 
 26 | =head1 SYNOPSIS
 27 | 
 28 | B<countInstances> B<-in> tab-delimited-input.txt B<-out> output-filename.txt B<-col> column-number
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | count the number of times a value is seen in a column
 34 | 
 35 | 
 36 | =head1 OPTIONS
 37 | 
 38 | =over 8
 39 | 
 40 | =item B<-in> I<character>
 41 | 
 42 | input file name
 43 | 
 44 | =item B<-out> I<character>
 45 | 
 46 | output file name
 47 | 
 48 | =item B<-col> I<integer>
 49 | 
 50 | column to subtotal; start counting from 1.
 51 | 
 52 | =item B<-sum> I<integer>
 53 | 
 54 | sum the column # instead of incrementing by 1.
 55 | 
 56 | =back
 57 | 
 58 | 
 59 | =head1 AUTHOR
 60 | 
 61 | Sunit Jain, (Thu Jul 18 20:37:35 EDT 2013)
 62 | sunitj [AT] umich [DOT] edu
 63 | 
 64 | 
 65 | =head1 SEE ALSO
 66 | 
 67 | L<omics(1)>, L<illumina-reads-processing(7)>
 68 | 
 69 | =head2 Other local resources
 70 | 
 71 | =over
 72 | 
 73 | =item [1]
 74 | 
 75 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 76 | 
 77 | =item [2]
 78 | 
 79 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 80 | 
 81 | =back
 82 | 
 83 | =head2 Web
 84 | 
 85 | =over
 86 | 
 87 | =item [3]
 88 | 
 89 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 90 | 
 91 | =item [4]
 92 | 
 93 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 94 | 
 95 | =item [5]
 96 | 
 97 | L<Github repository|https://github.com/Geo-omics/scripts>
 98 | 
 99 | =back
100 | 
101 | =cut
102 | 
use strict;
use Getopt::Long;

# Tally the values found in one column of a tab-delimited file.  By default
# every occurrence counts as 1; with -sum N the value of column N is added
# instead.  Column numbers on the command line start at 1.

my $version="0.0.1b";
my $col=2;
my ($in, $out, $sum);
GetOptions(
	'in=s'=>\$in,
	'o|out=s'=>\$out,
	'col:i'=>\$col,
	'sum:i'=>\$sum,
	'v|version'=>sub{print $version."\n"; exit;},
	'h|help'=>sub{system('perldoc', $0); exit;},
);

die "[ERROR] Both -in and -out are required; see -help\n"
	unless (defined $in && defined $out);

# Convert the 1-based column numbers to 0-based array indices exactly once.
# (Decrementing inside the read loop would move the summed column one
# further to the left on every input line.)
$col--;
my $sumIdx;
$sumIdx=$sum - 1 if $sum;

my %counts;
open(my $IN, "<", $in) || die "[ERROR] $in: $!\n";
while(my $line=<$IN>){
	chomp $line;
	next if $line=~ /^#/;

	my @cols=split(/\t/, $line);
	if(defined $sumIdx){
		$counts{$cols[$col]}+=$cols[$sumIdx];
	}
	else{
		$counts{$cols[$col]}++;
	}
}
close $IN;

open(my $OUT, ">", $out) || die "[ERROR] $out: $!\n";
# Sorted so that repeated runs produce identical files.
foreach my $key(sort keys %counts){
	print $OUT $key."\t".$counts{$key}."\n";
}
close $OUT;
143 | 


--------------------------------------------------------------------------------
/scripts/createFastq:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
use warnings;
use strict;
use File::Basename;

# Usage: createFastq <sequences.fasta|sequences.fna>
#
# Combines a fasta file with its quality file into a fastq file with
# Phred+33 encoded qualities.  The quality file (<base>.qual) is read from,
# and the result (<base>.fastq) is written to, the current working
# directory, because basename() drops the directory part of the input path.

my $inFasta = $ARGV[0];
die "Usage: $0 <sequences.fasta|sequences.fna>\n" unless defined $inFasta;

my $baseName = basename($inFasta, qw/.fasta .fna/);
my $inQual = $baseName . ".qual";
my $outFastq = $baseName . ".fastq";

my %seqs;

# Read record-wise: one record is everything between two ">" characters.
$/ = ">";

open (my $FASTA, "<", $inFasta) or die "[ERROR] $inFasta: $!\n";
# The first read returns whatever precedes the first ">"; throw it away.
my $junk = (<$FASTA>);

while (my $frecord = <$FASTA>) {
	chomp $frecord;
	my ($fdef, @seqLines) = split /\n/, $frecord;
	next unless defined $fdef;
	$seqs{$fdef} = join '', @seqLines;
}

close $FASTA;

open (my $QUAL, "<", $inQual) or die "[ERROR] $inQual: $!\n";
$junk = <$QUAL>;
open (my $FASTQ, ">", $outFastq) or die "[ERROR] $outFastq: $!\n";

while (my $qrecord = <$QUAL>) {
	chomp $qrecord;
	my ($qdef, @qualLines) = split /\n/, $qrecord;
	next unless defined $qdef;

	# Sequences are looked up by the complete header line.  A quality
	# record without a sequence cannot become a valid fastq record.
	unless (exists $seqs{$qdef}) {
		warn "[WARNING] no sequence found for '$qdef'; skipping\n";
		next;
	}

	# split ' ' splits on any run of whitespace and ignores leading and
	# trailing blanks, so padded lines do not yield empty scores (which
	# would otherwise be written as extra '!' characters).
	my @quals = split ' ', join(' ', @qualLines);
	if (scalar(@quals) != length($seqs{$qdef})) {
		warn "[WARNING] '$qdef': ".scalar(@quals)." quality values for "
			.length($seqs{$qdef})." bases\n";
	}

	print $FASTQ '@'.$qdef."\n";
	print $FASTQ $seqs{$qdef}."\n";
	print $FASTQ "+\n";
	print $FASTQ join('', map { chr($_ + 33) } @quals)."\n";
}

close $QUAL;
close $FASTQ or die "[ERROR] $outFastq: $!\n";
66 | 


--------------------------------------------------------------------------------
/scripts/createNodes:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | createNodes - Do this.
 24 | 
 25 | 
 26 | =head1 SYNOPSIS
 27 | 
 28 | B<createNodes>
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | Do this.
 34 | 
 35 | 
 36 | =head1 OPTIONS
 37 | 
 38 | =over 8
 39 | 
 40 | =item B<-config>, B<-c> I<CHAR>
 41 | 
 42 | Config file explicitly declaring which columns to make nodes out of and which to use as properties; Recommended but Optional; default = guess.
 43 | 
 44 | =item B<-nodes>, B<-n> I<CHAR>
 45 | 
 46 | Read this nodes file created by one of the parsers and create nodes in the database; Required.
 47 | 
 48 | =item B<-port>, B<-p> I<INT>
 49 | 
 50 | Port which the database is listening to; default: 7474.
 51 | 
 52 | =item B<-version>, B<-v> I<BOOLEAN>
 53 | 
 54 | version of the current script
 55 | 
 56 | =item B<-help>, B<-h> I<BOOLEAN>
 57 | 
 58 | This message.
 59 | 
 60 | =back
 61 | 
 62 | 
 63 | =head1 AUTHOR
 64 | 
 65 | Sunit Jain, (Tue Jun 30 08:14:43 EDT 2015)
 66 | sunitj [AT] umich [DOT] edu
 67 | 
 68 | 
 69 | =head1 SEE ALSO
 70 | 
 71 | L<omics(1)>, L<illumina-reads-processing(7)>
 72 | 
 73 | =head2 Other local resources
 74 | 
 75 | =over
 76 | 
 77 | =item [1]
 78 | 
 79 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 80 | 
 81 | =item [2]
 82 | 
 83 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 84 | 
 85 | =back
 86 | 
 87 | =head2 Web
 88 | 
 89 | =over
 90 | 
 91 | =item [3]
 92 | 
 93 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 94 | 
 95 | =item [4]
 96 | 
 97 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 98 | 
 99 | =item [5]
100 | 
101 | L<Github repository|https://github.com/Geo-omics/scripts>
102 | 
103 | =back
104 | 
105 | =cut
106 | 
use strict;
use Getopt::Long;
use FileHandle;
use File::Basename;

# Reads a nodes file line by line.
# NOTE(review): this script is a stub -- the loop body is empty, and
# $configFile and $port are parsed but never used.

my $version=fileparse($0)."\tv0.0.1b";
my ($configFile, $nodesFile);
my $port = 7474;
GetOptions(
	'c|config:s'=>\$configFile,
	# The nodes file is a file name, so it must be a string option; a
	# numeric option type rejects anything that is not an integer.
	'n|nodes=s'=>\$nodesFile,
	'p|port:i'=>\$port,
	'v|version'=>sub{print $version."\n"; exit;},
	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
);
print "\# $version\n";

die "[ERROR] A nodes file is required (-nodes); see -help\n"
	unless (defined $nodesFile && length $nodesFile);

my $NODES=FileHandle->new();
open( $NODES, "<", $nodesFile) || die "[ERROR] $nodesFile: $!\n";
while(my $line=<$NODES>){
	# TODO: node creation is not implemented yet.
}
close $NODES;
131 | 
132 | 
133 | 


--------------------------------------------------------------------------------
/scripts/curateDB:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | use strict;
 21 | use Getopt::Long;
 22 | 
 23 | my $in;
 24 | my $out = $$.".fasta";
 25 | my $delTmp;
 26 | my $hasGItags;
 27 | my $printDuplicates;
 28 | 
 29 | GetOptions(
 30 | 	'i|in:s'=> \$in,
 31 | 	'o|out:s'=> \$out,
 32 | 	't|del'=> \$delTmp,
 33 | 	'g|gi'=>\$hasGItags,
 34 | 	'd|duplicates'=>\$printDuplicates,
 35 | 	'h|help'=> sub{	system('perldoc', $0); exit; },
 36 | );
 37 | 
 38 | if (! $in){	system('perldoc', $0); exit; }
 39 | 
 40 | open (IN, $in)||die "[ERROR] $in: $!\n";
 41 | my $tmpFile= $$.".tmp";
 42 | open (TMPW, ">".$tmpFile);
 43 | my $u=0;
 44 | my $s=0;
 45 | my $t=0;
 46 | my %seen;
 47 | 
 48 | while(my $line=<IN>){
 49 | 	chomp $line;
 50 | 	next unless $line;
 51 | 	if ($line=~ m/^>/){
 52 | 		$t++;
 53 | 		$line=~ s/\>/\+/g;
 54 | 		$line=~ s/^\+/>/;
 55 | 		print TMPW $line."\n";
 56 | 	}
 57 | 	else{
 58 | 		print TMPW $line."\n";
 59 | 	}
 60 | }
 61 | close IN;
 62 | close TMPW;
 63 | print "#Number of Headers before Curation: $t\n";
 64 | 
 65 | open (TMPR, $tmpFile) || die "[ERROR] $tmpFile: $!\n";
 66 | open (OUT, ">".$out);
 67 | my $printSeq= 0;
 68 | $/= ">";
 69 | while (my $line=<TMPR>){
 70 | 	chomp $line;
 71 | 	next unless $line;
 72 | 
 73 | 	if ($hasGItags){
 74 | 		my ($giTag, $giNum, @etc)=split(/\|/, $line);
 75 | 		print OUT ">".$line unless ($seen{$giNum});
 76 | 		$seen{$giNum}++;
 77 | 	}
 78 | 	else{
 79 | 		my($name, @seqs)=split(/\n/,$line);
 80 | 		print OUT ">".$line."\n" unless $seen{$name};
 81 | 		$seen{$name}++;
 82 | 	}
 83 | }
 84 | $/= "\n";
 85 | close TMPR;
 86 | close OUT;
 87 | print "#Number of Sequences after Curation: ".keys(%seen)."\n";
 88 | if ($printDuplicates){
 89 | 	
 90 | 	foreach my $k(keys %seen){
 91 | 		#print ".";
 92 | 		print $k."\t".$seen{$k}."\n" if ($seen{$k} >1);
 93 | 	}
 94 | }
 95 | if (! $delTmp){
 96 | 	print "#Deleting temporary file...\n";
 97 | 	unlink $tmpFile;
 98 | }
 99 | exit;
100 | 


--------------------------------------------------------------------------------
/scripts/dada2shared:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2020 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | """
21 | Convert a DADA2 sequence table into a fasta file and a mothur shared file
22 | """
23 | import argparse
24 | from pathlib import Path
25 | import re
26 | 
27 | 
def parse_header(line, basename):
    """
    Turn the header row of the sequence table into (id, sequence) pairs

    The header row holds one sequence per column.  Columns may be separated
    by any whitespace and may or may not be enclosed in double quotes.  Ids
    are made from the given basename and the 1-based column number.
    """
    return [
        ('{}_{}'.format(basename, num), seq.strip('"'))
        for num, seq in enumerate(line.split(), start=1)
    ]


def format_row(row, total):
    """
    Convert one data row of the sequence table into a mothur shared row

    The first column is the sample id, all further columns are counts.
    Dashes in the sample id are replaced by underscores.

    Returns a tuple (line, replaced) with the finished output line and a
    flag telling whether the sample id had to be changed.  Returns None for
    blank rows.
    """
    fields = row.split()
    if not fields:
        return None

    sample = fields[0].strip('"')
    replaced = '-' in sample
    if replaced:
        sample = sample.replace('-', '_')

    counts = '\t'.join(fields[1:])
    return 'dada2asv\t{}\t{}\t{}\n'.format(sample, total, counts), replaced


def main():
    argp = argparse.ArgumentParser(description=__doc__)
    argp.add_argument(
        'infile',
        metavar='dada2_sequence_table',
        type=argparse.FileType(),
        help='Sequence table from dada2, saved with write.table()',
    )
    argp.add_argument('--version', action='version', version='%(prog)s '
                      'is part of geo-omics-scripts VERSION_PLACEHOLDER')
    args = argp.parse_args()

    # compute output file names, both files go to the current directory
    basename = Path(args.infile.name).stem
    outfasta = Path() / (basename + '.fa')
    outshared = Path() / (basename + '.shared')

    with args.infile as infile:
        # read sequences, they make up the first row of the table
        seqs = parse_header(infile.readline(), basename)

        print('Total ASVs:', len(seqs))

        with outfasta.open('w') as fasta:
            for seqid, seq in seqs:
                fasta.write('>{}\n{}\n'.format(seqid, seq))

        print('Fasta file written:', outfasta)

        with outshared.open('w') as shared:
            header = ['label', 'Group', 'numOtus'] + [i for i, _ in seqs]
            shared.write('\t'.join(header) + '\n')
            total = len(seqs)
            replace_warning_shown = False
            for row in infile:
                result = format_row(row, total)
                if result is None:
                    # blank line, e.g. at the end of the file
                    continue
                line, replaced = result
                if replaced and not replace_warning_shown:
                    print('Some sample ids have dashes replaced by '
                          'underscores to make them mothur-compatible')
                    replace_warning_shown = True
                shared.write(line)

    print('Shared file written:', outshared)


if __name__ == '__main__':
    main()
76 | 


--------------------------------------------------------------------------------
/scripts/derep_getReadAbundance:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | use strict;
 22 | use Getopt::Long;
 23 | 
 24 | my ($list,$bam, $stats, $fwdClust, $revClust);
 25 | my $out=$$.".abundance";
 26 | 
 27 | GetOptions(
 28 | 	'b|bam=s'=>\$list,
 29 | 	'fwd=s'=>\$fwdClust,
 30 | 	'rev=s'=>\$revClust,
 31 | 	'stats=s'=>\$stats,
 32 | 	'o|out:s'=>\$out,
 33 | 	'h'=>sub{system('perldoc', $0); exit;},
 34 | );
 35 | my %clust;
 36 | 
 37 | open (FCLUST, $fwdClust) || die "[ERROR] $fwdClust:$!\n";
 38 | while(my $line=<FCLUST>){
 39 | 	next if ($line=~ /^#/);
 40 | 	chomp($line);
 41 | 	$line=~ s/\r//;
 42 | 	next unless $line;
 43 | 
 44 | 	my($cNum, $size, $rep, @seqNames)=split(/\t/, $line);
 45 | 	my ($name, $strand)=split(/\s/, $rep);
 46 | 	$name=~ s/^@//;
 47 | 
 48 | 	$clust{$name}=$size;
 49 | }
 50 | close FCLUST;
 51 | 
 52 | open (RCLUST, $revClust) || die "[ERROR] $revClust:$!\n";
 53 | while(my $line=<RCLUST>){
 54 | 	next if ($line=~ /^#/);
 55 | 	chomp($line);
 56 | 	$line=~ s/\r//;
 57 | 	next unless $line;
 58 | 
 59 | 	my($cNum, $size, $rep, @seqNames)=split(/\t/, $line);
 60 | 	my ($name, $strand)=split(/\s/, $rep);
 61 | 	$name=~ s/^@//;
 62 | 
 63 | 	if ($clust{$name}){
 64 | 		$clust{$name}+=$size;
 65 | 	}
 66 | 	else{
 67 | 		$clust{$name}=$size;
 68 | 	}
 69 | }
 70 | close RCLUST;
 71 | 
 72 | print "All Clusters read into memory...\n";
 73 | 
 74 | open (STATS, $stats);
 75 | my %index;
 76 | while(my $line=<STATS>){
 77 | 	chomp $line;
 78 | 	next unless $line;
 79 | 
 80 | 	my ($name, $len, $mapped, $unmapped)=split(/\t/, $line);
 81 | 	$name=~ s/\|/\\\|/;
 82 | 
 83 | 	$index{$name}=$mapped;
 84 | }
 85 | close STATS;
 86 | 
 87 | print "All Stats read into memory...\n";
 88 | 
 89 | open(OUT, ">".$out);
 90 | print OUT "Gene\tDerepMapped\tTotalMapped\n";
 91 | my $allReads=0;
 92 | foreach my $i(keys %index){
 93 | 	my @list=`samtools view -F0x4 $bam $i | cut -f 1`;
 94 | 	chomp @list;
 95 | 
 96 | 	my $totalMapped=0;
 97 | 	foreach my $l(@list){
 98 | 		$totalMapped+= $clust{$l};
 99 | 		$allReads+=$clust{$l};
100 | 	}
101 | 	my $derepMapped= $index{$i};
102 | 	print OUT $i."\t".$derepMapped."\t".$totalMapped."\n";
103 | }
104 | 
105 | print "Total Reads mapped: $allReads\n";
106 | 


--------------------------------------------------------------------------------
/scripts/do2folder:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# do2folder DIR
#
# Template: run a command on every non-empty file in DIR.  For each file the
# input name is available as $i and the suggested output name as $OUT.
# Diagnostics go to stderr and the exit status is non-zero when DIR is
# missing or is not a directory.
do2folder() {
    if [ -z "$1" ]; then
        echo "command failed: Give a directory name with files" >&2
        return 1
    fi
    if [ ! -d "$1" ]; then
        echo "$1 does not exist" >&2
        return 1
    fi
    for i in "$1"/*; do
        # -s is false for empty files, and also for the unexpanded pattern
        # left behind when the directory contains no files at all
        if [ ! -s "$i" ]; then echo "$i is empty; skipping..."; continue; fi
        OUT="$i.out"
        # replace the following line with the desired command and $i as input and $OUT as output
        echo "IN: $i, OUT:$OUT"
    done
}

do2folder "$@"
30 | 


--------------------------------------------------------------------------------
/scripts/do2list:
--------------------------------------------------------------------------------
 1 | #! /bin/sh
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# do2list LISTFILE
#
# Template: run a command on every non-empty file named in LISTFILE, one
# file name per line.  For each file the input name is available as $i and
# the suggested output name as $OUT.  Diagnostics go to stderr and the exit
# status is non-zero when LISTFILE is missing or empty.
do2list() {
    if [ -z "$1" ]; then
        echo "command failed: Give a file name with list" >&2
        return 1
    fi
    if [ ! -s "$1" ]; then
        echo "$1 does not exist" >&2
        return 1
    fi
    # The list is read line by line so that names containing blanks or glob
    # characters stay intact.  It is read through file descriptor 3, which
    # leaves stdin untouched for the command run inside the loop.  The
    # "|| [ -n ... ]" part picks up a last line without trailing newline.
    while read -r i <&3 || [ -n "$i" ]; do
        [ -n "$i" ] || continue
        if [ ! -s "$i" ]; then echo "$i is empty; skipping..."; continue; fi
        OUT="$i.out"
        # replace the following line with the desired command and $i as input and $OUT as output
        echo "IN: $i, OUT:$OUT"
    done 3< "$1"
}

do2list "$@"
28 | 


--------------------------------------------------------------------------------
/scripts/extractEuks:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Works on the current directory, which is expected to hold one *phylodist
# file and the *.fna file(s) passed to extractSeqs.
# NOTE(review): which identifiers the selected columns hold (presumably
# locus ids and contig names) cannot be seen from this script -- confirm.

legacy_consolidateJGIdata -DIR . -OUTDIR consolidated

# Columns 6 and 2, written as "<col6> <TAB> <col2>".  The separator has to be
# a literal tab: "cut -f 2" below relies on it to isolate the second value.
awk < consolidated/Unclassified.tsv -F'\t' '{ print $6, "\t", $2 }' > locus_contig.list

# First column of every phylodist row whose fifth column starts with an
# entry (text before the first ";") containing "Eukaryota".
awk < *phylodist -F'\t' '{ print $1, "\t", $5 }' | cut -f 1 -d ";" | grep "Eukaryota" | cut -f 1 > eukaryota.list

# Look those ids up in the table from above, keep the second column and
# strip the blank that awk put in front of it.
grep -F -f eukaryota.list locus_contig.list | cut -f 2 | sort -u | sed "s# ##" >  eukaryota_contigs.list

extractSeqs -e -l eukaryota_contigs.list -f *.fna -o euksRemoved.fasta
25 | 


--------------------------------------------------------------------------------
/scripts/firefox_already_running:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
# Delete the "lock" and ".parentlock" files found anywhere below
# $HOME/.mozilla/firefox/ and report which ones were found.
remove_firefox_locks() {
    profile_dir="$HOME/.mozilla/firefox/"

    # Collected for the messages only; the deletion itself is left to find
    # so that paths containing blanks are handled correctly.
    LOCK=$(find "$profile_dir" -name lock)
    PLOCK=$(find "$profile_dir" -name .parentlock)

    printf 'Deleting Lock file:\t%s\n' "$LOCK"
    find "$profile_dir" -name lock -exec rm -f -- {} +
    printf 'Deleting Parent Lock file: %s\n' "$PLOCK"
    find "$profile_dir" -name .parentlock -exec rm -f -- {} +

    echo "To see why these files had to be deleted, see: http://www.mattcutts.com/blog/how-to-fix-firefox-is-already-running-error/"
    echo "Try running firefox again..."
}

remove_firefox_locks
31 | 


--------------------------------------------------------------------------------
/scripts/fixpod2:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | """
21 | Fix additional stuff in POD in perl scripts
22 | 
23 | Note: Makes assumptions about input, little error checking
24 | """
25 | import argparse
26 | import re
27 | import sys
28 | 
29 | 
30 | argp = argparse.ArgumentParser(description=__doc__)
31 | argp.add_argument('inputfile', type=argparse.FileType())
32 | argp.add_argument('-w', '--write-to-file', action='store_true')
33 | 
34 | args = argp.parse_args()
35 | 
36 | out = ''
37 | 
38 | for line in args.inputfile:
39 |     if line.startswith('=item '):
40 |         item_line = line.strip()
41 | 
42 |         empty_line = args.inputfile.readline()  # empty line following =item
43 |         if empty_line.strip():
44 |             raise RuntimeError('non-empty line following =head')
45 | 
46 |         descr = args.inputfile.readline()
47 |         m = re.match(r'^(or|OR)\s*-(?P<opt>\w+)\s*:?\s*(?P<descr>.*)$', descr)
48 |         if m is None:
49 |             # remove any leading ':\t+' from description
50 |             descr = re.sub('^:\s+', '',  descr.strip())
51 |         else:
52 |             # Fix options with 'or' between long and short option name
53 |             _, opt, descr = m.groups()
54 |             item_line = item_line + ', B<-{}>'.format(opt)
55 | 
56 |         out += item_line + '\n\n' + descr + '\n'
57 |     else:
58 |         out += line
59 | 
60 | # write output
61 | if args.write_to_file:
62 |     args.inputfile.close()
63 |     outfile = open(args.inputfile.name, 'w')
64 | else:
65 |     outfile = sys.stdout
66 | 
67 | outfile.write(out)
68 | 


--------------------------------------------------------------------------------
/scripts/fixpod3:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | """
21 | Fix additional stuff in POD in perl scripts
22 | 
23 | Note: Makes assumptions about input, little error checking
24 | """
25 | import argparse
26 | import re
27 | import sys
28 | 
29 | 
30 | argp = argparse.ArgumentParser(description=__doc__)
31 | argp.add_argument('inputfile', type=argparse.FileType())
32 | argp.add_argument('-w', '--write-to-file', action='store_true')
33 | 
34 | args = argp.parse_args()
35 | 
36 | out = ''
37 | 
38 | for line in args.inputfile:
39 |     if line.startswith('=item '):
40 |         item_line = line.strip()
41 | 
42 |         empty_line = args.inputfile.readline()  # empty line following =item
43 |         if empty_line.strip():
44 |             raise RuntimeError('non-empty line following =head')
45 | 
46 |         descr = args.inputfile.readline()
47 |         m = re.match(
48 |             r'^\[(boolean|characters?|flag|float|integers?|real number)\]'
49 |             r'\s*:?\s*(?P<descr>.*)$',
50 |             descr,
51 |             flags=re.IGNORECASE
52 |         )
53 |         if m is not None:
54 |             # move non-boolean types to =item line
55 |             valtype, descr = m.groups()
56 |             descr += '\n'  # add nl lost by matching
57 |             valtype = valtype.lower().rstrip('s')
58 |             if valtype not in ['flag', 'boolean']:
59 |                 opts = item_line[6:]  # remove leading '=item '
60 |                 opts = opts.split(', ')
61 |                 opts = [i + ' I<{}>'.format(valtype) for i in opts]
62 |                 opts = ', '.join(opts)
63 |                 item_line = '=item ' + opts
64 | 
65 |         out += item_line + '\n\n' + descr
66 |     else:
67 |         out += line
68 | 
69 | # write output
70 | if args.write_to_file:
71 |     args.inputfile.close()
72 |     outfile = open(args.inputfile.name, 'w')
73 | else:
74 |     outfile = sys.stdout
75 | 
76 | outfile.write(out)
77 | 


--------------------------------------------------------------------------------
/scripts/fixpod4:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | """
21 | Fix additional stuff in POD in perl scripts
22 | 
23 | Note: Makes assumptions about input, little error checking
24 | """
25 | import argparse
26 | import re
27 | import sys
28 | 
29 | 
30 | argp = argparse.ArgumentParser(description=__doc__)
31 | argp.add_argument('inputfile', type=argparse.FileType())
32 | argp.add_argument('-w', '--write-to-file', action='store_true')
33 | argp.add_argument('-c', '--check-name-only', action='store_true')
34 | 
35 | args = argp.parse_args()
36 | 
37 | out = ''
38 | 
39 | for line in args.inputfile:
40 |     if line.startswith('=head1 NAME'):
41 |         out += line
42 | 
43 |         empty_line = args.inputfile.readline()  # empty line following =head
44 |         if empty_line.strip():
45 |             raise RuntimeError('non-empty line following =head')
46 | 
47 |         short_descr = args.inputfile.readline()
48 |         m = re.match(r'^{} - '.format(args.inputfile.name), short_descr)
49 |         if m is None:
50 |             if args.check_name_only:
51 |                 print('bad NAME section:', args.inputfile.name,
52 |                       '=>', short_descr[:90].strip())
53 |                 continue
54 |             # best effort sub '^something -- ' ===> 'realname - '
55 |             short_descr = re.sub(
56 |                 r'^.*\s+--\s+',
57 |                 '{} - '.format(args.inputfile.name),
58 |                 short_descr
59 |             )
60 |         else:
61 |             # all good
62 |             pass
63 | 
64 |         out += '\n' + short_descr
65 |     else:
66 |         out += line
67 | 
68 | if args.check_name_only:
69 |     sys.exit()
70 | 
71 | # write output
72 | if args.write_to_file:
73 |     args.inputfile.close()
74 |     outfile = open(args.inputfile.name, 'w')
75 | else:
76 |     outfile = sys.stdout
77 | 
78 | outfile.write(out)
79 | 


--------------------------------------------------------------------------------
/scripts/fixpod5:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | """
21 | Add SEE ALSO section to POD in perl scripts
22 | 
23 | Note: Makes assumptions about input, little error checking
24 | """
25 | import argparse
26 | import sys
27 | 
28 | 
29 | argp = argparse.ArgumentParser(description=__doc__)
30 | argp.add_argument('inputfile', type=argparse.FileType())
31 | argp.add_argument('-w', '--write-to-file', action='store_true')
32 | 
33 | args = argp.parse_args()
34 | 
35 | see_also = """
36 | =head1 SEE ALSO
37 | 
38 | L<omics(1)>, L<illumina-reads-processing(7)>
39 | 
40 | =head3 Other local resources
41 | 
42 | =over
43 | 
44 | =item 1
45 | 
46 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
47 | 
48 | =item 2
49 | 
50 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
51 | 
52 | =back
53 | 
54 | =head3 Web
55 | 
56 | =over
57 | 
58 | =item 1
59 | 
60 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
61 | 
62 | =item 2
63 | 
64 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
65 | 
66 | =item 3
67 | 
68 | L<Github repository|https://github.com/Geo-omics/scripts>
69 | 
70 | =back
71 | 
72 | """
73 | 
74 | out = ''
75 | done = False
76 | 
77 | for line in args.inputfile:
78 |     if line.startswith('=cut') and not done:
79 |         # ensure consistent empty lines between sections
80 |         out = out.rstrip() + '\n\n'
81 |         # add section at the end
82 |         out += see_also
83 |         done = True  # in case we find a second =cut
84 | 
85 |     out += line
86 | 
87 | # write output
88 | if args.write_to_file:
89 |     args.inputfile.close()
90 |     outfile = open(args.inputfile.name, 'w')
91 | else:
92 |     outfile = sys.stdout
93 | 
94 | outfile.write(out)
95 | 


--------------------------------------------------------------------------------
/scripts/fixpod6:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | """
21 | Improve SEE ALSO section to POD in perl scripts
22 | 
23 | Note: Makes assumptions about input, little error checking
24 | """
25 | import argparse
26 | import re
27 | import sys
28 | 
29 | 
30 | argp = argparse.ArgumentParser(description=__doc__)
31 | argp.add_argument('inputfile', type=argparse.FileType())
32 | argp.add_argument('-w', '--write-to-file', action='store_true')
33 | 
34 | args = argp.parse_args()
35 | 
36 | out = ''
37 | in_see_also = False
38 | ref_count = 1
39 | 
40 | for line in args.inputfile:
41 |     if not in_see_also and line.startswith('=head1 SEE ALSO'):
42 |         in_see_also = True
43 |     if in_see_also and line.startswith('=cut'):
44 |         in_see_also = False
45 | 
46 |     if in_see_also:
47 |         line = re.sub(r'^=head3', '=head2', line)
48 |         if line.startswith('=item'):
49 |             line = '=item [{}]\n'.format(ref_count)
50 |             ref_count += 1
51 | 
52 |     out += line
53 | 
54 | # write output
55 | if args.write_to_file:
56 |     args.inputfile.close()
57 |     outfile = open(args.inputfile.name, 'w')
58 | else:
59 |     outfile = sys.stdout
60 | 
61 | outfile.write(out)
62 | 


--------------------------------------------------------------------------------
/scripts/folderLevelSize:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | # This script will only look at folders and sub-folders of the present working directory. You'll HAVE TO paste this script to the other folder if you want its stats. Also make sure you have read permissions for the folders and sub folders before you run this script.
22 | 
23 | use strict;
24 | 
25 | # Usage: folderLevelSize [LEVEL]
26 | # Prints size and path, as reported by "du -h", of every directory that is
27 | # exactly LEVEL levels below the current working directory.
28 | 
29 | # LEVEL is the first argument; it falls back to 1 when it is missing or not
30 | # a non-negative integer.
31 | my $level=$ARGV[0];
32 | $level=1 unless (defined($level) && $level=~ /^\d+$/);
33 | # du reports paths as "./a/b", which split into one part more than the depth
34 | $level++;
35 | 
36 | # read du's output from a pipe, no temporary file needed
37 | open(my $du, "-|", "du", "-h") or die "[error] cannot run du: $!\n";
38 | while (my $line=<$du>){
39 | 	next if $line=~ /^#/;
40 | 	$line=~ s/\r//;
41 | 	chomp $line;
42 | 	next unless $line;
43 | 
44 | 	my($size, $path)=split(/\t/, $line);
45 | 	my @levels=split(/\//, $path);
46 | 
47 | 	print $size."\t".$path."\n" if (scalar(@levels) == $level);
48 | }
49 | close $du;
50 | 


--------------------------------------------------------------------------------
/scripts/gbk2fna:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | gbk2fna - Read Genbank to Nucleotide Fasta file.
 24 | 
 25 | 
 26 | =head1 SYNOPSIS
 27 | 
 28 | B<gbk2fna> B<-in> input.gbk B<-out> output.fna
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | Read Genbank to Nucleotide Fasta file.
 34 | 
 35 | 
 36 | =head1 OPTIONS
 37 | 
 38 | =over 8
 39 | 
 40 | =item B<-in> I<CHAR>
 41 | 
 42 | Genbank file to read
 43 | 
 44 | =item B<-version>, B<-v> I<BOOLEAN>
 45 | 
 46 | version of the current script
 47 | 
 48 | =item B<-help>, B<-h> I<BOOLEAN>
 49 | 
 50 | This message.
 51 | 
 52 | =back
 53 | 
 54 | 
 55 | =head1 AUTHOR
 56 | 
 57 | Sunit Jain, (Thu Oct  1 11:10:47 EDT 2015)
 58 | sunitj [AT] umich [DOT] edu
 59 | 
 60 | 
 61 | =head1 SEE ALSO
 62 | 
 63 | L<omics(1)>, L<illumina-reads-processing(7)>
 64 | 
 65 | =head2 Other local resources
 66 | 
 67 | =over
 68 | 
 69 | =item [1]
 70 | 
 71 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 72 | 
 73 | =item [2]
 74 | 
 75 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 76 | 
 77 | =back
 78 | 
 79 | =head2 Web
 80 | 
 81 | =over
 82 | 
 83 | =item [3]
 84 | 
 85 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 86 | 
 87 | =item [4]
 88 | 
 89 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 90 | 
 91 | =item [5]
 92 | 
 93 | L<Github repository|https://github.com/Geo-omics/scripts>
 94 | 
 95 | =back
 96 | 
 97 | =cut
 98 | 
 99 | use strict;
100 | use Getopt::Long;
101 | use FileHandle;
102 | use File::Basename;
103 | use Bio::SeqIO;
104 | 
105 | my $help;
106 | my $version=fileparse($0)."\tv0.0.1b";
107 | my $infile;
108 | my $outfile;
109 | GetOptions(
110 | 	'in:s'=>\$infile,
111 | 	'out:s'=>\$outfile,
112 | 	'v|version'=>sub{print $version."\n"; exit;},
113 | 	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
114 | );
115 | print "\# $version\n";
116 | 
117 | # Both file names are required; without them Bio::SeqIO would be asked to
118 | # open the bare modes "<" and ">".
119 | die "[error] No input file given, use -in <Genbank file>\n" unless (defined($infile) && length($infile));
120 | die "[error] No output file given, use -out <FASTA file>\n" unless (defined($outfile) && length($outfile));
121 | 
122 | # reader for the Genbank records
123 | my $seq_in = Bio::SeqIO->new(
124 | 	-file   => "<$infile",
125 | 	-format => "genbank",
126 | );
127 | # writer for the FASTA output
128 | my $seq_out = Bio::SeqIO->new(
129 | 	-file   => ">$outfile",
130 | 	-format => "fasta",
131 | );
132 | # copy the sequences over one by one
133 | while (my $inseq = $seq_in->next_seq) {
134 | 	$seq_out->write_seq($inseq);
135 | }
136 | 


--------------------------------------------------------------------------------
/scripts/genomeCheck:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | use strict;
22 | 
23 | # Usage: genomeCheck LIST
24 | # LIST names one FASTA file per line.  For every record that has a header
25 | # but no sequence, a line "F:<file>\tSN: <header>" is printed.
26 | 
27 | # Print all records of the FASTA file $fName that lack a sequence.
28 | sub checkForCompleteness{
29 | 	my $fName=shift;
30 | 	chomp($fName);
31 | 	open (CONTIGS, $fName) || die "Couldn't open $fName\n";
32 | 	# read ">"-terminated records; "local" puts the old separator back
33 | 	# as soon as the sub is left
34 | 	local $/= ">";
35 | 	my %sequences;
36 | 	while (my $record = <CONTIGS>) {
37 | 		chomp $record;
38 | 		next unless $record;
39 | 		my ($name, @sequence) = split (/\n/, $record);
40 | 		my $seq = join ("", @sequence);
41 | 		$sequences{$name} = uc $seq;
42 | 	}
43 | 	close CONTIGS;
44 | 
45 | 	# sorted, so that the report is the same on every run
46 | 	foreach my $n (sort keys %sequences){
47 | 		print "F:$fName\tSN: $n\n" unless (length($sequences{$n})>0);
48 | 	}
49 | 	return ();
50 | }
51 | 
52 | my $listOfFiles= $ARGV[0];
53 | die "Usage: $0 <list of FASTA files>\n" unless defined($listOfFiles);
54 | open (LOF, "$listOfFiles") || die "ERROR: $ARGV[0]\n $!\n";
55 | print "Summary for incomplete Genomes:\n";
56 | while (my $file=<LOF>){
57 | 	# strip line endings here and skip blank lines instead of dying on them
58 | 	$file=~ s/[\r\n]+$//;
59 | 	next unless length($file);
60 | 	checkForCompleteness($file);
61 | }
62 | close LOF;
63 | print "All Done!!\n";
64 | 


--------------------------------------------------------------------------------
/scripts/getGFF:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2014, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | getGFF - Given a list of contig names extract GFF data.
 24 | 
 25 | 
 26 | =head1 SYNOPSIS
 27 | 
 28 | B<getGFF> B<-list> contig_names.list B<-gff> annotated_metagenome.gff
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | Given a list of contig names extract GFF data.
 34 | 
 35 | 
 36 | =head1 OPTIONS
 37 | 
 38 | =over 8
 39 | 
 40 | =item B<-list> I<CHAR>
 41 | 
 42 | list of contigs
 43 | 
 44 | =item B<-gff> I<CHAR>
 45 | 
 46 | metagenome GFF file.
 47 | 
 48 | =item B<-col> I<INT>
 49 | 
 50 | Column number that contains the contig names; start count from 1. [ default = 1]
 51 | 
 52 | =item B<-version>, B<-v> I<BOOLEAN>
 53 | 
 54 | version of the current script
 55 | 
 56 | =item B<-help>, B<-h> I<BOOLEAN>
 57 | 
 58 | This message.
 59 | 
 60 | =back
 61 | 
 62 | 
 63 | =head1 AUTHOR
 64 | 
 65 | Sunit Jain, (Thu Jan  2 12:41:53 EST 2014)
 66 | sunitj [AT] umich [DOT] edu
 67 | 
 68 | 
 69 | =head1 SEE ALSO
 70 | 
 71 | L<omics(1)>, L<illumina-reads-processing(7)>
 72 | 
 73 | =head2 Other local resources
 74 | 
 75 | =over
 76 | 
 77 | =item [1]
 78 | 
 79 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 80 | 
 81 | =item [2]
 82 | 
 83 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 84 | 
 85 | =back
 86 | 
 87 | =head2 Web
 88 | 
 89 | =over
 90 | 
 91 | =item [3]
 92 | 
 93 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 94 | 
 95 | =item [4]
 96 | 
 97 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 98 | 
 99 | =item [5]
100 | 
101 | L<Github repository|https://github.com/Geo-omics/scripts>
102 | 
103 | =back
104 | 
105 | =cut
106 | 
107 | use strict;
108 | use Getopt::Long;
109 | use File::Basename;
110 | 
111 | my $help;
112 | my $version="getGFF\tv0.1.0";
113 | my ($list, $gff);
114 | my $col=1;
115 | GetOptions(
116 | 	'l|list:s'=>\$list,
117 | 	'g|gff:s'=>\$gff,
118 | 	'c|col:i'=>\$col,
119 | 	'v|version'=>sub{print $version."\n"; exit;},
120 | 	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
121 | );
122 | print "\# $version\n";
123 | 
124 | # both input files are mandatory
125 | die "[error] No contig list given, use -list <file>\n" unless (defined($list) && length($list));
126 | die "[error] No GFF file given, use -gff <file>\n" unless (defined($gff) && length($gff));
127 | 
128 | if ($col==0){ warn "Column 0, does not compute! Start your counts from 1\nAssuming you meant Column 1\n"; $col=1}
129 | 
130 | print $list;
131 | open(LIST, "<".$list)|| die $!;
132 | my %index;
133 | while(my $line=<LIST>){
134 | 	chomp $line;
135 | 	$line=~ s/\r$//;
136 | 	next unless length($line); # blank lines name no contig
137 | 	$index{uc($line)}++;
138 | }
139 | close LIST;
140 | 
141 | print "\t..";
142 | 
143 | # from here on $col is a 0-based array index
144 | $col--;
145 | # the output is named after the list file: <list without ".list">.gff
146 | my $out=fileparse($list, ".list");
147 | open(GFF, "<".$gff)|| die $!;
148 | open(OUT, ">".$out.".gff")|| die $!;
149 | while(my $line=<GFF>){
150 | 	chomp $line;
151 | 	my (@data)=split(/\t/, $line);
152 | 	my $contig=$data[$col];
153 | 	# lines that lack the column cannot belong to a listed contig
154 | 	next unless defined($contig);
155 | 	print OUT $line."\n" if ($index{uc($contig)});
156 | }
157 | close GFF;
158 | close OUT;
159 | print ".Done.\n";
160 | 


--------------------------------------------------------------------------------
/scripts/getGIAnnotation:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | # usage: getGIAnnotation <blastOutput> <database_type>
22 | # example: getGIAnnotation test.blastn nucleotide
23 | # Errors are reported on stderr with a non-zero exit status.
24 | if [ -z "$1" ]; then echo "command failed: Give a file name with list" >&2; exit 1; fi
25 | if [ ! -s "$1" ]; then echo "$1 does not exist" >&2; exit 1; fi
26 | 
27 | # collect the distinct values of the second "|"-separated field
28 | cut -d '|' -f 2 "$1" | sort -u > gi.list || exit
29 | 
30 | # each step needs the output of the one before, so stop at the first failure
31 | getGiInfo -d "$2" -o anno.xml -l gi.list || exit
32 | 
33 | GI_info_XMLParser anno.xml gi.desc
34 | 


--------------------------------------------------------------------------------
/scripts/getGISummary:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | use Bio::DB::EUtilities;
21 | use strict;
22 | 
23 | # Usage: getGISummary <id list> <output file>
24 | die "Usage: $0 <file with one id per line> <output file>\n" unless (scalar(@ARGV) >= 2);
25 | 
26 | my @ids;
27 | open (IN, $ARGV[0]) || die "[error] $ARGV[0] : $!\n";
28 | while (my $line=<IN>){
29 | 	next if $line=~ m/^#/;
30 | 	chomp $line;
31 | 	$line=~ s/\r//;
32 | 	next unless $line;
33 | 
34 | 	push(@ids, $line);
35 | }
36 | close IN;
37 | 
38 | my $factory = Bio::DB::EUtilities->new(-eutil => 'esummary',
39 |                                        -email => 'sunitj@umich.edu',
40 |                                        -db    => 'protein',
41 |                                        -id    => \@ids);
42 | 
43 | open (OUT, ">".$ARGV[1]) || die "[error] $ARGV[1] : $!\n";
44 | while (my $ds = $factory->next_DocSum) {
45 | 	# one tab-delimited row per document: the id, then name/content pairs
46 | 	my @row=($ds->get_id);
47 | 	# flattened mode
48 | 	while (my $item = $ds->next_Item('flattened'))  {
49 | 		# not all Items have content, so need to check...
50 | 		my $content= $item->get_content;
51 | 		next unless $content;
52 | 		push(@row, $item->get_name, $content);
53 | 	}
54 | 	# joining keeps a tab between consecutive pairs as well
55 | 	print OUT join("\t", @row)."\n";
56 | }
57 | close OUT;
58 | 


--------------------------------------------------------------------------------
/scripts/getMasterList:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | use strict;
21 | use Getopt::Long;
22 | 
23 | # Usage: getMasterList [-e EXT] [-o OUTFILE] [-c COLUMN]
24 | # Writes every distinct value of column COLUMN (counted from 1, default 1)
25 | # found in the tab-delimited files *.EXT (default: out) of the current
26 | # directory to OUTFILE (default: <pid>.list), one value per line, in order
27 | # of first appearance.
28 | 
29 | my $ext="out";
30 | my $out=$$.".list";
31 | my $col=1;
32 | my $bs=0; # set by -s, not used anywhere in this script
33 | GetOptions(
34 | 	'e:s'=>\$ext,
35 | 	'o:s'=>\$out,
36 | 	'c:i'=>\$col,
37 | 	's:f'=>\$bs,
38 | );
39 | 
40 | my @listOfFiles=glob("*.".$ext);
41 | print @listOfFiles." Filenames provided\n";
42 | 
43 | my $c= $col-1;
44 | my %masterList;
45 | open (OUT, ">".$out) || die "[error] $out: $!\n";
46 | foreach my $f(@listOfFiles){
47 | 	my $fh;
48 | 	open($fh, $f) || die "[error] $f: $!\n";
49 | 	while (my $line=<$fh>){
50 | 		next if ($line=~ m/^#/);
51 | 		chomp $line;
52 | 		$line=~ s/\r//g;
53 | 		next unless $line;
54 | 
55 | 		my @cols=split(/\t/, $line);
56 | 		my $value=$cols[$c];
57 | 		# lines with too few columns have nothing to contribute
58 | 		next unless defined($value);
59 | 		print OUT $value."\n" unless ($masterList{$value});
60 | 		$masterList{$value}++;
61 | 	}
62 | 	close $fh;
63 | }
64 | close OUT;
65 | exit;
66 | 


--------------------------------------------------------------------------------
/scripts/getMyContigs:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | # USAGE: perl getMyContigs.pl <CoverageOutputFromReadCoverage Script> <Your Contig Shortlist file> <OUTPUT.list>
22 | 
23 | use strict;
24 | 
25 | my $readCov=$ARGV[0];
26 | my $list=$ARGV[1];
27 | my $OUT=$ARGV[2];
28 | 
29 | die "Incorrect number of files input\nUSAGE: perl getMyContigs.pl <CoverageOutputFromReadCoverage Script> <Your Contig Shortlist file> <OUTPUT.list>" if (scalar(@ARGV) != 3);
30 | 
31 | # second "_"-separated part of every name on the shortlist
32 | open(LIST, $list)|| die "[error] $list: $!\n";
33 | my %LIST;
34 | while(my $line=<LIST>){
35 | 	chomp $line;
36 | 	next unless $line;
37 | 	next if $line=~ /^#/;
38 | #	NODE_14_length_2679_cov_8.406121
39 | 	my @headerParts=split(/\_/, $line);
40 | 	# names without a second "_"-separated part cannot be looked up
41 | 	next unless defined($headerParts[1]);
42 | 	$LIST{$headerParts[1]}++;
43 | }
44 | close LIST;
45 | 
46 | # distinct reads listed for the shortlisted contigs
47 | my %READS;
48 | open(READ, $readCov)|| die "[error] $readCov: $!\n";
49 | while(my $line=<READ>){
50 | 	next if $line=~ /^#/;
51 | 	chomp $line;
52 | 	next unless $line;
53 | 
54 | 	my ($contigName, $size, @reads)=split(/\t/, $line);
55 | 
56 | 	next unless $LIST{$contigName};
57 | 	foreach my $r(@reads){
58 | 		$READS{$r}++;
59 | 	}
60 | }
61 | close READ;
62 | undef %LIST;
63 | 
64 | print "Total # Reads Mapped to this bin:".keys(%READS)."\n";
65 | open(OUT, ">".$OUT)|| die $!;
66 | foreach my $r(keys %READS){
67 | 	print OUT $r."\n";
68 | }
69 | close OUT;
70 | 


--------------------------------------------------------------------------------
/scripts/kmerFreq:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | use strict;
 22 | use Getopt::Long;
 23 | 
 24 | ###################################
 25 | ## Parameters
 26 | ###################################
 27 | 
 28 | my $seqFile;
 29 | my $kmerFile=$$.".kmer.out";
 30 | my $kps;
 31 | my $isFastq;
 32 | my $k=4;
 33 | 
 34 | GetOptions(
 35 | 	'i|in:s'=>\$seqFile,
 36 | 	'k|kmer:i'=>\$k,
 37 | 	'o|out:s'=>\$kmerFile,
 38 | 	's|each_seq:s'=>\$kps,
 39 | 	'fq|fastq'=>\$isFastq,
 40 | );
 41 | 
 42 | ###################################
 43 | ## Main
 44 | ###################################
 45 | 
 46 | $/= $isFastq ? "@" : ">";
 47 | my %kmers;
 48 | my %kmersPerSeq;
 49 | 
 50 | open (SEQ, $seqFile)|| die "$! : $seqFile\n";
 51 | while(my $line=<SEQ>){
 52 | 	next if ($line=~ /^#/);
 53 | 	chomp $line;
 54 | 	$line=~ s/ //;
 55 | 	next unless $line;
 56 | 
 57 | 	$isFastq ? &parseFastq($line) : &parseFasta($line);
 58 | }
 59 | close SEQ;
 60 | 
 61 | open (SUMM, ">".$kmerFile);
 62 | my %seen;
 63 | my @kmerArray;
 64 | while(my($kmer, $count)=each(%kmers)){
 65 | 	next if $seen{$kmer};
 66 | 	
 67 | 	my $rc_kmer=&rev_comp($kmer);
 68 | 	
 69 | 	# if rev_comp exists, combine counts and mark with '*'.
 70 | 	my $totalCount= $kmers{$rc_kmer} ? ($kmers{$kmer} + $kmers{$rc_kmer})."\t\*" : $kmers{$kmer} ;
 71 | 
 72 | 	# Print to Output file
 73 | 	print SUMM $kmer."\t".$totalCount."\n";
 74 | 	push(@kmerArray, $kmer);
 75 | 	$seen{$kmer}++;
 76 | 	$seen{$rc_kmer}++;
 77 | }
 78 | close SUMM;
 79 | unlink %kmers;
 80 | unlink %seen;
 81 | 
 82 | if($kps){
 83 | 	open (OUT, ">".$kps);
 84 | 	print OUT "#SeqNames.\t";
 85 | 	foreach my $km(@kmerArray){ print OUT $km."\t"; }
 86 | 	print OUT "\n";
 87 | 
 88 | 	foreach my $desc(keys %kmersPerSeq){
 89 | 		print OUT $desc."\t";
 90 | 		foreach my $km(@kmerArray){
 91 | 			my $rc_km=&rev_comp($km);
 92 | 			my $totalCount= $kmersPerSeq{$desc}{$km} + $kmersPerSeq{$desc}{$rc_km};
 93 | 			$totalCount = $totalCount ? $totalCount : 0;
 94 | 			print OUT $totalCount."\t";
 95 | 		}
 96 | 		print OUT "\n";
 97 | 	}
 98 | 	close OUT;
 99 | }
100 | unlink %kmersPerSeq;
101 | exit;
102 | 
103 | ###################################
104 | ## Sub-Routines
105 | ###################################
106 | 
sub parseFastq{
	# Take one FASTQ record (as delimited by the caller's record
	# separator), pick out its header and sequence lines and pass them to
	# getKmers.  The quality lines are ignored.
	my ($record)=@_;
	my @rows=split(/\n/, $record);
	my $header=$rows[0];
	my $bases=$rows[1];
	$bases=~ s/ //g;
	&getKmers($bases, $header);
	return;
}
114 | 
sub parseFasta{
	# Take one FASTA record: the first line is the description, all
	# remaining lines are joined into a single sequence, spaces are
	# removed, and the result is passed to getKmers.
	my ($record)=@_;
	my @rows=split(/\n/, $record);
	my $header=shift(@rows);
	my $bases=join("", @rows);
	$bases=~ s/ //g;
	&getKmers($bases, $header);
	return;
}
123 | 
sub getKmers{
	# Slide a window of length $k over the upper-cased sequence and count
	# every window, both in the overall tally (%kmers) and in the tally of
	# the sequence named by the second argument (%kmersPerSeq).
	my ($bases, $header)=@_;
	$bases=uc($bases);
	# Last start position of a full-length window; negative when the
	# sequence is shorter than $k, in which case nothing is counted.
	my $lastStart=length($bases) - $k;

	foreach my $start(0 .. $lastStart){
		my $word=substr($bases, $start, $k);
		$kmers{$word}++;
		$kmersPerSeq{$header}{$word}++;
	}
	return;
}
137 | 
sub rev_comp{
	# Return the reverse complement of a nucleotide string, upper-cased.
	# Only A, T, G, C and N are translated; every other character is kept
	# as it is (but still ends up in reversed position).
	my ($nt)=@_;
	my $rc=scalar reverse(uc($nt));
	$rc=~ tr/ATGCN/TACGN/;
	return $rc;
}
145 | 


--------------------------------------------------------------------------------
/scripts/length+GC:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | =head1 NAME
 21 | 
 22 | length+GC - extract length and GC content from a fasta file
 23 | 
 24 | 
 25 | =head1 SYNOPSIS
 26 | 
 27 | length+GC.pl -f input.fasta
 28 | 
 29 | 
 30 | =head1 DESCRIPTION
 31 | 
 32 | This program takes a fasta file, extracts length and %GC information (if '-gc' is specified)
 33 | 
 34 | 
 35 | =head1 OPTIONS
 36 | 
 37 | =over 8
 38 | 
 39 | =item B<-gc>
 40 | 
 41 | Calculate GC(%) content.
 42 | 
 43 | =item B<-len>
 44 | 
 45 | Calculate for sequences above a certain length only.
 46 | 
 47 | =back
 48 | 
 49 | 
 50 | =head1 AUTHOR
 51 | 
 52 | Sunit Jain
 53 | 
 54 | 
 55 | =head1 SEE ALSO
 56 | 
 57 | L<omics(1)>, L<illumina-reads-processing(7)>
 58 | 
 59 | =head2 Other local resources
 60 | 
 61 | =over
 62 | 
 63 | =item [1]
 64 | 
 65 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 66 | 
 67 | =item [2]
 68 | 
 69 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 70 | 
 71 | =back
 72 | 
 73 | =head2 Web
 74 | 
 75 | =over
 76 | 
 77 | =item [3]
 78 | 
 79 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 80 | 
 81 | =item [4]
 82 | 
 83 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 84 | 
 85 | =item [5]
 86 | 
 87 | L<Github repository|https://github.com/Geo-omics/scripts>
 88 | 
 89 | =back
 90 | 
 91 | =cut
 92 | 
 93 | 
 94 | use strict;
 95 | use Getopt::Long;
 96 | use FileHandle;
 97 | 
 98 | my $calcGC;
 99 | my $fasta;
100 | my $minLen=1;
101 | my $version="0.1.1";
102 | 
103 | GetOptions(
104 | 	"gc"=>\$calcGC,
105 | 	"f:s"=>\$fasta,
106 | 	"len:i"=>\$minLen,
107 | 	"v|version"=>\$version,
108 | 	"h|help"=>sub{system('perldoc', $0); exit;},
109 | );
110 | 
111 | &help if ! $fasta;
112 | 
113 | my $CONTIGS=FileHandle->new();
114 | open ($CONTIGS, "<",$fasta) || die "Couldn't open $fasta\n";
115 | $/= ">";
116 | my (%sequences, @names);
117 | while (my $b = <$CONTIGS>) {
118 |     chomp $b;
119 |     next unless $b;
120 |     my ($name, @sequence) = split (/\n/, $b);
121 |     my $seq = join ("", @sequence);
122 |     my $length = length($seq);
123 | 	if($length < $minLen){
124 | 	    print STDERR "[WARNING: Length_less_than_minimum]\t".$name."\t".$length."\n";
125 | 	    next;
126 | 	}
127 | 
128 | 	unless ($calcGC){
129 | 		print "$name\t$length\n" ;
130 | 	}
131 | 	else{
132 | 		my ($g, $c);
133 | 		$seq=uc($seq);
134 | 	    while ( $seq =~ /G/ig ) { $g++ }
135 | 	    while ( $seq =~ /C/ig ) { $c++ }
136 | 
137 | 		my $GC = (($g+$c)/$length)*100;
138 | 		my $printGC = sprintf( "%.4f", $GC);
139 | 		print "$name\t$printGC\t$length\n";
140 | 	}
141 | }
142 | close $CONTIGS;
143 | 
144 | sub help{
145 | 	system('perldoc', $0);
146 | 	exit;
147 | }
148 | 
149 | exit;
150 | 
151 | 
152 | 


--------------------------------------------------------------------------------
/scripts/map_project_names:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | map_project_names - Map IMG project names to your own. Creates Symbolic links with your project names to extracted IMG tar balls.
 24 | 
 25 | 
 26 | =head1 SYNOPSIS
 27 | 
 28 | B<map_project_names>
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | Map IMG project names to your own. Creates Symbolic links with your project names to extracted IMG tar balls.
 34 | 
 35 | 
 36 | =head1 OPTIONS
 37 | 
 38 | =over 8
 39 | 
 40 | =item B<-version>, B<-v> I<BOOLEAN>
 41 | 
 42 | version of the current script
 43 | 
 44 | =item B<-help>, B<-h> I<BOOLEAN>
 45 | 
 46 | This message.
 47 | 
 48 | =back
 49 | 
 50 | 
 51 | =head1 AUTHOR
 52 | 
 53 | Sunit Jain, (Mon Feb 23 12:55:40 EST 2015)
 54 | sunitj [AT] umich [DOT] edu
 55 | 
 56 | 
 57 | =head1 SEE ALSO
 58 | 
 59 | L<omics(1)>, L<illumina-reads-processing(7)>
 60 | 
 61 | =head2 Other local resources
 62 | 
 63 | =over
 64 | 
 65 | =item [1]
 66 | 
 67 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 68 | 
 69 | =item [2]
 70 | 
 71 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 72 | 
 73 | =back
 74 | 
 75 | =head2 Web
 76 | 
 77 | =over
 78 | 
 79 | =item [3]
 80 | 
 81 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 82 | 
 83 | =item [4]
 84 | 
 85 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 86 | 
 87 | =item [5]
 88 | 
 89 | L<Github repository|https://github.com/Geo-omics/scripts>
 90 | 
 91 | =back
 92 | 
 93 | =cut
 94 | 
 95 | use strict;
 96 | use Getopt::Long;
 97 | use FileHandle;
 98 | use File::Basename;
 99 | 
# For every data row of the map file, create a symbolic link named after
# the second column (your sample name) that points at the directory named
# by the first column (the IMG name).

# File::Spec is used below; load it explicitly instead of relying on
# another module to have pulled it in.
use File::Spec;

my $version=fileparse($0)."\tv0.0.1b";
my $mapFile="names_map.txt";   # -m : tab-separated map file
my $path2col1="./";            # -p1: directory holding the column-1 names
my $outPath="./";              # -o : directory in which the links are created

GetOptions(
        'm|map:s'=>\$mapFile,
        'p1|path2col1:s'=>\$path2col1,
        'o|outdir:s'=>\$outPath,
	'v|version'=>sub{print $version."\n"; exit;},
	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
);
print "\# $version\n";

open(my $MAP, "<", $mapFile) or die "Cannot open map file $mapFile: $!\n";
while(my $line=<$MAP>){
    chomp $line;
    # Only rows that start with a digit are data rows.
    next unless ($line=~/^\d+/);
    my($imgName, $sampleNum)=split(/\t/, $line);
    unless (defined $sampleNum && length $sampleNum){
        warn "Skipping row without a sample name: $line\n";
        next;
    }
    my $oldPath=File::Spec->catdir($path2col1,$imgName);
    my $newPath=File::Spec->catdir($outPath,$sampleNum);
    # NOTE(review): a relative $oldPath is stored verbatim in the link and
    # is resolved relative to the link's own directory -- confirm this is
    # intended when -o is not the current directory.
    symlink($oldPath, $newPath)
        or warn "Could not link $newPath -> $oldPath: $!\n";
}
close $MAP;
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/scripts/mapper_getQueryList:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | mapper_getQueryList - get subset of query list from mapper script
 24 | 
 25 | =head1 SYNOPSIS
 26 | 
 27 | B<mapper_getQueryList> B<-log> I<log file> B<-list> I<list of Queries of interest> B<-out> I<output>
 28 | 
 29 | 
 30 | =head1 MOTIVATION
 31 | 
 32 | Get a subset of query list from the mapper script
 33 | 
 34 | 
 35 | =head1 AUTHOR
 36 | 
 37 | Sunit Jain, July 2013
 38 | 
 39 | 
 40 | =head1 SEE ALSO
 41 | 
 42 | L<omics(1)>, L<illumina-reads-processing(7)>
 43 | 
 44 | =head2 Other local resources
 45 | 
 46 | =over
 47 | 
 48 | =item [1]
 49 | 
 50 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 51 | 
 52 | =item [2]
 53 | 
 54 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 55 | 
 56 | =back
 57 | 
 58 | =head2 Web
 59 | 
 60 | =over
 61 | 
 62 | =item [3]
 63 | 
 64 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 65 | 
 66 | =item [4]
 67 | 
 68 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 69 | 
 70 | =item [5]
 71 | 
 72 | L<Github repository|https://github.com/Geo-omics/scripts>
 73 | 
 74 | =back
 75 | 
 76 | =cut
 77 | 
 78 | use strict; 
 79 | use Getopt::Long;
 80 | 
 81 | my ($logFile,$list);
 82 | my $version="0.3.1";
 83 | my $compatible="0.3.0 +";
 84 | my $out=$$.".list";
 85 | 
 86 | GetOptions(
 87 | 	'log=s'=>\$logFile,
 88 | 	'list=s'=>\$list,
 89 | 	'o|out:s'=>\$out,
 90 | 	'v|version'=>sub{print $version."\n"."Compatible with mapper script version $compatible"; exit;},
 91 | 	'h|help'=>sub{system('perldoc', $0); exit;},
 92 | );
 93 | 
 94 | my %index;
 95 | open(LIST, "<".$list)|| die $!;
 96 | while(my $line=<LIST>){
 97 | 	next if ($line=~ /^#/);
 98 | 	chomp($line);
 99 | 	$line=~ s/\r//;
100 | 	next unless $line;
101 | 	
102 | 	$index{$line}++;
103 | }
104 | close LIST;
105 | 
106 | open (LOG, "<".$logFile)|| die $!;
107 | open (OUT, ">".$out)|| die $!;
108 | while(my $line=<LOG>){
109 | 	next if ($line=~ /^#/);
110 | 	chomp($line);
111 | 	$line=~ s/\r//;
112 | 	next unless $line;
113 | 	
114 | 	my ($subj, @queries)=split(/\t/, $line);
115 | 	next unless $index{$subj};
116 | 
117 | 	print OUT "#".$subj."\n";
118 | 	foreach my $q(@queries){
119 | 		print OUT $q."\n";
120 | 	}
121 | }
122 | close LOG;
123 | close OUT;
124 | 


--------------------------------------------------------------------------------
/scripts/match-dada2-mothur:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # Copyright 2020 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | """
 21 | Match final DADA2 sequences to sequences in mothur pre.cluster map files
 22 | """
 23 | import argparse
 24 | from pathlib import Path
 25 | 
 26 | from Bio import pairwise2
 27 | 
 28 | 
 29 | argp = argparse.ArgumentParser(description=__doc__)
 30 | argp.add_argument(
 31 |     'pre_cluster_map',
 32 |     metavar='PRECLUSTER_MAP',
 33 |     type=argparse.FileType(),
 34 |     help='map output file from mothur\'s per.cluster command',
 35 | )
 36 | argp.add_argument(
 37 |     'dada2_seqtab',
 38 |     metavar='DADA2_SEQTAB',
 39 |     type=argparse.FileType(),
 40 |     help='DADA2 sequence table, saved with write.table and tab delimited',
 41 | )
 42 | argp.add_argument('--version', action='version', version='%(prog)s '
 43 |                   'is part of geo-omics-scripts VERSION_PLACEHOLDER')
 44 | args = argp.parse_args()
 45 | 
 46 | # output file names
 47 | map_o = Path(Path(args.pre_cluster_map.name).name)
 48 | map_o = map_o.with_suffix('.dada2' + map_o.suffixes[-1])
 49 | fa_o = Path(Path(args.dada2_seqtab.name).name).with_suffix('.fa')
 50 | 
 51 | # read map file
 52 | map_data = []
 53 | map_header = args.pre_cluster_map.readline()
 54 | for line in args.pre_cluster_map:
 55 |     line = line.strip()
 56 |     map_data.append(line.split('\t'))
 57 | 
 58 | # read seqtab file
 59 | seqtab = []
 60 | seqs = args.dada2_seqtab.readline()
 61 | seqs = seqs.strip().split('\t')
 62 | seqs = [i.replace('"', '') for i in seqs]
 63 | for line in args.dada2_seqtab:
 64 |     line = line.strip()
 65 |     seqtab.append(line.split('\t'))
 66 | 
 67 | # assign ids to sequences
 68 | seqs = {'dada2_{}'.format(i): s for i, s in enumerate(seqs)}
 69 | 
 70 | 
 71 | # implement sequence matching
 72 | def match_seq(query_seq, seqs):
 73 |     """
 74 |     Match a mothur-aligned query seq to those in seqs
 75 | 
 76 |     Return id of closest match
 77 |     """
 78 |     # rm gaps
 79 |     q = query_seq.replace('-', '')
 80 |     high_score = None
 81 |     best_hit = None
 82 |     for id, seq in seqs.items():
 83 |         score = pairwise2.align.globalms(
 84 |             q, seq,
 85 |             1, -1, -1, -1,
 86 |             score_only=True
 87 |         )
 88 |         if high_score is None or score > high_score:
 89 |             high_score = score
 90 |             best_hit = id
 91 | 
 92 |     return best_hit
 93 | 
 94 | 
 95 | # write output map
 96 | with map_o.open('w') as f:
 97 |     map_header = map_header.split('\t')
 98 |     map_header.insert(4, 'dada2')
 99 |     f.write('\t'.join(map_header))
100 |     for row in map_data:
101 |         match = match_seq(row[4], seqs)
102 |         if match is None:
103 |             match = ''
104 |         row.insert(4, match)
105 |         f.write('\t'.join(row) + '\n')
106 | 
107 | print('map output written to: {}'.format(map_o))
108 | 
109 | # write fasta file
110 | with fa_o.open('w') as f:
111 |     for id, seq in seqs.items():
112 |         f.write('>{}\n{}\n'.format(id, seq))
113 | 
114 | print('Fasta output written to: {}'.format(fa_o))
115 | 


--------------------------------------------------------------------------------
/scripts/matchQueryNames:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | matchQueryNames - Pair up aliases from two tab-separated metadata files that share a metadata value.
 24 | 
 25 | 
 26 | =head1 SYNOPSIS
 27 | 
 28 | B<matchQueryNames>
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
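 33 | Reads two tab-separated files whose first column is an alias and whose remaining columns are metadata values. Every metadata value that is listed for exactly two aliases yields one output line with those two aliases, prefixed "1_" or "2_" according to the file they came from. The number of values listed for more than two aliases is printed as "gt_2".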
 34 | 
 35 | 
 36 | =head1 OPTIONS
 37 | 
 38 | =over 8
 39 | 
 40 | =item B<-version>, B<-v> I<BOOLEAN>
 41 | 
 42 | version of the current script
 43 | 
 44 | =item B<-help>, B<-h> I<BOOLEAN>
 45 | 
 46 | This message.
 47 | 
 48 | =back
 49 | 
 50 | 
 51 | =head1 AUTHOR
 52 | 
 53 | Sunit Jain, (Fri Jul 18 14:46:20 EDT 2014)
 54 | sunitj [AT] umich [DOT] edu
 55 | 
 56 | 
 57 | =head1 SEE ALSO
 58 | 
 59 | L<omics(1)>, L<illumina-reads-processing(7)>
 60 | 
 61 | =head2 Other local resources
 62 | 
 63 | =over
 64 | 
 65 | =item [1]
 66 | 
 67 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 68 | 
 69 | =item [2]
 70 | 
 71 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 72 | 
 73 | =back
 74 | 
 75 | =head2 Web
 76 | 
 77 | =over
 78 | 
 79 | =item [3]
 80 | 
 81 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 82 | 
 83 | =item [4]
 84 | 
 85 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 86 | 
 87 | =item [5]
 88 | 
 89 | L<Github repository|https://github.com/Geo-omics/scripts>
 90 | 
 91 | =back
 92 | 
 93 | =cut
 94 | 
 95 | use strict;
 96 | use Getopt::Long;
 97 | 
 98 | my ($meta1, $meta2, $out);
 99 | my $help;
100 | my $version="matchQueryNames.pl\tv0.0.1b";
101 | GetOptions(
102 | 	'1|meta1:s'=>\$meta1,
103 | 	'2|meta2:s'=>\$meta2,
104 | 	'o|out:s'=>\$out,
105 | 	'v|version'=>sub{print $version."\n"; exit;},
106 | 	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
107 | );
108 | print "\# $version\n";
109 | my %metaDataIndex;
110 | print $meta1."\n";
111 | open(META1, "<".$meta1)|| die $!;
112 | while(my $line=<META1>){
113 | 	next if ($line=~/^#/);
114 | 	chomp $line;
115 | 	next unless $line;
116 | 	match("1",$line);
117 | }
118 | close META1;
119 | 
120 | open(META2, "<".$meta2)|| die $!;
121 | while(my $line=<META2>){
122 | 	next if ($line=~/^#/);
123 | 	chomp $line;
124 | 	next unless $line;
125 | 	match("2",$line);
126 | }
127 | close META2;
128 | 
129 | open(OUT, ">".$out)||die $!;
130 | print OUT $meta1."\t".$meta2."\n";
131 | my $gt_2=0;
132 | my %seen;
133 | foreach my $meta(keys %metaDataIndex){
134 | 	if (@{$metaDataIndex{$meta}}>2){
135 | 		$gt_2++;
136 | 	}
137 | 	elsif(@{$metaDataIndex{$meta}}==2){
138 | 		my $line;
139 | 		foreach my $name(@{$metaDataIndex{$meta}}){
140 | 			$line.=$name."\t";
141 | 		}
142 | 		$line=~ s/\t$/\n/;
143 | 		next if $seen{$line};
144 | 		print OUT $line;
145 | 		$seen{$line}++;
146 | 	}
147 | }
148 | close OUT;
149 | print "gt_2 = ".$gt_2."\n";
150 | 
sub match{
	# Register one metadata line: the first tab-separated field is the
	# alias, every further field is a metadata value under which
	# "<prefix>_<alias>" is appended in %metaDataIndex.
	my ($prefix, $line)=@_;

	my @fields=split(/\t/,$line);
	my $tagged=$prefix."_".shift(@fields);
	push(@{$metaDataIndex{$_}},$tagged) foreach @fields;
}
160 | 
161 | 


--------------------------------------------------------------------------------
/scripts/nameClassFiles:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | nameClassFiles - Rename class files according to a tab-separated list of old/original names to new/more sensible names.
 24 | 
 25 | 
 26 | =head1 SYNOPSIS
 27 | 
 28 | B<nameClassFiles> B<-tsv> old_and_new_filenames.tsv B<-ext> fasta
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
 33 | Rename class files according to a tab-separated list of old/original names to new/more sensible names.
 34 | 
 35 | 
 36 | =head1 OPTIONS
 37 | 
 38 | =over 8
 39 | 
 40 | =item B<-tsv> I<CHAR>
 41 | 
 42 | col1=old name; <TAB> col2=new name
 43 | 
 44 | =item B<-out> I<CHAR>
 45 | 
 46 | Output Folder [default: "Renamed"]
 47 | 
 48 | =item B<-ext> I<CHAR>
 49 | 
 50 | extensions for the old and new names if the tsv doesn't already have them.
 51 | 
 52 | =item B<-version>, B<-v> I<BOOLEAN>
 53 | 
 54 | version of the current script
 55 | 
 56 | =item B<-help>, B<-h> I<BOOLEAN>
 57 | 
 58 | This message.
 59 | 
 60 | =back
 61 | 
 62 | 
 63 | =head1 AUTHOR
 64 | 
 65 | Sunit Jain, (Mon Feb 10 14:26:12 EST 2014)
 66 | sunitj [AT] umich [DOT] edu
 67 | 
 68 | 
 69 | =head1 SEE ALSO
 70 | 
 71 | L<omics(1)>, L<illumina-reads-processing(7)>
 72 | 
 73 | =head2 Other local resources
 74 | 
 75 | =over
 76 | 
 77 | =item [1]
 78 | 
 79 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 80 | 
 81 | =item [2]
 82 | 
 83 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 84 | 
 85 | =back
 86 | 
 87 | =head2 Web
 88 | 
 89 | =over
 90 | 
 91 | =item [3]
 92 | 
 93 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 94 | 
 95 | =item [4]
 96 | 
 97 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 98 | 
 99 | =item [5]
100 | 
101 | L<Github repository|https://github.com/Geo-omics/scripts>
102 | 
103 | =back
104 | 
105 | =cut
106 | 
107 | use strict;
108 | use Getopt::Long;
109 | use File::Spec;
110 | use File::Copy "cp";
111 | 
# Copy files into an output folder under new names, as listed in a
# tab-separated file (column 1: old name, column 2: new name).

my ($tsv);             # -tsv/-list: file with old and new names
my $out="Renamed";     # -out: output folder
my $ext="fasta";       # -ext: extension appended to both names
my $version="nameClassFiles.pl\tv0.0.2b";
GetOptions(
	'list|tsv:s'=>\$tsv,
	'out:s'=>\$out,
	'ext:s'=>\$ext,
	'v|version'=>sub{print $version."\n"; exit;},
	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
);
print "\# $version\n";

die "No list of old and new names given (use -tsv)\n" unless (defined $tsv && length $tsv);

unless (-e $out){
	mkdir($out, 0755) or die "Cannot create output folder $out: $!\n";
}

# old file name => new file name
my %tracker;
open(my $TSV, "<", $tsv) or die "Cannot open $tsv: $!\n";
while(my $line=<$TSV>){
	next if ($line=~ /^#/);
	chomp $line;
	# Drop a trailing carriage return so it does not end up inside the
	# new file name.
	$line=~ s/\r$//;
	next unless $line;

	my($old, $new)=split(/\t/, $line);
	# Without both names the row would be copied to a file called just
	# ".<ext>".
	unless (defined $old && length $old && defined $new && length $new){
		warn "Skipping row without both an old and a new name: $line\n";
		next;
	}
	if ($ext){
		$old.=".".$ext;
		$new.=".".$ext;
	}
	$tracker{$old}=$new;
}
close $TSV;

my $failed=0;
foreach my $file(keys %tracker){
	my $new=File::Spec->catfile( $out, $tracker{$file} );
	print "Creating:\t$new\n";
	unless (cp($file, $new)){
		warn "Could not copy $file to $new: $!\n";
		$failed++;
	}
}
# Let callers see that at least one copy did not succeed.
exit($failed ? 1 : 0);
149 | 


--------------------------------------------------------------------------------
/scripts/oasesPaired_pipe:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | #velveth assembly_61 61 -fastq -short derep_trimmed_day_fwd.fastq
22 | #velvetg assembly_61 -read_trkg yes
23 | #oases ~/Velvet_Assembly &
24 | 
25 | 
# Run velveth, velvetg and oases on one pair of fastq files.
#
# Arguments: <k-mer length> <forward fastq> <reverse fastq>
# Outputs:   assembly directory assembly_paired_<k>, log file
#            assembly_paired_<k>.log, usageStats_K<k>.tsv
# Returns:   0 on success, 1 on wrong usage, pre-existing output or when
#            one of the stages fails

module load AMOS/3.1.0-rc1
module load velvet/1.1.07-MAX99-OPENMP
module load oases/0.2.01

# Report a problem on stderr and stop with a failure status.
die() {
	echo "$*" >&2
	exit 1
}

if (( $# != 3 )); then # if num of arguments not equal to 3
	echo "USAGE: $0 <k-mer Length> <Forward Fastq File> <Reverse Fastq File>" >&2
	echo "Yes, it has to be in that exact order!" >&2
	exit 1
fi

KMER=$1
# Everything after the k-mer length, kept as an array so that file names
# containing spaces survive.
FASTQ=("${@:2}")

INTERVAL=120  # Change this number to modify the time interval (in seconds) of the usageStats script.

echo "K-mer Length=		$KMER"
echo "FastQ Files=	${FASTQ[*]}"

OUTDIR="assembly_paired_$KMER"
# Refuse to overwrite the results of an earlier run.
if [[ -d "$OUTDIR" ]]; then
	die "$OUTDIR already exists!"
fi

BANK="bank_paired_$KMER"
if [[ -d "$BANK" ]]; then
	die "$BANK already exists!"
fi
LOG="$OUTDIR.log"
STATS="usageStats_K$KMER.tsv"

# NOTE(review): the monitor is started in the background and never waited
# for or stopped here -- presumably it terminates on its own; confirm.
perl /geomicro/data1/COMMON/scripts/usageStats.pl -i "$INTERVAL" -o "$STATS" -e &

# Each stage appends to the log and the run stops at the first stage that
# fails, since the later stages depend on the earlier output.
echo "************************************* VELVETH *******************************************************" > "$LOG"
echo >> "$LOG"
velveth "$OUTDIR" "$KMER" -fastq -shortPaired "${FASTQ[@]}" >> "$LOG" \
	|| die "velveth failed, see $LOG"
echo >> "$LOG"
echo "************************************* VELVETG *******************************************************" >> "$LOG"
echo >> "$LOG"
velvetg "$OUTDIR" -read_trkg yes >> "$LOG" \
	|| die "velvetg failed, see $LOG"
echo >> "$LOG"
echo "************************************* OASES **************************************************" >> "$LOG"
echo >> "$LOG"
oases "$OUTDIR" >> "$LOG" \
	|| die "oases failed, see $LOG"
echo >> "$LOG"


exit 0
78 | 


--------------------------------------------------------------------------------
/scripts/omics:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Thin launcher: the command implementation is main() in omics.__main__.
from omics.__main__ import main


# Called unconditionally; this script has no __name__ guard.
main()
24 | 


--------------------------------------------------------------------------------
/scripts/omics-container:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Retired command: point the user at the replacement and report failure.
printf '%s\n' "omics-container is deprecated, use comics instead"
exit 1
22 | 


--------------------------------------------------------------------------------
/scripts/omics-init:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Thin launcher: the command implementation is main() in omics.init.
from omics import init


# Called unconditionally; this script has no __name__ guard.
init.main()
24 | 


--------------------------------------------------------------------------------
/scripts/omics-prep:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Thin launcher: the command implementation is main() in omics.prep.
from omics import prep


# Called unconditionally; this script has no __name__ guard.
prep.main()
24 | 


--------------------------------------------------------------------------------
/scripts/omics-qc:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2014, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Thin launcher: the command implementation is main() in omics.qc.
from omics import qc


# Called unconditionally; this script has no __name__ guard.
qc.main()
24 | 


--------------------------------------------------------------------------------
/scripts/omics-qc-check:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Thin launcher: the command implementation is main() in omics.qc_check.
from omics import qc_check


# Called unconditionally; this script has no __name__ guard.
qc_check.main()
24 | 


--------------------------------------------------------------------------------
/scripts/omics-run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Retired command: point the user at the replacement and report failure.
printf '%s\n' "omics-run is deprecated, use comics instead"
exit 1
22 | 


--------------------------------------------------------------------------------
/scripts/parseBlastXML:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | use strict;
 21 | use Bio::SearchIO; 
 22 | use Getopt::Long;
 23 | 
 24 | =head1 NAME
 25 | 
 26 | parseBlastXML - parse and print BlastXML results
 27 | 
 28 | 
 29 | =head1 SYNOPSIS
 30 | 
 31 | B<parseBlastXML> -b <blastXML file>
 32 | 
 33 | 
 34 | =head1 DESCRIPTION
 35 | 
Parse BlastXML results and print the query, subject, %ID, e-value and bit score of every HSP as a tab-separated table. Contact the author if you need additional columns.
 37 | 
 38 | 
 39 | =head1 OPTIONAL
 40 | 
-o output file name; default processID.tsv
 42 | -s minimum bit score; default 0
 43 | 
 44 | =head4 NOTE:
 45 | 
 46 | To use this script Bioperl MUST be installed.
 47 | For our lab, here is how you can access Bioperl for this script:
 48 | 
 49 |  1) Make sure you're on Cayman.
 50 |  2) type "/opt/package/perl5/5.12.2/bin/perl" instead of just "perl" while running the script. example:
 51 | 
 52 |    /opt/package/perl5/5.12.2/bin/perl parseBlastXML.pl -b <blastXML file>
 53 | 
 54 | 
 55 | =head1 AUTHOR
 56 | 
 57 |  Sunit Jain, July 2011
 58 |  sunitj [AT] umich [DOT] edu
 59 | 
 60 | 
 61 | =head1 SEE ALSO
 62 | 
 63 | L<omics(1)>, L<illumina-reads-processing(7)>
 64 | 
 65 | =head2 Other local resources
 66 | 
 67 | =over
 68 | 
 69 | =item [1]
 70 | 
 71 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 72 | 
 73 | =item [2]
 74 | 
 75 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 76 | 
 77 | =back
 78 | 
 79 | =head2 Web
 80 | 
 81 | =over
 82 | 
 83 | =item [3]
 84 | 
 85 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 86 | 
 87 | =item [4]
 88 | 
 89 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 90 | 
 91 | =item [5]
 92 | 
 93 | L<Github repository|https://github.com/Geo-omics/scripts>
 94 | 
 95 | =back
 96 | 
 97 | =cut
 98 | 
 99 | my $blastXml;
100 | my $tab=$$.".tsv";
101 | my $bs=0;
102 | GetOptions(
103 | 	'b:s'=>\$blastXml,
104 | 	'o:s'=>\$tab,
105 | 	's:f'=>\$bs,
106 | );
107 | 
108 | open (OUT, ">".$tab);
109 | print OUT "#File:$blastXml\tMin_BitScore:$bs\n";
110 | print OUT "#Query\tHit\t\%ID\tEvalue\tBitScore\n";
111 | chomp($blastXml);
112 | 
113 | my $in = new Bio::SearchIO(-format => 'blastxml', -file=> $blastXml);
114 | while( my $result = $in->next_result ) {
115 |   ## $result is a Bio::Search::Result::ResultI compliant object
116 | 	while( my $hit = $result->next_hit ) {
117 | 		## $hit is a Bio::Search::Hit::HitI compliant object
118 | 		while( my $hsp = $hit->next_hsp ) {
119 | 		## $hsp is a Bio::Search::HSP::HSPI compliant object
120 | 			if($hsp->score >= $bs){
121 | 				print OUT $result->query_name,
122 | 				"\t", $hit->name,
123 | 				"\t", $hsp->percent_identity,
124 | 				"\t", $hsp->evalue,
125 | 				"\t", $hsp->score,"\n";
126 | 			}
127 | 		}
128 | 	}
129 | }
130 | 


--------------------------------------------------------------------------------
/scripts/parseTinySeqXML.xslt:
--------------------------------------------------------------------------------
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="text"/>
<!--
  Plain-text export: for every TSeq element whose TSeq_seqtype child has
  value="protein", write one line with TSeq_gi, TSeq_taxid and
  TSeq_sequence separated by tab characters.
-->
<xsl:template match="/">
<xsl:for-each select="//TSeq[TSeq_seqtype/@value='protein']">
	<xsl:value-of select="TSeq_gi"/>
        <!-- &#x9; is a tab, the column separator -->
        <xsl:text>&#x9;</xsl:text>
        <xsl:value-of select="TSeq_taxid"/>
        <xsl:text>&#x9;</xsl:text>
        <xsl:value-of select="TSeq_sequence"/>
        <!-- &#10; is a line feed, ending the record -->
        <xsl:text>&#10;</xsl:text>
</xsl:for-each>
</xsl:template>
</xsl:stylesheet>
14 | 


--------------------------------------------------------------------------------
/scripts/patchBlastLineage:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | use strict;
22 | 
23 | # $ARGV[0]; Blast output.
24 | # $ARGV[1]; name of hit (y/n).
25 | 
# Replace the subject column of a tabular blast output by "<taxon>|<id>"
# (plus "|<name>" when the second argument is 'y').  The gi -> taxon map is
# read from "l_<blast output>.txt"; the result goes to
# "taxaBlast_<blast output>".
die "Usage: $0 <blast output> [y|n]\n"
	unless (defined $ARGV[0] && length $ARGV[0]);

my $fName= $ARGV[0];
my $lFile= "l_".$fName.".txt";
my $outFile= "taxaBlast_".$fName;
# The second argument is optional; anything but y/Y means "no name".
my $withName= (defined $ARGV[1] && lc($ARGV[1]) eq 'y');

# Lineage table: first column is the gi, second the taxon; further columns
# are ignored.
my %index;
open (my $lf, '<', $lFile) || die "[err] $lFile not found\n".$!."\n";
while (my $desc=<$lf>){
	chomp($desc);
	my($gi, $taxa)=split(/\t/, $desc);
	next unless (defined $gi && length $gi);
	$index{$gi}=$taxa;
}
close $lf;

open (my $outFh, '>', $outFile) || die "[err] Cannot write to $outFile\n".$!."\n";
open (my $bo, '<', $fName) || die "[err] $fName not found\n".$!."\n";
my $unknown=0;
while(my $line=<$bo>){
	next if ($line=~ m/^\#/);
	next unless ($line=~ m/\S/);	# blank lines carry no hit
	chomp($line);
	my @blast=split(/\t/, $line);

	# The subject is expected to look like "gi|<gi>|<db>|<name>".
	my $subject= defined $blast[1] ? $blast[1] : '';
	my($giTag, $gi, $id, $name)=split(/\|/, $subject);

	# Hits without a lineage entry keep an empty taxon, as before, but
	# are counted so the user learns about them.
	my $taxa= (defined $gi && defined $index{$gi}) ? $index{$gi} : '';
	$unknown++ unless length $taxa;

	$blast[1]=$taxa."\|".(defined $id ? $id : '');
	$blast[1].="\|".(defined $name ? $name : '') if $withName;
	print $outFh join("\t", @blast)."\n";
}
close $bo;
close($outFh) || die "[err] Failed to finish writing $outFile\n".$!."\n";

print STDERR "[warn] $unknown hit(s) had no entry in $lFile\n" if $unknown;
54 | 


--------------------------------------------------------------------------------
/scripts/refseq-rna:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | #
21 | #
22 | # Download RefSeq rna sequences and create blast db
23 | #
24 | #
set -eE
# Settings read by the sourced script library.
# shellcheck disable=SC2034
{
USAGE="[OPTIONS]..."
HELP="download archaeal and bacterial NCBI RefSeq RNA sequences and build BLAST DBs"
ARG_HELP="
"
CHECK_PROGS="makeblastdb"
}
# VERBOSITY, WORK_DIR, MKDIR, RM and LN used below are not set in this
# script; presumably liba.sh provides them -- TODO confirm.
# shellcheck disable=SC1090
. "$(dirname "$0")/../share/geo-omics-scripts/liba.sh" || { echo "Failed to source script library" >&2; exit 1; }

URL=ftp://ftp.ncbi.nih.gov/refseq/release

# Keep the curl command as an array so it does not depend on word
# splitting of a string.
if [ "$VERBOSITY" -ge 2 ]; then
    CURL=(curl)
else
    CURL=(curl --silent)
fi

if [[ -z ${REFSEQ_RELEASE:-} ]]; then
    echo -n "Getting current RefSeq release number... "
    refseq_release=$("${CURL[@]}" "$URL/RELEASE_NUMBER")
    archive_part=
    echo "$refseq_release"
else
    refseq_release=$REFSEQ_RELEASE
    # assume given release number is from archive, not the current one
    archive_part=archive/
fi
# The release number becomes part of directory names and URLs, so accept
# digits only; an unanchored match would let any text containing a digit
# (e.g. an error page) through.
refseq_release=${refseq_release//[[:space:]]/}
if [[ ! $refseq_release =~ ^[0-9]+$ ]]; then
    echo "Failed to get RefSeq release number, got: ${refseq_release}" >&2
    exit 1
fi

out_dir="$WORK_DIR/refseq$refseq_release"
# $MKDIR is left unquoted on purpose: it may hold a command plus options.
$MKDIR "$out_dir" || { echo "Directory already exists: $out_dir" >&2; exit 1; }

log=$out_dir/refseq-rna.log

# The list of installed files names every archive of the release; pick the
# archaeal and bacterial RNA archives and append each one, decompressed, to
# a single fasta file per kingdom.
files_installed=$URL/release-catalog/${archive_part}release${refseq_release}.files.installed
curl --silent "$files_installed" | grep -o -P '(archaea|bacteria)\.[0-9]+\.rna\.fna\.gz' | sort -n |
    while read -r i; do
        echo -n "Getting $i... "
        kingdom=${i%%.*}
        url=$URL/$kingdom/$i
        out=$out_dir/$kingdom.rna.fasta
        "${CURL[@]}" "$url" | gunzip -c >> "$out"
        echo "Appending $url to $out" >> "$log"
        echo "done"
    done

for i in archaea bacteria; do
    echo -n "Making blast db for $i... "
    makeblastdb -in "$out_dir/$i.rna.fasta" -dbtype nucl >> "$log"
    echo "done"
done

echo -n "Finishing... "
# no need to ever touch these files again
chmod -w "$out_dir"/*

# Only the current release (no REFSEQ_RELEASE given) becomes "latest".
# NOTE(review): the link is created relative to the current directory while
# out_dir lives under $WORK_DIR -- confirm both are the same place.
if [[ -z ${REFSEQ_RELEASE:-} ]]; then
    $RM -f latest && $LN -s "$(basename "$out_dir")" latest
fi
echo "done"
88 | 


--------------------------------------------------------------------------------
/scripts/removeBlastSubj:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | use strict;
22 | 
# Split a tabular blast output in two: rows whose subject (second column)
# appears in a list file go to <pid>.QueriesFromList.out, all other rows to
# <pid>.QueriesFromListRemoved.out.  Subjects are compared case-insensitively
# and with blanks removed.
my $in=$ARGV[0];
my $blastOut= $ARGV[1];
die "Usage: $0 <list file> <tabular blast output>\n"
	unless (defined $in && defined $blastOut);

my $out= $$.".QueriesFromListRemoved.out";
my $list= $$.".QueriesFromList.out";

my %exclude;
open(my $listFh, '<', $in) || die "[err] Cannot read $in\n".$!."\n";
while (my $line=<$listFh>){
	next if ($line=~ m/^#/);
	chomp ($line);
	$line=~ s/ //g;
	$line=~ s/\r//g;
	# length, not truth: an entry named "0" is a valid name
	next unless (length $line);
	$exclude{lc($line)}++;
}
close $listFh;

# Number of distinct names read from the list.
print scalar(keys(%exclude))."\n";

open(my $blastFh, '<', $blastOut) || die "[err] Cannot read $blastOut\n".$!."\n";
open(my $keptFh, '>', $out) || die "[err] Cannot write to $out\n".$!."\n";
open(my $matchFh, '>', $list) || die "[err] Cannot write to $list\n".$!."\n";
my $count=0;
while (my $line= <$blastFh>){
	next if ($line=~ m/^#/);
	chomp ($line);
	next unless (length $line);

	my ($query, $subj)=split(/\t/, $line);
	# Rows without a second column cannot match and are kept.
	$subj='' unless (defined $subj);
	$subj=~ s/ //g;
	$subj=lc($subj);

	if ($exclude{$subj}){
		$count++;
		print $matchFh $line."\n";
	}
	else{
		print $keptFh $line."\n";
	}
}
print "Matches Found:".$count."\n";
close $blastFh;
close($keptFh) || die "[err] Failed to finish writing $out\n".$!."\n";
close($matchFh) || die "[err] Failed to finish writing $list\n".$!."\n";
68 | 


--------------------------------------------------------------------------------
/scripts/removeCommentLines:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | 
21 | use strict;
22 | 
# Copy a text file, dropping lines that start with '#' and empty lines and
# removing a carriage return from each remaining line.
my $in=$ARGV[0];
my $out=$ARGV[1];
die "Usage: $0 <input file> <output file>\n"
	unless (defined $in && defined $out);

open(my $inFh, '<', $in) || die "[err] Cannot read $in\n".$!."\n";
open(my $outFh, '>', $out) || die "[err] Cannot write to $out\n".$!."\n";

while(my $line=<$inFh>){
	next if $line=~ /^#/;
	chomp $line;
	$line=~ s/\r//;
	# Test the length, not the truth value: a line holding just "0" is
	# data and must be kept.
	next unless length $line;

	print $outFh $line."\n";
}
close $inFh;
close($outFh) || die "[err] Failed to finish writing $out\n".$!."\n";
39 | 


--------------------------------------------------------------------------------
/scripts/remove_space_from_filenames:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
#######################################
# Replace every space by an underscore in the names of all files and
# directories below the current directory.
# Arguments: none
# Outputs:   one "Renaming ..." line per renamed path on stdout,
#            warnings on stderr
# Returns:   0 if every path was renamed, 1 otherwise
#######################################
rename_spaced_paths() {
  local path parent name target status=0
  # -depth reports the content of a directory before the directory itself,
  # so a path is still valid when its turn comes; -print0 together with
  # "IFS= read -r -d ''" keeps blanks, backslashes and newlines intact.
  while IFS= read -r -d '' path; do
    parent=${path%/*}
    name=${path##*/}
    # Only the last component is changed; parents are handled later.
    target="$parent/${name// /_}"
    if [[ -e "$target" || -L "$target" ]]; then
      printf "Not renaming '%s': '%s' already exists\n" "$path" "$target" >&2
      status=1
      continue
    fi
    printf "Renaming '%s' to '%s'\n" "$path" "$target"
    mv -- "$path" "$target" || status=1
  done < <(find . -depth -name '* *' -print0)
  return "$status"
}

rename_spaced_paths
26 | 


--------------------------------------------------------------------------------
/scripts/reverse_complement:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
reverse_complement - write the reverse complement of every sequence in a fasta file
 24 | 
 25 | 
 26 | =head1 SYNOPSIS
 27 | 
 28 | B<reverse_complement> B<-fasta> input.fasta B<-out> output.fasta
 29 | 
 30 | 
 31 | =head1 DESCRIPTION
 32 | 
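Read a (multi-)fasta file and write each sequence reverse-complemented, in upper case and on a single line, under its original header.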
 34 | 
 35 | 
 36 | =head1 OPTIONS
 37 | 
 38 | =over 8
 39 | 
 40 | =item B<-fasta>, B<-f> I<CHAR>
 41 | 
 42 | Fasta file
 43 | 
 44 | =item B<-out>, B<-o> I<CHAR>
 45 | 
 46 | Output file name
 47 | 
 48 | =item B<-version>, B<-v> I<BOOLEAN>
 49 | 
 50 | version of the current script
 51 | 
 52 | =item B<-help>, B<-h> I<BOOLEAN>
 53 | 
 54 | This message.
 55 | 
 56 | =back
 57 | 
 58 | 
 59 | =head1 AUTHOR
 60 | 
 61 | Sunit Jain, (Tue Jan 20 11:00:25 EST 2015)
 62 | sunitj [AT] umich [DOT] edu
 63 | 
 64 | 
 65 | =head1 SEE ALSO
 66 | 
 67 | L<omics(1)>, L<illumina-reads-processing(7)>
 68 | 
 69 | =head2 Other local resources
 70 | 
 71 | =over
 72 | 
 73 | =item [1]
 74 | 
 75 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 76 | 
 77 | =item [2]
 78 | 
 79 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 80 | 
 81 | =back
 82 | 
 83 | =head2 Web
 84 | 
 85 | =over
 86 | 
 87 | =item [3]
 88 | 
 89 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 90 | 
 91 | =item [4]
 92 | 
 93 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 94 | 
 95 | =item [5]
 96 | 
 97 | L<Github repository|https://github.com/Geo-omics/scripts>
 98 | 
 99 | =back
100 | 
101 | =cut
102 | 
use strict;
use Getopt::Long;

my $version="reverse_complement.pl\tv0.0.1b";
my ($fasta, $out);
GetOptions(
	'f|fasta:s'=>\$fasta,
	'o|out:s'=>\$out,
	'v|version'=>sub{print $version."\n"; exit;},
	'h|help'=>sub{system("perldoc $0 \| cat"); exit;},
);
print "\# $version\n";

# Both file names are needed; say so instead of failing in open().
die "[err] Usage: $0 -fasta input.fasta -out output.fasta\n"
	unless (defined $fasta && length $fasta && defined $out && length $out);

open(my $fastaFh, '<', $fasta) || die "[err] Cannot read $fasta\n".$!."\n";
open(my $outFh, '>', $out) || die "[err] Cannot write to $out\n".$!."\n";
{
	# Read one fasta record per iteration: records are separated by '>'.
	local $/=">";
	while(my $line=<$fastaFh>){
		chomp $line;	# removes the trailing '>'
		# Drop carriage returns so they do not end up inside the
		# reversed sequence.
		$line=~ s/\r//g;
		next unless length $line;

		my($header, @seq)=split(/\n/, $line);
		my $rcSeq=reverseComplement(join("",@seq));

		print $outFh ">".$header."\n".$rcSeq."\n";
	}
}
close $fastaFh;
close($outFh) || die "[err] Failed to finish writing $out\n".$!."\n";

# Return the upper-cased reverse complement of a nucleotide string.
# Besides A/C/G/T the IUPAC ambiguity codes are complemented (R<->Y, K<->M,
# B<->V, D<->H) and U is read as T; S, W, N and any other character stay as
# they are.
sub reverseComplement{
	my $seq=shift;
	chomp $seq;
	my $rSeq=uc(reverse($seq));
	$rSeq=~ tr/ACGTURYKMBDHV/TGCAAYRMKVHDB/;
	return $rSeq;
}
141 | 


--------------------------------------------------------------------------------
/scripts/rgi-setup:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2019 Regents of The University of Michigan.
 3 | 
 4 | # This file is part of geo-omics-scripts.
 5 | 
 6 | # Geo-omics-scripts is free software: you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as published
 8 | # by the Free Software Foundation, either version 3 of the License, or (at
 9 | # your option) any later version.
10 | 
11 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | 
16 | # You should have received a copy of the GNU General Public License along
17 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
18 | 
19 | ###########################################
# This script loads the CARD reference data
# and prepares a directory from within which
# you can run `rgi main`.
# The commands were adapted from the README
# of the RGI software.
25 | ###########################################
set -euo pipefail

trap 'echo "error at line $LINENO, exit status $?"' ERR
card=./card.json

# Load CARD reference data

if [[ ! -e $card ]]; then
    # -O makes wget write to "data" even if a stale file of that name is
    # lying around; otherwise it would save to data.1 and the stale file
    # would get unpacked.
    wget -O data https://card.mcmaster.ca/latest/data
    tar -xvf data "$card"
    rm data
fi

rgi load --card_json "$card" --local
rgi card_annotation -i "$card" > card_annotation.log 2>&1

# Derive the database version from the name of the annotation file.  Each
# candidate is matched on its own with an anchored pattern, so additional
# files sharing the prefix are ignored.
# NOTE(review): assumes versions are dotted numbers such as 3.2.6 -- confirm.
versions=()
for fasta in card_database_v*.fasta; do
    if [[ $fasta =~ ^card_database_v([0-9]+(\.[0-9]+)*)\.fasta$ ]]; then
        versions+=("${BASH_REMATCH[1]}")
    fi
done
if (( ${#versions[@]} != 1 )); then
    echo "[ERROR] expected exactly one card_database_v<version>.fasta, found ${#versions[@]}" >&2
    exit 1
fi
version=${versions[0]}
echo "[INFO] version parsed: $version"
rgi load -i "$card" --card_annotation card_database_v"$version".fasta --local

wget -O wildcard_data.tar.bz2 https://card.mcmaster.ca/latest/variants
mkdir -p wildcard
tar -xvf wildcard_data.tar.bz2 -C wildcard
rm wildcard_data.tar.bz2
gunzip wildcard/*.gz

rgi wildcard_annotation -i wildcard --card_json "$card" -v "$version" > wildcard_annotation.log 2>&1
rgi load --wildcard_annotation wildcard_database_v"$version".fasta --wildcard_index wildcard/index-for-model-sequences.txt --card_annotation card_database_v"$version".fasta --local

# check db version
rgi database --version --local
58 | 


--------------------------------------------------------------------------------
/scripts/sangerSeqParser:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | sangerSeqParser - sanger seq parser
 24 | 
 25 | =head1 SYNOPSIS
 26 | 
 27 | B<sangerSeqParser> B<-p> I<Folder_Path>
 28 | 
 29 | 
 30 | =head1 OPTIONS
 31 | 
 32 | =over 8
 33 | 
 34 | =item B<-o> I<file>
 35 | 
 36 | output file name; default= processID.fasta
 37 | 
 38 | =item B<-e> I<suffix>
 39 | 
 40 | file extension to look for in folder; default= fasta
 41 | 
 42 | =item B<-h>
 43 | 
 44 | this page.
 45 | 
 46 | =back
 47 | 
 48 | 
 49 | =head1 Suggestions/Corrections/Feedback/Beer
 50 | 
 51 | Sunit Jain, sunitj@umich.edu
 52 | 
 53 | 
 54 | =head1 SEE ALSO
 55 | 
 56 | L<omics(1)>, L<illumina-reads-processing(7)>
 57 | 
 58 | =head2 Other local resources
 59 | 
 60 | =over
 61 | 
 62 | =item [1]
 63 | 
 64 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 65 | 
 66 | =item [2]
 67 | 
 68 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 69 | 
 70 | =back
 71 | 
 72 | =head2 Web
 73 | 
 74 | =over
 75 | 
 76 | =item [3]
 77 | 
 78 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 79 | 
 80 | =item [4]
 81 | 
 82 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 83 | 
 84 | =item [5]
 85 | 
 86 | L<Github repository|https://github.com/Geo-omics/scripts>
 87 | 
 88 | =back
 89 | 
 90 | =cut
 91 | 
 92 | use strict;
 93 | use Getopt::Long;
 94 | use File::Basename;
 95 | 
 96 | my $path; # Folder path
 97 | my $ext="fasta";
 98 | my $out;
 99 | 
100 | GetOptions(
101 | 	'p:s'=>\$path,
102 | 	'o:s'=>\$out,
103 | 	'e:s'=>\$ext,
104 | 	'h|help'=>sub{system('perldoc', $0); exit;},
105 | );
106 | 
107 | $path= `pwd` if !$path;
108 | chomp $path;
109 | $out=$$.".".$ext if !$out;
110 | 
111 | my @files=<$path/*.$ext>;
112 | 
113 | open(FASTA, ">".$out) || die $!;
114 | 
115 | foreach my $f(@files){
116 | 	my $fhIN;
117 | 	
118 | 	open($fhIN, $f) || die $!;
119 | 	my @sequence;
120 | 	while(my $line=<$fhIN>){
121 | 		chomp $line;
122 | 		$line=~ s/\r//;
123 | 		next unless $line;
124 | 		push(@sequence, $line);
125 | 	}
126 | 	close($fhIN);
127 | 	my $seq= join("", @sequence);
128 | 	
129 | 	my $nuHead=fileparse($f);
130 | 
131 | 	print FASTA ">".$nuHead."\n".$seq."\n";
132 | }
133 | close (FASTA);
134 | 


--------------------------------------------------------------------------------
/scripts/setup_metapathways:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Copyright 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
# Link the MetaPathways resource directories into the current directory and
# copy the setup templates next to them.
for resource in blastDB executables; do
    ln -s "/opt/packages/MetaPathways/1.0/$resource"
done
cp /geomicro/data1/COMMON/src/MetaPathways/setup/template_* .
23 | 


--------------------------------------------------------------------------------
/scripts/shared-filter-abundance:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # Copyright 2020 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | """
 21 | Remove OTUs with small relative abundance from a mothur shared file.
 22 | 
For each sample, for OTUs less abundant than the threshold, the count is set
 24 | to zero for that sample.  OTUs that consequently are all zero in the whole data
 25 | set will be removed completely.
 26 | """
 27 | import argparse
 28 | from pathlib import Path
 29 | 
 30 | from omics.shared import MothurShared
 31 | 
 32 | 
 33 | DEFAULT_FILTER_CUTOFF = 1000
 34 | 
 35 | 
 36 | def per_sample_cut_off(row):
 37 |     """
 38 |     Set read counts below cut-off to zero
 39 | 
 40 |     To be passed to DataFrame.apply
 41 |     """
 42 |     cutoff = size[row.name] / args.filter_cut_off
 43 |     row[row < cutoff] = 0
 44 |     sh.counts.loc[row.name] = row
 45 | 
 46 | 
 47 | def is_all_zero(col):
 48 |     """
 49 |     Return name of an "all zero count" OTU
 50 | 
 51 |     To be passed to DataFrame.apply
 52 |     """
 53 |     for i in col:
 54 |         if i > 0:
 55 |             return None
 56 |     return col.name
 57 | 
 58 | 
 59 | argp = argparse.ArgumentParser(description=__doc__)
 60 | argp.add_argument(
 61 |     'shared_file',
 62 |     type=argparse.FileType(),
 63 |     help='Input data, a mothur shared file.'
 64 | )
 65 | argp.add_argument(
 66 |     '-t', '--threads',
 67 |     type=int,
 68 |     default=1,
 69 |     help='Number of threads to use for parallizable steps',
 70 | )
 71 | argp.add_argument(
 72 |     '-f', '--filter-cut-off',
 73 |     type=int,
 74 |     default=DEFAULT_FILTER_CUTOFF,
 75 |     help='Set OTUs count with relative abundance of less than one in this many'
 76 |          'sequences in a sample to zero.  The default is {0}, i.e. counts of '
 77 |          'less than 1/{0} of the sample size are set to zero.'
 78 |          ''.format(DEFAULT_FILTER_CUTOFF),
 79 | )
 80 | argp.add_argument('--version', action='version', version='%(prog)s '
 81 |                   'is part of geo-omics-scripts VERSION_PLACEHOLDER')
 82 | args = argp.parse_args()
 83 | 
 84 | sh = MothurShared(args.shared_file, threads=args.threads)
 85 | old_sizes = sh.sample_sizes.copy()
 86 | size = dict(zip(sh.samples, sh.sample_sizes))
 87 | 
 88 | sh.counts.apply(per_sample_cut_off, axis=1)
 89 | zero_otus = [i for i in sh.counts.apply(is_all_zero, axis=0) if i is not None]
 90 | sh.remove_otus(zero_otus)
 91 | 
 92 | diffs = old_sizes - sh.sample_sizes
 93 | print('reads removed from dataset: {} ({:%})'
 94 |       ''.format(diffs.sum(), diffs.sum() / old_sizes.sum()))
 95 | max_sample = diffs.idxmax()
 96 | print('max removed in a sample: {} {:%}'
 97 |       ''.format(max_sample, diffs[max_sample] / old_sizes[max_sample]))
 98 | print('Low-abundance OTUs removed: {} ({:%})'
 99 |       ''.format(
100 |           len(zero_otus),
101 |           len(zero_otus) / (len(zero_otus) + len(sh.otus))
102 |       ))
103 | 
104 | outfile = Path(Path(args.shared_file.name).name).with_suffix(
105 |     '.f{}.shared'.format(args.filter_cut_off)
106 | )
107 | sh.save(outfile)
108 | 


--------------------------------------------------------------------------------
/scripts/shared-set-accessions:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # Copyright 2020 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | """
 21 | Set new OTU numbering scheme
 22 | 
 23 | This can be used in the ASV-producing pipeline.  An existing ASV->sequence
 24 | assignment can be supplied.  Any new sequence will be assigned a new ASV
 25 | number.
 26 | 
 27 | This script is part of the Schmidt Lab Mothur pipeline.
 28 | """
 29 | import argparse
 30 | from pathlib import Path
 31 | 
 32 | from omics.shared import MothurShared
 33 | 
 34 | DEFAULT_PREFIX = 'ASV'
 35 | 
 36 | 
 37 | argp = argparse.ArgumentParser(description=__doc__)
 38 | argp.add_argument(
 39 |     'shared',
 40 |     type=argparse.FileType(),
 41 |     help='Mothur shared file',
 42 | )
 43 | argp.add_argument(
 44 |     '-p', '--prefix',
 45 |     default=DEFAULT_PREFIX,
 46 |     help='Numbering scheme prefix, default is ' + DEFAULT_PREFIX + '. This '
 47 |          'option has no effect if the --with-map option is used',
 48 | )
 49 | argp.add_argument(
 50 |     '-z', '--leading-zeros',
 51 |     action='store_true',
 52 |     help='Add leading zeros to small numbers',
 53 | )
 54 | argp.add_argument(
 55 |     '-s', '--start-with',
 56 |     type=int,
 57 |     default=1,
 58 |     help='Accession number to start with.  This should be one larger than the '
 59 |          'largest existing ASV number',
 60 | )
 61 | argp.add_argument(
 62 |     '--with-map',
 63 |     type=argparse.FileType(),
 64 |     help='Use the mapping in given two-column tab-separated text file'
 65 | )
 66 | argp.add_argument(
 67 |     '--save-map',
 68 |     type=argparse.FileType('w'),
 69 |     help='Save old->new OTU accession mapping in two-column tab-separated text'
 70 |          ' file under given name',
 71 | )
 72 | argp.add_argument(
 73 |     '-o', '--output',
 74 |     default=None,
 75 |     help='Name of output shared file, by default a name will be generated '
 76 |          'based on the input file name',
 77 | )
 78 | argp.add_argument(
 79 |     '-t', '--threads',
 80 |     type=int,
 81 |     default=1,
 82 |     help='Number of threads',
 83 | )
 84 | argp.add_argument('--version', action='version', version='%(prog)s '
 85 |                   'is part of geo-omics-scripts VERSION_PLACEHOLDER')
 86 | args = argp.parse_args()
 87 | if args.output is None:
 88 |     output = Path(Path(args.shared.name).name).with_suffix('.accs.shared')
 89 | else:
 90 |     output = args.output
 91 | 
 92 | if args.with_map is None:
 93 |     acc_map = None
 94 | else:
 95 |     acc_map = {}
 96 |     for line in args.with_map:
 97 |         old, new = line.strip().split('\t')
 98 |         acc_map[old] = new
 99 | 
100 | sh = MothurShared(args.shared, threads=args.threads)
101 | acc_map = sh.set_accessions(
102 |     with_map=acc_map,
103 |     prefix=args.prefix,
104 |     leading_zeros=args.leading_zeros,
105 |     first=args.start_with,
106 | )
107 | if args.save_map is not None:
108 |     for k, v in acc_map.items():
109 |         args.save_map.write('{}\t{}\n'.format(k, v))
110 |     args.save_map.close()
111 |     sh.info('Accession map saved as', args.save_map.name)
112 | sh.save(str(output))
113 | sh.info('New shared file saved as:', output)
114 | 


--------------------------------------------------------------------------------
/scripts/silva-db:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019, 2022 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
20 | #
21 | #
22 | # Download SILVA SSU/LSU rRNA data and create blast db
23 | #
24 | #
set -eE
# Settings for the sourced script library (presumably read by liba.sh when it
# builds usage/help output and checks for required programs -- the library is
# not part of this file).
# shellcheck disable=SC2034
{
USAGE="[OPTIONS]..."
HELP="download SILVA SSU/LSU rRNA sequences and build BLAST DBs"
ARG_HELP="
"
CHECK_PROGS="makeblastdb"
}
# shellcheck disable=SC1090
. "$(dirname "$0")/../share/geo-omics-scripts/liba.sh" || { echo "Failed to source script library" >&2; exit 1; }

# VERBOSITY, WORK_DIR, MKDIR, GUNZIP, RM and LN are not set in this script;
# they are expected to come from the library sourced above.  The command
# variables are expanded unquoted on purpose as they may carry options.

URL=ftp://ftp.arb-silva.de

# The curl command line is kept in an array so it never depends on word
# splitting.  In quiet mode --show-error keeps curl's error messages visible
# although the progress meter is suppressed.
if [ "$VERBOSITY" -ge 2 ]; then
    CURL=(curl)
else
    CURL=(curl --silent --show-error)
fi

echo -n "Getting current SILVA db release number... "
# slash after url makes curl list directory in complex format, -> indicates a symlink
release=$("${CURL[@]}" "$URL/" | grep -o "current.*" | sed -r 's/current -> release_//')
if ! [[ $release =~ [0-9]+ ]]; then
    # terminate the unfinished progress line before reporting the error
    echo
    echo "Failed to get current SILVA db release number, got: ${release}" >&2
    exit 1
fi
# for point releases, replace _ by .
release=${release/_/.}
echo "$release"

out_dir="$WORK_DIR/release_$release"
$MKDIR "$out_dir" || { echo "Directory already exists: $out_dir" >&2; exit 1; }

log=$out_dir/silva-db.log

# Download and unpack inside a subshell so the working directory of the rest
# of the script is left unchanged.
(
    cd "$out_dir"

    echo -n "Downloading SILVA SSU/LSU db... "
    # {SSU,LSU} is quoted on purpose: it is expanded by curl itself (URL
    # globbing), each file is stored under its remote name (-O)
    "${CURL[@]}" -O "$URL/current/Exports/SILVA_${release}_{SSU,LSU}Ref_tax_silva.fasta.gz"
    echo "done"

    echo -n "Decompressing... "
    # here the brace expression is unquoted and expanded by bash
    $GUNZIP -- SILVA_"${release}"_{SSU,LSU}Ref_tax_silva.fasta.gz
    echo "done"
)

for i in SSU LSU; do
    echo -n "Making $i BLAST db... "
    makeblastdb -in "$out_dir/SILVA_${release}_${i}Ref_tax_silva.fasta" -dbtype nucl >> "$log"
    echo "done"
done

echo -n "Finishing... "
# no need to ever touch these files again
chmod -R -w "$out_dir"

# NOTE(review): the "latest" link is created in the current directory but
# points at the bare directory name, so it only resolves if the current
# directory is $WORK_DIR -- confirm that the library takes care of that.
$RM -f latest && $LN -s "$(basename "$out_dir")" latest
echo "done"
82 | 


--------------------------------------------------------------------------------
/scripts/tally:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | use strict;
 21 | use Getopt::Long;
 22 | 
 23 | my $in;
 24 | my $master;
 25 | my $out=$$.".out";
 26 | my $bs=0;
 27 | my $printValues; # which column?
 28 | my $fasta;
 29 | GetOptions(
 30 | 	'i:s'=>\$in,
 31 | 	'm:s'=>\$master,
 32 | 	'o:s'=>\$out,
 33 | 	's:f'=>\$bs,
 34 | 	'f|fasta:s'=>\$fasta,
 35 | 	'v|values|value:i'=>\$printValues,
 36 | );
 37 | 
 38 | my %seqLen;
 39 | if($fasta){
 40 | 	$/=">";
 41 | 	open(FASTA, "<".$fasta)|| die $!;
 42 | 	while(my $line=<FASTA>){
 43 | 		chomp $line;
 44 | 		next unless $line;
 45 | 		my($header, @s)=split(/\n/, $line);
 46 | 		my $seq=join("",@s);
 47 | 		$header=~ s/^>//;
 48 | 		$seqLen{$header}=length($seq);
 49 | 	}
 50 | 	close(FASTA);
 51 | 	$/="\n";
 52 | }
 53 | 
 54 | $printValues--;
 55 | open(IN, $in)|| die "[Error] $in: $!\n";
 56 | my %seen;
 57 | while (my $line=<IN>){
 58 | 	next if ($line=~ m/^#/);
 59 | 	chomp $line;
 60 | 	next unless $line;
 61 | 	$line=~ s/\r//g;
 62 | 
 63 | 	my @lineParts=split(/\t/, $line);
 64 | 
 65 | 	if(($fasta) && ($seqLen{$lineParts[0]})){
 66 | 		# Alternative to previous condition; don't need it if query 100% identical with same start and stop positions.
 67 | 		my($subjStart,$subjStop)=sort{$a <=> $b} ($lineParts[8],$lineParts[9]);
 68 | 		next if(($lineParts[6]==$lineParts[8]) && ($lineParts[7]==$lineParts[9]) && ($lineParts[2]==100) && ($subjStop==$seqLen{$lineParts[0]}));
 69 | 	}
 70 | 	else{
 71 | 		next if ($lineParts[0] eq $lineParts[1]); # Don't want query and subj to be the same
 72 | 	}
 73 | 	
 74 | 	next if($seen{$lineParts[0]}); # Only need the top hit.
 75 | 	next if ($lineParts[-1] < $bs); # Don't need anything with a bitscore less than user provided BS.
 76 | 
 77 | 	if(! $printValues){
 78 | 		$seen{$lineParts[0]}++;
 79 | 	}
 80 | 	else{
 81 | 		$seen{$lineParts[0]}=$lineParts[$printValues];
 82 | 	}
 83 | }
 84 | close IN;
 85 | 
 86 | open(MASTER, $master) || die "[Error] $master: $!\n";
 87 | open(OUT, ">".$out);
 88 | while (my $line=<MASTER>){
 89 | 	next if ($line=~ m/^#/);
 90 | 	chomp $line;
 91 | 	next unless $line;
 92 | 	$line=~ s/\r//g;
 93 | 
 94 | 	if ($seen{$line}){
 95 | 		print OUT $line."\t".$seen{$line}."\n";
 96 | 	}
 97 | 	else{
 98 | 		print OUT $line."\t0\n";
 99 | 	}
100 | }
101 | close MASTER;
102 | close OUT;
103 | exit;
104 | 


--------------------------------------------------------------------------------
/scripts/tally-weave:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan.
 4 | 
 5 | # This file is part of geo-omics-scripts.
 6 | 
 7 | # Geo-omics-scripts is free software: you can redistribute it and/or
 8 | # modify it under the terms of the GNU General Public License as published
 9 | # by the Free Software Foundation, either version 3 of the License, or (at
10 | # your option) any later version.
11 | 
12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 | # General Public License for more details.
16 | 
17 | # You should have received a copy of the GNU General Public License along
18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
19 | 
use strict;
use Getopt::Long;
use File::Basename;

# Combine all "*.<ext>" files of the current directory (two tab-separated
# columns: name, value) into one table with one row per name and one column
# per input file, followed by a column counting in how many of the files the
# name has a non-zero value.
#
# Options:
#   -e  extension of the input files, default "tally"
#   -o  output file (required)

my $ext="tally";
my $out;
GetOptions(
	'e:s'=>\$ext,
	'o:s'=>\$out,
);

# Without an output name the open below used to fail silently and all output
# was lost.
die "[error] No output file given, use -o <file>\n" unless (defined $out && length $out);

my @DBs;
# the input list is built before the output file is created
my @files=glob("*.".$ext);
open(my $outFH, ">", $out) || die "[error] $out: $! \n";
print {$outFH} "#Transcripts\t";
my %master;
print scalar(@files)." Files will be tallied...!\n";
foreach my $f(@files){
	# the column is named after the file, without the extension
	my $dbName=basename($f,"\.$ext"); #split(/\_/, $f);
	push(@DBs, $dbName);
	print {$outFH} $dbName."\t";
	open(my $fh, "<", $f) || die "[error] $f: $! \n";
	while (my $line=<$fh>){
		next if ($line=~ m/^#/);
		chomp $line;
		$line=~ s/\r//g;
		next unless $line;

		my @cols=split(/\t/, $line);
		$master{$cols[0]}{$dbName}=$cols[1];
	}
	close $fh;
}
print {$outFH} "DB-presence\n";

# sorted, so that repeated runs produce identical output
foreach my $key(sort keys %master){
	print {$outFH} $key."\t";
	my $total=0;
	foreach my $db(@DBs){
		# missing or false values are reported as 0
		my $v=$master{$key}{$db} || 0;
		print {$outFH} $v."\t";
		$total++ if($v != 0);
	}
	print {$outFH} $total."\n";
}
close($outFH) || die "[error] $out: $! \n";
68 | 


--------------------------------------------------------------------------------
/scripts/tallyWrap:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright 2013, 2014, 2015, 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | 
 21 | =head1 NAME
 22 | 
 23 | tallyWrap - count hits above given bit score
 24 | 
=head1 SYNOPSIS

In multiple blast outputs, count the number of times a query gets a hit above a certain bit score:
 27 | 
 28 |  B<tallyWrap> -ext blastp -m masterList_output -t combinedTally_output -s 40
 29 | 
 30 | In multiple tab delimited files, for each value in the first column, combine the values of the last column from each file into a combined tabular file:
 31 | 
 32 |  B<tallyWrap> -ext blastp -m masterList_output -t combinedTally_output -values
 33 | 
 34 | 
 35 | =head1 DEPENDENCIES
 36 | 
 37 |  getMasterList
 38 |  tally
 39 |  tally-weave
 40 | 
 41 | 
 42 | =head1 SEE ALSO
 43 | 
 44 | L<omics(1)>, L<illumina-reads-processing(7)>
 45 | 
 46 | =head2 Other local resources
 47 | 
 48 | =over
 49 | 
 50 | =item [1]
 51 | 
 52 | L<HTML documentation|file:///usr/share/doc/geo-omics-scripts/html/index.html>
 53 | 
 54 | =item [2]
 55 | 
 56 | L<Omics workflow documentation [PDF]|file:///usr/share/doc/geo-omics-scripts/Geomicro-Illumina-Reads-Processing-Pipeline.pdf>
 57 | 
 58 | =back
 59 | 
 60 | =head2 Web
 61 | 
 62 | =over
 63 | 
 64 | =item [3]
 65 | 
 66 | L<Workflow documentation [PDF]|https://drive.google.com/open?id=0BxFSivK8RfJed05wamtrbEVUeE0>
 67 | 
 68 | =item [4]
 69 | 
 70 | L<Website|http://www.earth.lsa.umich.edu/geomicrobiology/>
 71 | 
 72 | =item [5]
 73 | 
 74 | L<Github repository|https://github.com/Geo-omics/scripts>
 75 | 
 76 | =back
 77 | 
 78 | =cut
 79 | 
 80 | 
 81 | use strict;
 82 | use Getopt::Long;
 83 | use File::Spec::Functions;
 84 | use File::Which;
 85 | 
 86 | my $masterList;
 87 | my $ext="blastn";
 88 | my $combinedTally;
 89 | my $bs=0;
 90 | my $printValue; # column number (start counting from 1)
 91 | my $fasta; # query fasta
 92 | GetOptions(
 93 | 	'ext:s'=>\$ext,
 94 | 	'm:s'=>\$masterList,
 95 | 	't:s'=>\$combinedTally,
 96 | 	's:f'=>\$bs,
 97 | 	'value|values:i'=>\$printValue,
 98 | 	'f|fasta:s'=>\$fasta,
 99 | );
100 | 
101 | #Check for dependencies
102 | my @deps = ('getMasterList', 'tally', 'tally-weave');
103 | foreach $dep (@deps) {
104 |     my $p = catfile('.', $dep);
105 |     if (-x $p) {
106 |         $dep = $p;
107 |     } else {
108 |         $dep = which $dep or die "Could not find dependency: $dep";
109 |     }
110 | }
111 | my ($master_list_script, $tally_script, $tally_weave_script) = @deps;
112 | 
113 | my @files=glob("*.$ext");
114 | print "Creating MasterList\n";
115 | system("$master_list_script -o ".$masterList." -s ".$bs.($ext ? " -e $ext" : ""));
116 | foreach my $f(@files){
117 | 	print "\tTally: $f\n";
118 | 	my($name, $ext)=split(/\./, $f);
119 | 	my $tallyFile=$name.".tally";
120 | 	system("$tally_script -m ".$masterList." -i ".$f ." -o ".$tallyFile." -s ".$bs.($printValue ? " -values $printValue" : "").($fasta ? " -fasta $fasta":""));
121 | }
122 | print "Weaving all tally files...\n";
123 | system("$tally_weave_script -o ".$combinedTally);
124 | exit;
125 | 


--------------------------------------------------------------------------------
/scripts/tinySeq2fasta.xslt:
--------------------------------------------------------------------------------
 1 | <?xml version='1.0' ?>
 2 | <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
 3 | <!--
 4 | This stylesheet transforms a tinySeq XML file from NCBI to a Fasta file
 5 | 
 6 | USAGE: xsltproc tinySeq2fasta.xslt file.xml > output.txt
 7 | 
 8 | AUTHOR: Sunit Jain, sunitj [AT] umich [DOT] edu
 9 | CREATED: July 2014
10 | -->
11 | <xsl:output method="text"/>
12 | 
<!--
Sequence type to export.  Defaults to 'protein'.  To export nucleotide
sequences set the stylesheet parameter "seqtype" to 'nucleotide' (with
xsltproc: its stringparam option) instead of editing this file.
-->
<xsl:param name="seqtype" select="'protein'"/>

<!-- Match the root node -->
<xsl:template match="/">
<!-- One fasta record per TSeq element of the requested sequence type -->
<xsl:for-each select="//TSeq[TSeq_seqtype/@value=$seqtype]">
	<!-- header line: gi, accver and defline joined by "|", then tab-separated taxid, organism name and length -->
	<xsl:value-of select="concat('&gt;gi|',TSeq_gi,'|',TSeq_accver,'|',TSeq_defline)"/>
	<xsl:text>&#x9;</xsl:text>
	<xsl:value-of select="TSeq_taxid"/>
	<xsl:text>&#x9;</xsl:text>
	<xsl:value-of select="TSeq_orgname"/>
	<xsl:text>&#x9;</xsl:text>
	<xsl:value-of select="TSeq_length"/>
	<xsl:text>&#10;</xsl:text>
	<!-- the sequence follows on a line of its own -->
	<xsl:value-of select="TSeq_sequence"/>
	<xsl:text>&#10;</xsl:text>
	</xsl:for-each>
</xsl:template>
29 | </xsl:stylesheet>
30 | 


--------------------------------------------------------------------------------
/scripts/tinySeq2table.xslt:
--------------------------------------------------------------------------------
 1 | <?xml version='1.0' ?>
 2 | <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
 3 | <!--
 4 | This stylesheet transforms a tinySeq XML file from NCBI to a table(tab-delimited)
 5 | 
 6 | USAGE: xsltproc tinySeq2table.xslt file.xml > output.txt
 7 | 
 8 | AUTHOR: Sunit Jain, sunitj [AT] umich [DOT] edu
 9 | CREATED: July 2014
10 | -->
11 | <xsl:output method="text"/>
12 | 
<!--
Sequence type to export.  Defaults to 'protein'.  To export nucleotide
sequences set the stylesheet parameter "seqtype" to 'nucleotide' (with
xsltproc: its stringparam option) instead of editing this file.
-->
<xsl:param name="seqtype" select="'protein'"/>

<!-- Match the root node -->
<xsl:template match="/">
<!-- One tab-separated row per TSeq element of the requested sequence type.
     Columns: gi, accver, taxid, organism name, defline, length, sequence -->
<xsl:for-each select="//TSeq[TSeq_seqtype/@value=$seqtype]">
	<xsl:value-of select="TSeq_gi"/>
	<xsl:text>&#x9;</xsl:text>
	<xsl:value-of select="TSeq_accver"/>
	<xsl:text>&#x9;</xsl:text>
	<xsl:value-of select="TSeq_taxid"/>
	<xsl:text>&#x9;</xsl:text>
	<xsl:value-of select="TSeq_orgname"/>
	<xsl:text>&#x9;</xsl:text>
	<xsl:value-of select="TSeq_defline"/>
	<xsl:text>&#x9;</xsl:text>
	<xsl:value-of select="TSeq_length"/>
	<xsl:text>&#x9;</xsl:text>
	<xsl:value-of select="TSeq_sequence"/>
	<xsl:text>&#10;</xsl:text>
</xsl:for-each>
</xsl:template>
33 | </xsl:stylesheet>
34 | 


--------------------------------------------------------------------------------
/scripts/twitterscript.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" ?>
<!--
Module definition titled "Twitter Timeline" (presumably a gadget spec, the
consuming platform is not named in this file).  Its HTML content is a link to
the @umich_geomicro timeline plus an inline script that adds Twitter's
widgets.js to the page, unless an element with id "twitter-wjs" is already
present.
-->
<Module>
    <ModulePrefs title="Twitter Timeline"/>
    <Content type="html">
        <![CDATA[
        <a class="twitter-timeline" href="https://twitter.com/umich_geomicro" data-widget-id="635590709001621504">Tweets by @umich_geomicro</a>
        <script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
        ]]>
    </Content>
</Module>
11 | 


--------------------------------------------------------------------------------
/test/run:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Copyright 2019 Regents of The University of Michigan.
  4 | 
  5 | # This file is part of geo-omics-scripts.
  6 | 
  7 | # Geo-omics-scripts is free software: you can redistribute it and/or
  8 | # modify it under the terms of the GNU General Public License as published
  9 | # by the Free Software Foundation, either version 3 of the License, or (at
 10 | # your option) any later version.
 11 | 
 12 | # Geo-omics-scripts is distributed in the hope that it will be useful, but
 13 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 15 | # General Public License for more details.
 16 | 
 17 | # You should have received a copy of the GNU General Public License along
 18 | # with Geo-omics-scripts.  If not, see <https://www.gnu.org/licenses/>.
 19 | 
 20 | set -ue
 21 | # shellcheck disable=SC2034
 22 | USAGE="[--help|-h|-t] [-- [<first>]-[<last>]]"
 23 | # shellcheck disable=SC2034
 24 | SHORT_OPTIONS=t:
 25 | # shellcheck disable=SC2034
 26 | LONG_OPTIONS=threads:
 27 | # shellcheck disable=SC2034
 28 | HELP="Run omics SOP tests"
 29 | handle_options () {
 30 |     if [ "$#" -gt 0 ]; then
 31 |         case "$1" in
 32 |             -t|--threads)
 33 |                 THREADS=$2
 34 |                 return 2;;
 35 |         esac
 36 |     else
 37 |         return 0
 38 |     fi
 39 | }
 40 | 
 41 | # default values
 42 | STEPS=(prep qc assemble mapping binning)
 43 | THREADS=32
 44 | DATA_DIR=data.01
 45 | TMPDIR=/tmp
 46 | 
 47 | # shellcheck disable=SC1090
 48 | . "$(dirname "$0")/../share/geo-omics-scripts/liba.sh" || (echo "Failed to source script library"; exit 1)
 49 | 
 50 | # set script name here to prefix script output more meaningfully
 51 | # shellcheck disable=2034
 52 | SCRIPT_NAME="omics test"
 53 | 
 54 | if [ "$#" -gt 0 ]; then
 55 |     if echo "$1" | grep -q -; then
 56 |         first=$(echo "$1" | cut -d- -f1)
 57 |         last=$(echo "$1" | cut -d- -f2)
 58 |     else
 59 |         first=$1
 60 |         last=$1
 61 |     fi
 62 | fi
 63 | 
 64 | # The default: run all steps
 65 | first=${first:-${STEPS[0]}}
 66 | last=${last:-${STEPS[-1]}}
 67 | 
 68 | pick_step=false
 69 | last_ok=false
 70 | steps=()
 71 | for i in "${STEPS[@]}"; do
 72 |     [ "$i" == "$first" ] && pick_step=true
 73 |     $pick_step && steps+=($i)
 74 |     [ "$i" == "$last" ] && last_ok=true && break
 75 | done
 76 | 
 77 | [ ${#steps[@]} == 0 ] && abort "Unknown first step: $first"
 78 | $last_ok || abort "Unknown last step: $last"
 79 | 
 80 | data=($(find "$(realpath "$(dirname "$0")")"/$DATA_DIR -type f -name "*.fastq.gz"))
 81 | 
 82 | do_prep () {
 83 |     omics prep --cpus $THREADS "${data[@]}"
 84 | }
 85 | 
 86 | do_qc () {
 87 |     omics qc -t $THREADS 66*
 88 | }
 89 | 
 90 | do_assemble () {
 91 |     omics assemble --cpus $THREADS --skip-phylosift --megahit 66*
 92 | }
 93 | 
 94 | do_mapping () {
 95 |     omics mapping --cpus $THREADS --chop --index-only
 96 |     for i in 66*; do (
 97 |         cd "$i"
 98 |         omics mapping --cpus $THREADS --index-dir ../bowtie2-index \
 99 |             -a ../assembly.chop.fa \
100 |             -f fwd.good.fastq -r rev.good.fastq
101 |         cd ..
102 |     ) done
103 | }
104 | 
# Run the binning step with the chopped assembly on everything in the
# current directory whose name starts with "66".
# Globals: THREADS (read)
do_binning () {
    omics binning --cpus "$THREADS" --assembly assembly.chop.fa 66*
}
108 | 
# Run the selected steps in order by calling the do_<step> function of each.
# Globals: steps (read)
do_steps () {
    # local, so the loop variable is not shared with the callees or the
    # top-level code
    local step
    for step in "${steps[@]}"; do
        case "$step" in
            prep|qc|assemble|mapping|binning)
                "do_$step"
                ;;
            *)
                abort "runtime error: illegal step: $step"
                ;;
        esac
    done
}
121 | 
# Run the selected steps, timed, in a fresh working directory below $TMPDIR.
# The directory is not removed afterwards so that the results can be
# inspected.
tmpdir=$(mktemp -d --tmpdir="$TMPDIR" omics_test.XXXXXXX)
info "Created working directory: $tmpdir"
cd "$tmpdir"
time do_steps
info "Ran $first to $last, results in $tmpdir"
127 | 
128 | 


--------------------------------------------------------------------------------