├── .DS_Store
├── .gitmodules
├── Configuration_example
├── LICENSE
├── Outdated
    ├── Placeholder
    ├── dDocent.GATK
    └── install_dDocent.GATK_requirements
├── README.md
├── Rename_for_dDocent.sh
├── Rename_for_dDocent_with-INDEX.sh
├── Sample Comparsion.png
├── dDocent
├── dDocent_ngs.sh
├── install_dDocent_requirements
├── logo.png
├── scripts
    ├── .DS_Store
    ├── ErrorCount.sh
    ├── RefMapOpt.sh
    ├── ReferenceOpt.sh
    ├── Rename_SequenceFiles.sh
    ├── dDocent_filters
    ├── filter_hwe_by_pop.pl
    ├── filter_missing_ind.sh
    ├── pop_missing_filter.sh
    ├── remake_reference.sh
    ├── remove.bad.hap.loci.sh
    ├── test
    └── untested
    │   ├── .DS_Store
    │   ├── BS_reference_to_fasta.sh
    │   ├── ErrorCount.sh
    │   ├── Filter_VCF_best_SNP_per_contig.sh
    │   ├── Filter_one_random_snp_per_contig.sh
    │   ├── ReferenceOpt.sh
    │   ├── VCFtoOutlierOnly.sh
    │   ├── count_hets.sh
    │   ├── count_loci
    │   ├── dDocent_filters
    │   ├── dDocent_filters_lite
    │   ├── dup_sample_filter.sh
    │   ├── filter_missing_ind.sh
    │   ├── filter_paralog.sh
    │   ├── maf_gp_filter.sh
    │   ├── multi.maf.sh
    │   ├── old
    │       ├── BS_reference_to_fasta.sh
    │       ├── ErrorCount.sh
    │       ├── FB_filters.sh
    │       ├── FB_filters4
    │       ├── LositantoOutlierVCF.sh
    │       ├── NI.dDocent.sh
    │       ├── Rename_for_dDocent.sh
    │       ├── ToSNP.sh
    │       ├── VCFtoOutlierOnly.sh
    │       ├── coverage.sh
    │       ├── dDocent.sh
    │       ├── dDocent.sh.backup
    │       ├── filter_lowdepth_ind.sh
    │       ├── filter_missing_ind.sh
    │       ├── filter_paralog.sh
    │       ├── filter_vcf.sh
    │       ├── hwe_filter.sh
    │       ├── jpuritz@gdcws1.ethz.ch
    │       ├── lowQD.sh
    │       ├── pop_missing_filter.sh
    │       ├── reference.sh
    │       ├── reftest.old.sh
    │       ├── reftest.sh
    │       └── remove_bad_loci_for_hap.sh
    │   ├── pi_sample.sh
    │   ├── pop_missing_filter.sh
    │   ├── remove.bad.hap.loci.sh
    │   ├── untested.new
    │   └── vardist.sh
└── tutorials
    ├── .DS_Store
    ├── FilterTut
    ├── Filtering Tutorial.md
    ├── README.md
    ├── RefTut
    └── Reference Assembly Tutorial.md


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpuritz/dDocent/414d6f3f3fba0be75d01da2797629c3808571113/.DS_Store


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "rad_haplotyper"]
2 | 	path = rad_haplotyper
3 | 	url = https://github.com/chollenbeck/rad_haplotyper.git
4 | 


--------------------------------------------------------------------------------
/Configuration_example:
--------------------------------------------------------------------------------
 1 | Number of Processors
 2 | 24
 3 | Maximum Memory
 4 | 24
 5 | Trimming
 6 | yes
 7 | Assembly?
 8 | yes
 9 | Type_of_Assembly
10 | PE
11 | Clustering_Similarity%
12 | 0.88
13 | Minimum within individual coverage level to include a read for assembly (K1)
14 | 5
15 | Minimum number of individuals a read must be present in to include for assembly (K2)
16 | 6
17 | Mapping_Reads?
18 | yes
19 | Mapping_Match_Value
20 | 1
21 | Mapping_MisMatch_Value
22 | 3
23 | Mapping_GapOpen_Penalty
24 | 5
25 | Calling_SNPs?
26 | yes
27 | Email
28 | jpuritz@gmail.com
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2013 jpuritz
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/Outdated/Placeholder:
--------------------------------------------------------------------------------
1 | Placeholder
2 | 


--------------------------------------------------------------------------------
/Outdated/install_dDocent.GATK_requirements:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | ###Script to check for and install most of the required software for dDocent
  4 | 
  5 | if [[ -z "$1" ]]; then
  6 | echo "Correct usage is sh install_dDocent_requirements [directory in your computer or user PATH]"
  7 | echo "if installing for all users, best to run this script as root"
  8 | exit 1
  9 | fi
 10 | 
 11 | INSTALL_PATH=$1
 12 | 
 13 | 
 14 | 
 15 | if type -p java; then
 16 |     _java=java
 17 | elif [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]];  then
 18 |     _java="$JAVA_HOME/bin/java"
 19 | else
 20 |     echo "no java"
 21 | fi
 22 | 
 23 | if [[ "$_java" ]]; then
 24 |     version=$("$_java" -version 2>&1 | awk -F '"' '/version/ {print $2}')
 25 |     echo version "$version"
 26 |     if [[ "$version" > "1.7" ]]; then
 27 |         echo version is greater than 1.7
 28 |     else         
 29 |         echo version is less than 1.7 please upgrade version
 30 |         exit 1
 31 |     fi
 32 | fi
 33 | 
 34 | echo "Checking for required software"
 35 | 
 36 | 
 37 | if [ -f $INSTALL_PATH/GenomeAnalysisTK.jar ]; then
 38 |     echo "GATK is already installed"
 39 | else
 40 | 	echo "Please install GATK.  Follow this link to download and install. http://www.broadinstitute.org/gatk/auth?package=GATK"
 41 | fi
 42 | 
 43 | echo "Checking for STACKS"
 44 | if which clone_filter >/dev/null; then
 45 |     echo "STACKS is already installed"
 46 | else
 47 | 	echo "Downloading and installing STACKS components clone_filter and process_radtags"
 48 | 	curl -O http://creskolab.uoregon.edu/stacks/source/stacks-1.12.tar.gz
 49 | 	tar xvzf stacks-1.12.tar.gz
 50 | 	cd stacks-1.12
 51 | 	./configure
 52 | 	make
 53 | 	cp clone_filter process_radtags $INSTALL_PATH
 54 | 	cd ..
 55 | fi
 56 | 
 57 | echo "Checking for FreeBayes"
 58 | if which freebayes >/dev/null; then
 59 |     echo "FreeBayes is already installed"
 60 | else
 61 |         echo "Downloading and installing FreeBayes"
 62 | 	git clone --recursive git://github.com/ekg/freebayes.git
 63 |         cd freebayes
 64 |         make
 65 | 	cd bin
 66 | 	cp * $INSTALL_PATH
 67 | 	cd ..
 68 | 	cd ..
 69 | fi
 70 | echo "Checking for cutadapt"
 71 | if which cutadapt >/dev/null; then
 72 |     echo "cutadapt is already installed"
 73 | else
 74 | 	"Downloading and installing cutadapt"
 75 | 	curl -O  http://cutadapt.googlecode.com/files/cutadapt-1.2.1.tar.gz
 76 | 	tar xvzf cutadapt-1.2.1.tar.gz
 77 | 	cd cutadapt-1.2.1
 78 | 	python setup.py build
 79 | 	python setup.py build_ext -i
 80 | 	python setup.py install
 81 | 	cd ..
 82 | fi
 83 | 
 84 | echo "Checking for fastqc"
 85 | if which cutadapt >/dev/null; then
 86 |     echo "fastqc is already installed"
 87 | else
 88 | 	"Downloading and installing fastqc"
 89 | 	curl -O http://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.10.1.zip
 90 | 	unzip fastqc_v0.10.1.zip
 91 | 	cd FastQC
 92 | 	cp fastqc $INSTALL_PATH
 93 | 	cd ..
 94 | fi
 95 | 
 96 | echo "Checking for trim_galore"
 97 | if which trim_galore >/dev/null; then
 98 |     echo "trim_galore is already installed"
 99 | else
100 | 	"Downloading and installing trim_galore"
101 | 	curl -O http://www.bioinformatics.babraham.ac.uk/projects/trim_galore/trim_galore_v0.3.3.zip
102 | 	unzip trim_galore_v0.3.3.zip
103 | 	cp trim_galore $INSTALL_PATH
104 | fi
105 | 
106 | echo "Checking for mawk"
107 | if which mawk >/dev/null; then
108 |     echo "mawk is already installed"
109 | else
110 | 	"Downloading and installing mawk"
111 | 	curl -O http://invisible-island.net/datafiles/release/mawk.tar.gz
112 | 	tar xvzf mawk.tar.gz
113 | 	cd mawk-1.*
114 | 	./configure
115 | 	make
116 | 	cp mawk $INSTALL_PATH
117 | 	cd ..
118 | fi
119 | 
120 | echo "Checking for bwa"
121 | if which bwa >/dev/null; then
122 |     echo "bwa is already installed"
123 | else
124 |     echo "Downloading and installing bwa"
125 | 	git clone https://github.com/lh3/bwa.git
126 | 	cd bwa
127 | 	make
128 | 	cp bwa $INSTALL_PATH
129 | 	cd ..
130 | fi
131 | 
132 | 
133 | echo "Checking for samtools"
134 | if which samtools >/dev/null; then
135 |     echo "samtools is already installed"
136 | else
137 | 	"Downloading and installing samtools"
138 | 	curl -L -o samtools-0.x.tar.bz2 http://sourceforge.net/projects/samtools/files/latest/download?source=files
139 | 	tar xvjf samtools-0.x.tar.bz2
140 | 	cd samtools-0.1*
141 | 	make
142 | 	cp samtools $INSTALL_PATH
143 | 	cd ..
144 | fi
145 | 
146 | echo "Checking for VCFtools"
147 | if which vcftools >/dev/null; then
148 |     echo "VCFtools is already installed"
149 | else
150 | 	echo "Downloading and installing VCFtools"
151 | 	curl -L -o vcftools_x.tar.gz http://downloads.sourceforge.net/project/vcftools/vcftools_0.1.11.tar.gz?r=http%3A%2F%2Fsourceforge.net%2Fprojects%2Fvcftools%2Ffiles%2F%3Fsource%3Dnavbar&ts=1398959218&use_mirror=hivelocity
152 | 	tar xvzf vcftools_x.tar.gz
153 | 	cd vcftools_0*
154 | 	make
155 | 	cp ./bin/vcftools $INSTALL_PATH
156 | 	cd ..
157 | fi
158 | 
159 | echo "Checking for Rainbow"
160 | if which rainbow >/dev/null; then
161 |     echo "Rainbow is already installed"
162 | else
163 | 	echo "Downloading and installing Rainbow"
164 | 	curl -L -o rainbow.x.tar.gz http://sourceforge.net/projects/bio-rainbow/files/latest/download?source=files
165 | 	tar xvzf rainbow.x.tar.gz
166 | 	cd rainbow_*
167 | 	make
168 | 	cp rainbow rbasm rbmergetag select_* $INSTALL_PATH	
169 | 	cd ..
170 | fi
171 | 
172 | echo "Checking for seqtk"
173 | if which seqtk >/dev/null; then
174 |     echo "seqtk is already installed"
175 | else
176 |     	echo "Downloading and installing seqtk"
177 | 	git clone https://github.com/lh3/seqtk.git
178 | 	cd seqtk
179 | 	make
180 | 	cp seqtk $INSTALL_PATH	
181 | 	cd ..
182 | fi
183 | 
184 | echo "Checking for cd-hit"
185 | if which cd-hit-est >/dev/null; then
186 |     echo "cd-hit is already installed"
187 | else
188 | 	echo "Downloading and installing cd-hit"
189 | 	curl -O http://www.bioinformatics.org/downloads/index.php/cd-hit/cd-hit-v4.5.4-2011-03-07.tgz
190 | 	tar xvzf cd-hit-v4.5.4-2011-03-07.tgz
191 | 	cd cd-hit-v4.5.4-2011-03-07
192 | 	make openmp=yes
193 | 	cp cd-hit-est cd-hit-est-2d $INSTALL_PATH
194 | 	cd ..
195 | fi
196 | 
197 | echo "Checking for mergefq.pl"
198 | if [ -f $INSTALL_PATH/mergefq.pl ]; then
199 | 	echo "mergefq.pl already installed"
200 | else
201 | 	curl -O https://github.com/jpuritz/dDocent/raw/master/mergefq.pl	
202 | 	cp mergefq.pl $INSTALL_PATH
203 | fi
204 | 
205 | echo "Checking for Seq_filter.pl"
206 | if [ -f $INSTALL_PATH/Seq_filter.pl ]; then
207 | 	echo "Seq_filter.pl already installed"
208 | else
209 | 	curl -O http://seq-filter.googlecode.com/files/Seq_filter.pl
210 | 	cp Seq_filter.pl $INSTALL_PATH
211 | fi
212 | 
213 | echo "Checking for cutseq_fasta.pl"
214 | if [ -f $INSTALL_PATH/cutseq_fasta.pl ]; then
215 | 	echo "cutseq_fasta.pl already installed"
216 | else
217 | 	curl -O http://nash-bioinformatics-codelets.googlecode.com/files/cutseq_fasta.pl
218 | 	cp cutseq_fasta.pl $INSTALL_PATH
219 | fi
220 | 
221 | echo "Checking for bedtools"
222 | if which bamToBed >/dev/null; then
223 |     	echo "bedtools is already installed"
224 | else
225 | 	curl -L -O https://github.com/arq5x/bedtools2/releases/download/v2.19.1/bedtools-2.19.1.tar.gz
226 | 	tar xvzf bedtools-2.19.1.tar.gz
227 | 	cd bedtools2-2.19.1
228 | 	make 
229 | 	cd bin
230 | 	cp * $INSTALL_PATH
231 | 	cd ..
232 | 	cd ..
233 | fi
234 | 
235 | echo "Now installing dDocent"
236 | 	curl -L -O https://github.com/jpuritz/dDocent/raw/master/dDocent.GATK
237 | 	chmod +x dDocent.GATK
238 | 	cp dDocent.GATK $INSTALL_PATH
239 | 
240 | 
241 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![logo](logo.png)
 2 | _RADSeq Bioinformatics and Beyond_
 3 | 
 4 | [![alt text](https://anaconda.org/bioconda/ddocent/badges/downloads.svg)](https://anaconda.org/bioconda/ddocent) 
 5 | [![dDocent documentation](https://img.shields.io/badge/documentation-website-informational?logo=Read%20The%20Docs&logoColor=white)](https://www.ddocent.com)
 6 | 
 7 | dDocent is simple bash wrapper to QC, assemble, map, and call SNPs from almost any kind of RAD sequencing. If you have a reference already, dDocent can be used to call SNPs from almost any type of NGS data set. It is designed to run on Linux based machines with large memory capacity and multiple processing cores, and it can be modified for use on HPC. 
 8 | 
 9 | ## Installing
10 | 
11 | ### with `conda` (recommended)
12 | ```bash
13 | > conda install -c bioconda ddocent
14 | 
15 | # or into a fresh environment #
16 | 
17 | > conda create -n environmentname -c bioconda ddocent
18 | ```
19 | 
20 | #### If you are getting samtools errors:
21 | ```bash
22 | > conda install -c bioconda ddocent 'samtools>=1.10'
23 | 
24 | # or into a fresh environment #
25 | 
26 | > conda create -n environmentname -c bioconda ddocent 'samtools>=1.10'
27 | ```
28 | Alternatively, it's recommended to use [mamba](https://github.com/mamba-org/mamba) as your conda solver.
29 | 
30 | ### manually (no longer directly supported)
31 | ```bash
32 | > git clone https://github.com/jpuritz/dDocent.git
33 | 
34 | > cd dDocent
35 | 
36 | > chmod +x ./install_dDocent_requirements
37 | 
38 | > sh ./install_dDocent_requirements
39 | ```
40 | 
41 | ## How does dDocent compare?
42 | 
43 | ![ngs comparison](https://github.com/jpuritz/dDocent/blob/master/Sample%20Comparsion.png)
44 | 
45 | ## Citing dDocent
46 | Puritz JB, Hollenbeck CM, Gold JR. 2014. dDocent: a RADseq, variant-calling pipeline designed for population genomics of non-model organisms. PeerJ 2:e431 https://doi.org/10.7717/peerj.431
47 | 
48 | -----
49 | 
50 | _The "d" is silent_ 🤫
51 | 


--------------------------------------------------------------------------------
/Rename_for_dDocent.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | if [ -z "$1" ]
 4 | then
 5 | echo "No file with barcodes and sample names specified."
 6 | echo "Correct usage: Rename_for_dDocent.sh barcodefile"
 7 | exit 1
 8 | else
 9 | NAMES=( `cut -f1  $1 `)
10 | BARCODES=( `cut -f2 $1 `)
11 | LEN=( `wc -l $1 `)
12 | LEN=$(($LEN - 1))
13 | 
14 | echo ${NAMES[0]}
15 | echo ${BARCODES[0]}
16 | 
17 | for ((i = 0; i <= $LEN; i++));
18 | do
19 | if [ -f sample_${BARCODES[$i]}.2.fq.gz ]; then
20 | mv sample_${BARCODES[$i]}.1.fq.gz ${NAMES[$i]}.F.fq.gz
21 | mv sample_${BARCODES[$i]}.2.fq.gz ${NAMES[$i]}.R.fq.gz
22 | else
23 | mv sample_${BARCODES[$i]}.fq.gz ${NAMES[$i]}.F.fq.gz
24 | fi
25 | done
26 | fi


--------------------------------------------------------------------------------
/Rename_for_dDocent_with-INDEX.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | if [ -z "$1" ]
 4 | then
 5 | echo "No file with barcodes and sample names specified."
 6 | echo "Correct usage: Rename_for_dDocent.sh barcodefile"
 7 | exit 1
 8 | else
 9 | NAMES=( `cut -f1  $1 `)
10 | BARCODES=( `cut -f2 $1 `)
11 | BARCODES2=( `cut -f3 $1 `)
12 | LEN=( `wc -l $1 `)
13 | LEN=$(($LEN - 1))
14 | 
15 | echo ${NAMES[0]}
16 | echo ${BARCODES[0]}
17 | 
18 | for ((i = 0; i <= $LEN; i++));
19 | do
20 | mv sample_${BARCODES[$i]}-${BARCODES2[$i]}.1.fq.gz ${NAMES[$i]}.F.fq.gz
21 | mv sample_${BARCODES[$i]}-${BARCODES2[$i]}.2.fq.gz ${NAMES[$i]}.R.fq.gz
22 | done
23 | fi
24 | 


--------------------------------------------------------------------------------
/Sample Comparsion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpuritz/dDocent/414d6f3f3fba0be75d01da2797629c3808571113/Sample Comparsion.png


--------------------------------------------------------------------------------
/install_dDocent_requirements:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | ###Script to check for and install most of the required software for dDocent
  4 | 
  5 | if [ -z "$1" ]; then
  6 | echo "Correct usage is sh install_dDocent_requirements [directory in your computer or user PATH]"
  7 | echo "if installing for all users, best to run this script as root"
  8 | exit 1
  9 | fi
 10 | 
 11 | INSTALL_PATH=$1
 12 | 
 13 | export PATH=$INSTALL_PATH:$PATH
 14 | 
 15 | echo "Checking for required software"
 16 | 
 17 | echo "Checking for cmake"
 18 | if which cmake &>/dev/null; then
 19 |     echo "cmake is already installed"
 20 | else
 21 | 	echo "Downloading and installing cmake"
 22 | 	curl -O https://cmake.org/files/v3.4/cmake-3.4.0-rc3.tar.gz
 23 | 	tar xvzf cmake-3.4.0-rc3.tar.gz
 24 | 	cd cmake-3.4.0-rc3
 25 | 	./bootstrap --prefix=$INSTALL_PATH
 26 | 	make
 27 | 	make install
 28 | 	ln -s $INSTALL_PATH/bin/* $INSTALL_PATH
 29 | 	cd ..
 30 | fi
 31 | 
 32 | echo "Checking for FreeBayes"
 33 | if which freebayes &>/dev/null; then
 34 |     echo "FreeBayes is already installed"
 35 | else
 36 |         echo "Downloading and installing FreeBayes"
 37 | 	git clone --recursive https://github.com/ekg/freebayes.git
 38 | 	cd freebayes
 39 |         make
 40 | 	cd bin
 41 | 	chmod +x *
 42 | 	cp -f * $INSTALL_PATH
 43 | 	cd ..
 44 | 	cd ./vcflib/
 45 | 	make
 46 | 	chmod +x ./bin/*
 47 | 	cp -f ./bin/* $INSTALL_PATH
 48 | 	cd ..
 49 | 	cd ..
 50 | fi
 51 | echo "Checking for vcflib"
 52 | if which vcfcombine &>/dev/null; then
 53 |     echo "vcflib is already installed"
 54 | else
 55 |         echo "Downloading and installing vcflib"
 56 | 	git clone --recursive git://github.com/ekg/freebayes.git
 57 |         cd freebayes
 58 | 	cd ./vcflib/
 59 | 	make
 60 | 	chmod +x ./bin/*
 61 | 	cp -f ./bin/* $INSTALL_PATH
 62 | 	cd ..
 63 | 	cd ..
 64 | fi
 65 | 
 66 | echo "Checking for Trimmomatic"
 67 | 
 68 | if find ${PATH//:/ } -maxdepth 1 -name trimmomatic*jar 2> /dev/null | awk '/./' ; then
 69 |     echo "Trimmomatic is already installed"
 70 | else
 71 | 	echo "Downloading and installing Trimmomatic"
 72 | 	curl -O http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.33.zip
 73 | 	unzip Trimmomatic-0.33.zip
 74 | 	cd Trimmomatic-0.33
 75 | 	cp -f ./adapters/* $INSTALL_PATH
 76 | 	cp -f trimmomatic-0.33.jar $INSTALL_PATH
 77 | 	cd ..
 78 | fi
 79 | 
 80 | if find ${PATH//:/ } -maxdepth 1 -name TruSeq2-PE.fa 2> /dev/null | grep -q '.' ; then
 81 |      echo "Trimmomatic adapters already installed"
 82 | else
 83 |     	echo "Downloading and installing Trimmomatic"
 84 |         curl -O http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.33.zip
 85 |         unzip Trimmomatic-0.33.zip
 86 |         cd Trimmomatic-0.33
 87 |         cp -f ./adapters/* $INSTALL_PATH
 88 |         cp -f trimmomatic-0.33.jar $INSTALL_PATH
 89 | 	cd ..
 90 | fi
 91 | 
 92 | 
 93 | echo "Checking for mawk"
 94 | if which mawk &>/dev/null; then
 95 |     echo "mawk is already installed"
 96 | else
 97 | 	echo "Downloading and installing mawk"
 98 | 	curl -O http://invisible-island.net/datafiles/release/mawk.tar.gz
 99 | 	tar xvzf mawk.tar.gz
100 | 	cd mawk-1.*
101 | 	./configure
102 | 	make
103 | 	chmod +x mawk
104 | 	cp -f mawk $INSTALL_PATH
105 | 	cd ..
106 | fi
107 | 
108 | echo "Checking for bwa"
109 | if which bwa &>/dev/null; then
110 |     echo "bwa is already installed"
111 | else
112 |     echo "Downloading and installing bwa"
113 | 	git clone https://github.com/lh3/bwa.git
114 | 	cd bwa
115 | 	make
116 | 	chmod +x bwa
117 | 	cp -f bwa $INSTALL_PATH
118 | 	cd ..
119 | fi
120 | 
121 | echo "Checking for samtools"
122 | if which samtools &>/dev/null; then
123 |     echo "samtools is already installed"
124 | else
125 | 	echo "Downloading and installing samtools"
126 | 	curl -L -o samtools-1.3.1.tar.bz2 "https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2"
127 | 	tar xvjf samtools-1.3.1.tar.bz2 
128 | 	cd samtools-1.3.1/
129 | 	./configure
130 | 	make
131 | 	chmod +x samtools
132 | 	cp -f samtools $INSTALL_PATH
133 | 	cd ..
134 | fi
135 | 
136 | echo "Checking for VCFtools"
137 | if which vcftools &>/dev/null; then
138 |     echo "VCFtools is already installed"
139 | else
140 | 	echo "Downloading and installing VCFtools"
141 | 	curl -L -o vcftools_x.tar.gz "https://github.com/vcftools/vcftools/releases/download/v0.1.14/vcftools-0.1.14.tar.gz"	
142 | 	tar xvzf vcftools_x.tar.gz
143 | 	cd vcftools-*
144 | 	./configure
145 | 	make
146 | 	chmod +x ./src/cpp/vcftools
147 | 	cp -f ./src/cpp/vcftools $INSTALL_PATH
148 | 	cd ..
149 | fi
150 | 
151 | echo "Checking for Rainbow"
152 | if which rainbow &>/dev/null; then
153 |         VER=(`rainbow | head -1 | cut -f2 -d' ' `)	
154 | 	if [[ "$VER" == "2.0.2" || "$VER" == "2.0.3" || "$VER" == "2.0.4" ]]; then
155 |         	echo "Rainbow is already installed"
156 | 	else
157 | 		echo "Downloading and installing Rainbow"
158 |         	curl -L -o rainbow.x.tar.gz "http://sourceforge.net/projects/bio-rainbow/files/latest/download?source=files"
159 |         	tar xvzf rainbow.x.tar.gz
160 |         	cd rainbow_*
161 |         	make
162 |         	chmod +x rainbow rbasm rbmergetag select_*
163 |         	cp -f rainbow rbasm rbmergetag select_* $INSTALL_PATH
164 | 		cd ..
165 | 	fi
166 | else
167 | 	echo "Downloading and installing Rainbow"
168 | 	curl -L -o rainbow.x.tar.gz "http://sourceforge.net/projects/bio-rainbow/files/latest/download?source=files"
169 | 	tar xvzf rainbow.x.tar.gz
170 | 	cd rainbow_*
171 | 	make
172 | 	chmod +x rainbow rbasm rbmergetag select_*
173 | 	cp -f rainbow rbasm rbmergetag select_* $INSTALL_PATH	
174 | 	cd ..
175 | fi
176 | 
177 | echo "Checking for gnuplot"
178 | if which gnuplot &>/dev/null; then
179 | 	echo "gnuplot is already installed"
180 | else
181 | 	echo "Downliading and installing gnuplot"
182 | 	curl -L -o gnuplot.x.tar.gz "http://sourceforge.net/projects/gnuplot/files/latest/download?source=files"
183 | 	tar xvzf gnuplot.x.tar.gz 
184 | 	cd gnuplot-* 
185 | 	current=`pwd` 
186 | 	./configure --prefix=$current/gnuplot_install 
187 | 	make install
188 | 	chmod +x gnuplot_install/bin/gnuplot
189 | 	cp -f gnuplot_install/bin/gnuplot $INSTALL_PATH 
190 | 	cd .. 
191 | fi
192 | 
193 | echo "Checking for seqtk"
194 | if which seqtk &>/dev/null; then
195 |     echo "seqtk is already installed"
196 | else
197 |     	echo "Downloading and installing seqtk"
198 | 	git clone https://github.com/lh3/seqtk.git
199 | 	cd seqtk
200 | 	make
201 | 	chmod +x seqtk
202 | 	cp -f seqtk $INSTALL_PATH	
203 | 	cd ..
204 | fi
205 | 
206 | echo "Checking for cd-hit"
207 | if which cd-hit-est &>/dev/null; then
208 |     echo "cd-hit is already installed"
209 | else
210 | 	echo "Downloading and installing cd-hit"
211 | 	curl -L -O https://github.com/weizhongli/cdhit/releases/download/V4.6.6/cd-hit-v4.6.6-2016-0711.tar.gz
212 | 	tar xvzf cd-hit-v4.6.6-2016-0711.tar.gz
213 | 	cd cd-hit-v4.6.6-2016-0711
214 | 	make openmp=yes
215 | 	chmod +x cd-hit-est cd-hit-est-2d
216 | 	cp -f cd-hit-est cd-hit-est-2d $INSTALL_PATH
217 | 	cd ..
218 | fi
219 | 
220 | echo "Checking for bedtools"
221 | if which bamToBed &>/dev/null; then
222 |     	echo "bedtools is already installed"
223 | else
224 | 	curl -L -O https://github.com/arq5x/bedtools2/releases/download/v2.26.0/bedtools-2.26.0.tar.gz
225 | 	tar xvzf bedtools-2.26.0.tar.gz 
226 | 	cd bedtools2
227 | 	make 
228 | 	cd bin
229 | 	chmod +x *
230 | 	cp -f * $INSTALL_PATH
231 | 	cd ..
232 | 	cd ..
233 | fi
234 | 
235 | echo "Checking for bedops"
236 | if which bedops &>/dev/null; then
237 | 	echo "bedops is already installed"
238 | else
239 | 	git clone https://github.com/bedops/bedops.git
240 | 	cd bedops
241 | 	make all
242 | 	make install
243 | 	cp -f bin/* $INSTALL_PATH
244 | 	cd ..
245 | fi
246 | 
247 | echo "Checking for bamtools"
248 | if which bamtools &>/dev/null; then
249 | 	echo "bamtools is already installed"
250 | else
251 | 	git clone https://github.com/pezmaster31/bamtools.git
252 | 	cd bamtools
253 | 	mkdir build
254 | 	cd build
255 | 	cmake ..
256 | 	make
257 | 	cd ../bin
258 | 	chmod +x bamtools
259 | 	cp -f bamtools $INSTALL_PATH
260 | 	cd ..
261 | 	cd ..
262 | fi
263 | 
264 | echo "Checking for GNU-parallel"
265 | if which parallel &>/dev/null; then
266 |     	echo "GNU-parallel is already installed"
267 | else
268 | 	curl -L -O http://ftp.gnu.org/gnu/parallel/parallel-latest.tar.bz2
269 | 	tar xvjf parallel-latest.tar.bz2
270 | 	rm parallel-latest.tar.bz2
271 | 	cd parallel*
272 | 	./configure && make
273 | 	chmod +x ./src/parallel ./src/sem ./src/niceload ./src/sql
274 | 	cp -f ./src/parallel ./src/sem ./src/niceload ./src/sql $INSTALL_PATH
275 | 	cd ..
276 | fi
277 | 
278 | echo "Checking for PEAR (read merger)"
279 | if which pearRM &>/dev/null; then
280 | 	echo "pear is already installed"
281 | else
282 | 	curl -L -O http://sco.h-its.org/exelixis/web/software/pear/files/pear-0.9.6-bin-64.tar.gz
283 | 	tar vxzf pear-0.9.6-bin-64.tar.gz 
284 | 	cd pear-0.9.6-bin-64
285 | 	cp -f pear-0.9.6-bin-64 pearRM
286 | 	chmod +x pearRM
287 | 	cp -f pearRM $INSTALL_PATH
288 | 	cd ..
289 | fi	
290 | 	
291 | echo "Now installing dDocent"
292 | 	curl -L -O https://github.com/jpuritz/dDocent/raw/master/dDocent
293 | 	chmod +x dDocent
294 | 	cp -f dDocent $INSTALL_PATH
295 | 


--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpuritz/dDocent/414d6f3f3fba0be75d01da2797629c3808571113/logo.png


--------------------------------------------------------------------------------
/scripts/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpuritz/dDocent/414d6f3f3fba0be75d01da2797629c3808571113/scripts/.DS_Store


--------------------------------------------------------------------------------
/scripts/ErrorCount.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | export LC_ALL=en_US.UTF-8
 3 | 
 4 | echo "This script counts the number of potential genotyping errors due to low read depth"
 5 | echo "It report a low range, based on a 50% binomial probability of observing the second allele in a heterozygote and a high range based on a 25% probability."
 6 | 
 7 | R1H01S0=$(grep -oh '0[/|]1:1:' $1 | wc -l) 
 8 | R1H01S1=$(grep -oh '1[/|]0:1:' $1 | wc -l) 
 9 | R1H0S1=$(grep -oh '0[/|]0:1:' $1 | wc -l) 
10 | R1H1S1=$(grep -oh '1[/|]1:1:' $1 | wc -l) 
11 | 
12 | R1GEN=$(python -c "print($R1H01S0+$R1H01S1+$R1H0S1+$R1H1S1)")
13 | R1ERR1=$(python -c "print($R1GEN/2)")
14 | R1ERR2=$(python -c "print($R1GEN*0.75)")
15 | 
16 | echo "Potential genotyping errors from genotypes from only 1 read range from $R1ERR1 to $R1ERR2" 
17 | 
18 | R2H01S0=$(grep -oh '0[/|]1:2:[02]' $1 | wc -l) 
19 | R2H01S2=$(grep -oh '1[/|]0:2:[02]' $1 | wc -l) 
20 | R2H0S1=$(grep -oh '0[/|]0:2:' $1 | wc -l) 
21 | R2H1S1=$(grep -oh '1[/|]1:2:' $1 | wc -l)
22 | 
23 | R2GEN=$(python -c "print($R2H01S0+$R2H01S2+$R2H1S1+$R2H0S1)")
24 | R2ERR1=$(python -c "print($R2GEN/4)")
25 | R2ERR2=$(python -c "print($R2GEN* 0.5625)")
26 | 
27 | echo "Potential genotyping errors from genotypes from only 2 reads range from $R2ERR1 to $R2ERR2" 
28 | 
29 | R3H01S0=$(grep -oh '0[/|]1:3:[03]' $1 | wc -l) 
30 | R3H01S2=$(grep -oh '1[/|]0:3:[03]' $1 | wc -l) 
31 | R3H0S1=$(grep -oh '0[/|]0:3:' $1 | wc -l)
32 | R3H1S3=$(grep -oh '1[/|]1:3:' $1 | wc -l)
33 | 
34 | R3GEN=$(python -c "print($R3H01S0+$R3H01S2+$R3H0S1+$R3H1S3)")
35 | R3ERR1=$(python -c "print($R3GEN/8)")
36 | R3ERR2=$(python -c "print($R3GEN*0.42)")
37 | 
38 | echo "Potential genotyping errors from genotypes from only 3 reads range from $R3ERR1 to $R3ERR2"
39 | 
40 | R4H0=$(grep -oh '0[/|]0:4:' $1 | wc -l)
41 | R4H1=$(grep -oh '1[/|]1:4:' $1 | wc -l)
42 | 
43 | R4GEN=$(python -c "print($R4H0+$R4H1)")
44 | R4ERR1=$(python -c "print($R4GEN/16)")
45 | R4ERR2=$(python -c "print($R4GEN*0.316)")
46 | 
47 | echo "Potential genotyping errors from genotypes from only 4 reads range from $R4ERR1 to $R4ERR2"
48 | 
49 | R5H0=$(grep -oh '0[/|]0:5:' $1 | wc -l)
50 | R5H1=$(grep -oh '1[/|]1:5:' $1 | wc -l)
51 | 
52 | R5GEN=$(python -c "print($R5H0+$R5H1)")
53 | R5ERR1=$(python -c "print($R5GEN/32)")
54 | R5ERR2=$(python -c "print(int(	$R5GEN*0.237))")
55 | 
56 | echo "Potential genotyping errors from genotypes from only 5 reads range from $R5ERR1 to $R5ERR2"
57 | 
58 | IND=$(mawk '/#/' $1 | tail -1 | wc -w)
59 | IND=$(($IND - 9))
60 | LOCI=$(mawk '!/#/' $1 | wc -l)
61 | MISSING=$(grep -Fwo ./.:. $1 | wc -l)
62 | GENO=$(( $IND * $LOCI  ))
63 | 
64 | echo $IND "number of individuals and" $LOCI "equals" $GENO "total genotypes"
65 | GENO=$(( $GENO - $MISSING))
66 | echo Total genotypes not counting missing data $GENO
67 | 
68 | TOTERR1=$(python -c "print($R1ERR1+$R2ERR1+$R3ERR1+$R4ERR1+$R5ERR1)")
69 | TOTERR2=$(python -c "print($R1ERR2+$R2ERR2+$R3ERR2+$R4ERR2+$R5ERR2)")
70 | ERRRATEL=$(python -c "print($TOTERR1/float($GENO))")
71 | ERRRATEH=$(python -c "print($TOTERR2/float($GENO))")
72 | 
73 | echo "Total potential error rate is between $ERRRATEL and $ERRRATEH"
74 | 
75 | ALL=$(($R1GEN+$R2GEN+$R3GEN+$R4GEN+$R5GEN))
76 | ERRALL=$(python -c "print($ALL/float($GENO))")
77 | 
78 | echo "SCORCHED EARTH SCENARIO"
79 | echo "WHAT IF ALL LOW DEPTH HOMOZYGOTE GENOTYPES ARE ERRORS?????"
80 | echo "The total SCORCHED EARTH error rate is $ERRALL."
81 | 


--------------------------------------------------------------------------------
/scripts/RefMapOpt.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | export LC_ALL=en_US.UTF-8
  3 | export SHELL=bash
  4 | v="2.9.6"
  5 | 
  6 | 
  7 | if [[ -z "$7" ]]; then
  8 | echo "Usage is RefMapOpt minK1 maxK1 minK2 maxK2 cluster_similarity Assembly_Type Num_of_Processors optional_list_of_individuals"
  9 | exit 1
 10 | fi
 11 | 
 12 | echo "Checking for required software"
 13 | DEP=(freebayes mawk bwa samtools vcftools rainbow gnuplot seqtk cd-hit-est bamToBed bedtools parallel vcfcombine pearRM fastp)
 14 | NUMDEP=0
 15 | for i in "${DEP[@]}"
 16 | do
 17 | 	if which $i &> /dev/null; then
 18 | 		foo=0
 19 | 	else
 20 |     		echo "The dependency" $i "is not installed or is not in your" '$PATH'"."
 21 |     		NUMDEP=$((NUMDEP + 1))
 22 | 	fi
 23 | done
 24 | 
 25 | SAMV1=$(samtools 2>&1 >/dev/null | grep Ver | sed -e 's/Version://' | cut -f2 -d " " | sed -e 's/-.*//' | cut -f1 -d ".")
 26 | SAMV2=$(samtools 2>&1 >/dev/null | grep Ver | sed -e 's/Version://' | cut -f2 -d " " | sed -e 's/-.*//' | cut -f2 -d ".")
 27 | 	if [ "$SAMV1"  -ge "1" ]; then
 28 | 		if [ "$SAMV2"  -lt "3" ]; then
 29 |         	echo "The version of Samtools installed in your" '$PATH' "is not optimized for dDocent."
 30 |         	echo "Please install at least version 1.3.0"
 31 | 			echo -en "\007"
 32 | 			echo -en "\007"
 33 | 			exit 1
 34 | 		fi
 35 | 	
 36 | 	else
 37 | 		    echo "The version of Samtools installed in your" '$PATH' "is not optimized for dDocent."
 38 |         	echo "Please install at least version 1.3.0"
 39 | 			echo -en "\007"
 40 | 			echo -en "\007"
 41 | 			exit 1
 42 | 	fi
 43 | 	
 44 | RAINV=(`rainbow | head -1 | cut -f2 -d' ' `)	
 45 | 	if [[ "$RAINV" != "2.0.2" && "$RAINV" != "2.0.3" && "$RAINV" != "2.0.4" ]]; then
 46 |         	echo "The version of Rainbow installed in your" '$PATH' "is not optimized for dDocent."
 47 |         	echo -en "\007"
 48 | 			echo -en "\007"
 49 | 			echo -en "\007"
 50 |         	echo "Is the version of rainbow installed newer than 2.0.2?  Enter yes or no."
 51 | 			read TEST
 52 | 			if [ "$TEST" != "yes" ]; then 
 53 |         		echo "Please install a version newer than 2.0.2"
 54 |         		exit 1
 55 |         	fi
 56 |         fi
 57 | FREEB=(`freebayes | grep -oh 'v[0-9].*' | cut -f1 -d "." | sed -e 's/v//' `)	
 58 | 	if [ "$FREEB" != "1" ]; then
 59 |         	echo "The version of FreeBayes installed in your" '$PATH' "is not optimized for dDocent."
 60 |         	echo "Please install at least version 1.0.0"
 61 |         	exit 1
 62 |         fi  
 63 | SEQTK=( `seqtk 2>&1  | grep Version | cut -f2 -d ":" |  sed -e 's/1.[0-9]-r//g' | sed -e 's/-dirty//g' `)
 64 | 	if [ "$SEQTK" -lt "102" ]; then
 65 | 		echo "The version of seqtk installed in your" '$PATH' "is not optimized for dDocent."
 66 |         	echo "Please install at least version 1.2-r102-dirty"
 67 |         	exit 1
 68 | 	fi
 69 | 	
 70 | VCFTV=$(vcftools | grep VCF | grep -oh '[0-9]*[a-z]*)$' | sed -e 's/[a-z)]//')
 71 | 	if [ "$VCFTV" -lt "10" ]; then
 72 |         	echo "The version of VCFtools installed in your" '$PATH' "is not optimized for dDocent."
 73 |         	echo "Please install at least version 0.1.11"
 74 |         	exit 1
 75 |         elif [ "$VCFTV" == "11" ]; then
 76 |                 VCFGTFLAG="--geno" 
 77 |         elif [ "$VCFTV" -ge "12" ]; then
 78 |                 VCFGTFLAG="--max-missing"
 79 | 	fi
 80 | BWAV=$(bwa 2>&1 | mawk '/Versi/' | sed -e 's/Version: //g' | sed -e 's/0.7.//g' | sed -e 's/-.*//g' | cut -c 1-2)
 81 | 	if [ "$BWAV" -lt "13" ]; then
 82 |         	echo "The version of bwa installed in your" '$PATH' "is not optimized for dDocent."
 83 |         	echo "Please install at least version 0.7.13"
 84 |         	exit 1
 85 | 	fi
 86 | 
 87 | BTC=$( bedtools --version | mawk '{print $2}' | sed -e 's/v//g' | cut -f1,2 -d"." | sed -e 's/2\.//g' )
 88 | 	if [ "$BTC" -ge "26" ]; then
 89 | 		BEDTOOLSFLAG="NEW"
 90 | 		elif [ "$BTC" == "23" ]; then
 91 | 		BEDTOOLSFLAG="OLD"
 92 | 		elif [ "$BTC" != "23" ]; then
 93 | 		echo "The version of bedtools installed in your" '$PATH' "is not optimized for dDocent."
 94 | 		echo "Please install version 2.23.0 or version 2.26.0 and above"
 95 | 		exit 1	
 96 | 	fi
 97 | 	
 98 | FASTP=$(fastp -v 2>&1 | cut -f2 -d " ")
 99 | FASTP1=$(echo $FASTP | cut -f1 -d ".")
100 | FASTP2=$(echo $FASTP | cut -f2 -d ".")
101 | FASTP3=$(echo $FASTP | cut -f3 -d ".")
102 | 	if [ "$FASTP1" -lt "2" ]; then
103 | 		if [ "$FASTP2" -lt "20" ]; then
104 | 			if [ "$FASTP2" -lt "5" ]; then
105 | 				echo "The version of fastp installed in your" '$PATH' "is not optimized for dDocent."
106 | 				echo "Please install version 0.19.5 or above"
107 | 				exit 1
108 | 			fi
109 | 		fi
110 | 	fi
111 | 	
112 | if ! sort --version | fgrep GNU &>/dev/null; then
113 | 	sort=gsort
114 | else
115 | 	sort=sort
116 | fi
117 | 
118 | if [ $NUMDEP -gt 0 ]; then
119 | 	echo -e "\nPlease install all required software before running RefMapOpt again."
120 | 	exit 1
121 | else
122 | 	echo -e "\nAll required software is installed!"
123 | fi
124 | 
125 | simC=$5
126 | 
127 | ATYPE=$6
128 | 
129 | if [[ $ATYPE != "SE" && $ATYPE != "PE" && $ATYPE != "OL" && $ATYPE != "HYB" && $ATYPE != "ROL" && $ATYPE != "RPE" ]]; then
130 | echo "Usage is RefMapOpt minK1 maxK1 minK2 maxK2 cluster_similarity Assembly_Type Num_of_Processors optional_list_of_individuals"
131 | echo "Please make sure to choose assembly type."
132 | exit 1
133 | fi
134 | 
135 | NUMProc=$7
136 | ls *.F.fq.gz > namelist
137 | sed -i'' -e 's/.F.fq.gz//g' namelist
138 | NAMES=( `cat "namelist" `)
139 | 
140 | echo -e "\ndDocent RefMapOpt version $v"
141 | 
142 | #This code checks for trimmed sequence files
143 | TEST=$(ls *.R1.fq.gz 2> /dev/null | wc -l )
144 | if [ "$TEST" -gt 0 ]; then
145 | 	echo -e "\nTrimmed sequences found, proceeding with optimization."
146 | else
147 | 	echo -e "\nRefMapOpt.sh requires that you have trimmed sequence files.\nPlease include trimmed sequence files with the .R1.fq.gz and .R2.fq.gz naming convention."
148 | 	echo "dDocent will create these for you"
149 | 	echo "Please rerun RefMapOpt.sh after trimming sequence files"
150 | exit 1
151 | fi
152 | 
153 | 
154 | Reference(){
155 | 
156 | CUTOFF=$1
157 | CUTOFF2=$2
158 | simC=$3
159 | 
160 | AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
161 | AWK2='!/>/'
162 | AWK3='!/NNN/'
163 | AWK4='{for(i=0;i<$1;i++)print}'
164 | PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'
165 | SED1='s/^[ \t]*//'
166 | SED2='s/\s/\t/g'
167 | FRL=$(gunzip -c ${NAMES[0]}.F.fq.gz | mawk '{ print length() | "sort -rn" }' | head -1)
168 | 
169 | special_uniq(){
170 | 	mawk -v x=$1 '$1 >= x' $2  |cut -f2 | sed -e 's/NNNNNNNNNN/	/g' | cut -f1 | uniq
171 | }
172 | export -f special_uniq
173 | 
174 | if [ ${NAMES[@]:(-1)}.F.fq.gz -nt ${NAMES[@]:(-1)}.uniq.seqs ]; then
175 | 	if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
176 | 	#If PE assembly, creates a concatenated file of every unique for each individual in parallel
177 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
178 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
179 | 		if [ "$ATYPE" = "RPE" ]; then
180 | 			cat namelist | parallel --no-notice -j $NUMProc "paste {}.forward {}.reverse | $sort -k1 -S 200M > {}.fr"
181 | 			cat namelist | parallel --no-notice -j $NUMProc "cut -f1 {}.fr | uniq -c > {}.f.uniq && cut -f2 {}.fr > {}.r"
182 | 			cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK4' {}.f.uniq > {}.f.uniq.e" 
183 | 			cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.f.uniq.e {}.r | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | sed -e '$SED1' | sed -e '$SED2'> {}.uniq.seqs"
184 | 			rm *.f.uniq.e *.f.uniq *.r *.fr
185 | 		else
186 | 			cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
187 | 		fi
188 | 		rm *.forward
189 | 		rm *.reverse
190 | 	fi
191 | 	
192 | 	if [ "$ATYPE" == "SE" ]; then
193 | 	#if SE assembly, creates files of every unique read for each individual in parallel
194 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
195 | 	fi
196 | 	
197 | 	if [ "$ATYPE" == "OL" ]; then
198 | 	#If OL assembly, dDocent assumes that the marjority of PE reads will overlap, so the software PEAR is used to merge paired reads into single reads
199 | 		for i in "${NAMES[@]}"; do
200 | 			gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
201 | 		done	
202 | 		MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
203 | 		LENGTH=$(( $MaxLen / 3))
204 | 		for i in "${NAMES[@]}";	do
205 | 			pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH 
206 | 		done
207 | 		cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
208 | 	fi
209 | 	if [ "$ATYPE" == "HYB" ]; then
210 | 	#If HYB assembly, dDocent assumes some PE reads will overlap but that some will not, so the OL method performed and remaining reads are then put through PE method
211 | 		for i in "${NAMES[@]}"; do
212 |       		gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
213 | 		done	
214 | 		MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
215 | 		LENGTH=$(( $MaxLen / 3))
216 | 		for i in "${NAMES[@]}";	do
217 | 			pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH &>kopt.log
218 | 		done
219 | 		cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
220 | 		
221 | 		cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.forward.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
222 | 		cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.reverse.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
223 | 		cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.ua.seqs"
224 | 		rm *.forward
225 | 		rm *.reverse
226 | 	fi	
227 | 	
228 | fi
229 | 
230 | #Create a data file with the number of unique sequences and the number of occurrences
231 | 
232 | if [ -f "uniq.seqs.gz" ]; then
233 | 	if [ uniq.seqs.gz -nt uniq.seqs ]; then
234 | 		gunzip uniq.seqs.gz 2>/dev/null
235 | 	fi
236 | fi
237 | 
238 | if [ ! -f "uniq.seqs" ]; then
239 | 	cat *.uniq.seqs > uniq.seqs
240 | fi
241 | 	
242 | if [[ -z $CUTOFF || -z $CUTOFF2 ]]; then
243 | 	getAssemblyInfo
244 | fi
245 | 
246 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
247 |   	parallel --no-notice -j $NUMProc --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs  | $sort --parallel=$NUMProc -S 2G | uniq -c > uniqCperindv
248 | else
249 | 	parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
250 | fi
251 | 
252 | #Now that data cutoffs have been chosen, reduce data set to specified set of unique reads, convert to FASTA format,
253 | #and remove reads with substantial amounts of adapters
254 | 
255 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
256 |   parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | sed -e 's/NNNNNNNNNN/-/' >  total.uniqs
257 |   cut -f 1 -d "-" total.uniqs > total.u.F
258 |   cut -f 2 -d "-" total.uniqs > total.u.R
259 |   paste total.u.F total.u.R | $sort -k1 --parallel=$NUMProc -S 2G > total.fr
260 |  
261 |   parallel --no-notice --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs  | $sort --parallel=$NUMProc -S 2G | uniq -c > total.f.uniq
262 |   join -1 2 -2 1 -o 1.1,1.2,2.2 total.f.uniq total.fr | mawk '{print $1 "\t" $2 "NNNNNNNNNN" $3}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
263 |   rm total.uniqs total.u.* total.fr total.f.uniq* 
264 |   
265 | else
266 | 	parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
267 | fi
268 | $sort -k1 -r -n --parallel=$NUMProc -S 2G uniq.k.$CUTOFF.c.$CUTOFF2.seqs |cut -f2 > totaluniqseq
269 | mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq > uniq.full.fasta
270 | LENGTH=$(mawk '!/>/' uniq.full.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
271 | LENGTH=$(($LENGTH * 3 / 4))
272 | seqtk seq -F I uniq.full.fasta > uniq.fq
273 | if [ "$NUMProc" -gt 8 ]; then
274 | 	NP=8
275 | else
276 | 	NP=$NUMProc
277 | fi
278 | MaxLen=$(mawk '!/>/' uniq.full.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
279 | fastp -i uniq.fq -o uniq.fq1 -w $NP -Q -l $MaxLen &>/dev/null
280 | mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.fq1 | paste - - | sort -k1,1 -V | tr "\t" "\n" > uniq.fasta
281 | mawk '!/>/' uniq.fasta > totaluniqseq
282 | rm uniq.fq*
283 | 
284 | if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
285 | 	pmerge(){
286 | 		num=$( echo $1 | sed -e 's/^0*//g')
287 | 		if [ "$num" -le 100 ]; then
288 | 			j=$num
289 | 			k=$(($num -1))
290 | 		else
291 | 			num=$(($num - 99))
292 |            		j=$(python -c "print ("$num" * 100)")
293 |                 	k=$(python -c "print ("$j" - 100)")
294 | 		fi
295 | 		mawk -v x="$j" -v y="$k" '$5 <= x && $5 > y'  rbdiv.out > rbdiv.out.$1
296 | 	   
297 | 	   	if [ -s "rbdiv.out.$1" ]; then
298 | 			rainbow merge -o rbasm.out.$1 -a -i rbdiv.out.$1 -r 2 -N10000 -R10000 -l 20 -f 0.75
299 | 		fi
300 | 	}
301 | 	export -f pmerge
302 | 
303 | 	#Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
304 | 	if [ "$ATYPE" == "PE" ]; then
305 | 		sed -e 's/NNNNNNNNNN/	/g' uniq.fasta | cut -f1 > uniq.F.fasta
306 | 		CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
307 | 		cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
308 | 		mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
309 | 		paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
310 | 	else
311 | 		sed -e 's/NNNNNNNNNN/	/g' totaluniqseq | cut -f1 | $sort --parallel=$NUMProc -S 2G| uniq | mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' > uniq.F.fasta
312 | 		CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
313 | 		cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
314 | 		mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
315 | 		paste sort.contig.cluster.ids <(mawk '!/>/' uniq.F.fasta) > contig.cluster.Funiq
316 | 		sed -e 's/NNNNNNNNNN/	/g' totaluniqseq | $sort --parallel=$NUMProc -k1 -S 2G | mawk '{print $0 "\t" NR}'  > totaluniqseq.CN
317 | 		join -t $'\t' -1 3 -2 1 contig.cluster.Funiq totaluniqseq.CN -o 2.3,1.2,2.1,2.2 > contig.cluster.totaluniqseq
318 | 	fi	
319 | 	
320 | 	#CD-hit output is converted to rainbow format
321 | 	$sort -k2,2 -g contig.cluster.totaluniqseq -S 2G --parallel=$NUMProc | sed -e 's/NNNNNNNNNN/	/g' > rcluster
322 | 	rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
323 | 	CLUST=(`tail -1 rbdiv.out | cut -f5`)
324 | 	CLUST1=$(( $CLUST / 100 + 1))
325 | 	CLUST2=$(( $CLUST1 + 100 ))
326 | 	
327 | 	seq -w 1 $CLUST2 | parallel --no-notice -j $NUMProc --env pmerge pmerge {}
328 | 	
329 | 	cat rbasm.out.[0-9]* > rbasm.out
330 | 	rm rbasm.out.[0-9]* rbdiv.out.[0-9]*
331 | 
332 | 	#This AWK code replaces rainbow's contig selection perl script
333 | 	LENGTH=$(cut -f3 rbdiv.out |mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
334 | 
335 | 	LENGTH=$(( $LENGTH * 11 / 10 ))
336 | 
337 | 	cat rbasm.out <(echo "E") |sed -e 's/[0-9]*:[0-9]*://g' | mawk -v mlen=$LENGTH  '{
338 | 		if (NR == 1) e=$2;
339 | 		else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_A_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
340 | 		else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
341 | 		else if ($1 ~/C/) clus=$2;
342 | 		else if ($1 ~/L/) len=$2;
343 | 		else if ($1 ~/S/) seq=$2;
344 | 		else if ($1 ~/N/) freq=$2;
345 | 		else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
346 | 		else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~/^R 0/ && len <= mlen) {seq1 = seq; fclus=clus;lenf=len}
347 | 		else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~!/^R 0/ && len > mlen) {seq1 = seq; fclus=clus; len1=len}
348 | 		else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~!/^R 0/ && len <= mlen) {seq1 = seq; fclus=clus; lenf=len}
349 | 		else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
350 | 		}' > rainbow.fasta
351 | 
352 | 
353 | 	seqtk seq -r rainbow.fasta > rainbow.RC.fasta
354 | 	mv rainbow.RC.fasta rainbow.fasta
355 | 
356 | 	#The rainbow assembly is checked for overlap between newly assembled Forward and Reverse reads using the software PEAR
357 | 
358 | 	grep -A1 "dDocent_A_Contig_" rainbow.fasta | mawk '!/^--/' | sed -e 's/dDocent_A_Contig_/dDocent_Contig_/g' > rainbow.asm.fasta
359 | 	grep -A1 "dDocent_Contig_" rainbow.fasta | mawk '!/^--/' > rainbow.n.fasta
360 | 
361 | 	sed -e 's/NNNNNNNNNN/	/g' rainbow.asm.fasta | cut -f1 | seqtk seq -F I - > ref.F.fq
362 | 	sed -e 's/NNNNNNNNNN/	/g' rainbow.asm.fasta | cut -f2 | seqtk seq -F I - > ref.R.fq
363 | 
364 | 	seqtk seq -r ref.R.fq > ref.RC.fq
365 | 	mv ref.RC.fq ref.R.fq
366 | 	LENGTH=$(mawk '!/>/' rainbow.fasta | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
367 | 	LENGTH=$(( $LENGTH * 5 / 4))
368 | 
369 | 	pearRM -f ref.F.fq -r ref.R.fq -o overlap -p 0.001 -j $NUMProc -n $LENGTH &>kopt.log
370 | 
371 | 	rm ref.F.fq ref.R.fq
372 | 
373 | 	mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.assembled.fastq > overlap.fasta
374 | 	mawk '/>/' overlap.fasta > overlap.loci.names
375 | 	mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.forward.fastq > other.F
376 | 	mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.reverse.fastq > other.R
377 | 	paste other.F other.R | mawk '{if ($1 ~ />/) print $1; else print $0}' | sed -e 's/	/NNNNNNNNNN/g' > other.FR
378 | 
379 | 	cat other.FR overlap.fasta rainbow.n.fasta > totalover.fasta
380 | 	paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/	/\'$'\n/g' > totalover.s.fasta
381 | 	mv totalover.s.fasta totalover.fasta
382 | 	rm *.F *.R
383 | fi
384 | 
385 | if [[ "$ATYPE" == "HYB" ]];then
386 | 	parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.ua.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs
387 | 	AS=$(cat uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs | wc -l)
388 | 	if [ "$AS" -gt 1 ]; then
389 | 		cut -f2 uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs > totaluniqseq.ua
390 | 		mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq.ua > uniq.full.ua.fasta
391 | 		LENGTH=$(mawk '!/>/' uniq.full.ua.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
392 | 		LENGTH=$(($LENGTH * 3 / 4))
393 | 		seqtk seq -F I uniq.full.ua.fasta > uniq.ua.fq
394 | 		if [ "$NUMProc" -gt 8 ]; then
395 | 			NP=8
396 | 		else
397 | 			NP=$NUMProc
398 | 		fi
399 | 		MaxLen=$(mawk '!/>/' uniq.full.ua.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
400 | 		fastp -i uniq.ua.fq -o uniq.ua.fq1 -w $NP -Q -l $MaxLen &>/dev/null
401 | 		mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.ua.fq1 > uniq.ua.fasta
402 | 		mawk '!/>/' uniq.ua.fasta > totaluniqseq.ua
403 | 		rm uniq.ua.fq*
404 | 		#Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
405 | 		sed -e 's/NNNNNNNNNN/	/g' uniq.ua.fasta | cut -f1 > uniq.F.ua.fasta
406 | 		CDHIT=$(python -c "print(max("$simC" - 0.1,0.8))")
407 | 		cd-hit-est -i uniq.F.ua.fasta -o xxx -c $CDHIT -T 0 -M 0 -g 1 -d 100 &>cdhit.log
408 | 		mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids.ua
409 | 		paste sort.contig.cluster.ids.ua totaluniqseq.ua > contig.cluster.totaluniqseq.ua
410 | 		$sort -k2,2 -g -S 2G --parallel=$NUMProc contig.cluster.totaluniqseq.ua | sed -e 's/NNNNNNNNNN/	/g' > rcluster.ua
411 | 		#CD-hit output is converted to rainbow format
412 | 		rainbow div -i rcluster.ua -o rbdiv.ua.out -f 0.5 -K 10
413 | 		if [ "$ATYPE" == "PE" ]; then
414 | 			rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
415 | 		else
416 | 			rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
417 | 		fi
418 | 		
419 | 		#This AWK code replaces rainbow's contig selection perl script
420 | 		cat rbasm.ua.out <(echo "E") |sed -e 's/[0-9]*:[0-9]*://g' | mawk ' {
421 | 			if (NR == 1) e=$2;
422 | 			else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
423 | 			else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
424 | 			else if ($1 ~/C/) clus=$2;
425 | 			else if ($1 ~/L/) len=$2;
426 | 			else if ($1 ~/S/) seq=$2;
427 | 			else if ($1 ~/N/) freq=$2;
428 | 			else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
429 | 			else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
430 | 			else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
431 | 			}' > rainbow.ua.fasta
432 | 	
433 | 		seqtk seq -r rainbow.ua.fasta > rainbow.RC.fasta
434 | 		mv rainbow.RC.fasta rainbow.ua.fasta
435 | 	
436 | 		cat rainbow.ua.fasta uniq.fasta > totalover.fasta
437 | 		paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/	/\'$'\n/g' > totalover.s.fasta
438 | 		mv totalover.s.fasta totalover.fasta
439 | 	fi
440 | fi
441 | 
442 | if [[ "$ATYPE" != "PE" && "$ATYPE" != "RPE" && "$ATYPE" != "HYB" ]]; then
443 | 	cp uniq.fasta totalover.fasta
444 | 	paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/	/\'$'\n/g' > totalover.s.fasta
445 | 	mv totalover.s.fasta totalover.fasta
446 | fi
447 | cd-hit-est -i totalover.fasta -o reference.fasta.original -M 0 -T 0 -c $simC &>cdhit2.log
448 | 
449 | sed -e 's/^C/NC/g' -e 's/^A/NA/g' -e 's/^G/NG/g' -e 's/^T/NT/g' -e 's/T$/TN/g' -e 's/A$/AN/g' -e 's/C$/CN/g' -e 's/G$/GN/g' reference.fasta.original > reference.fasta
450 | 
451 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
452 | 	sed -i 's/dDocent/dDocentR/g' reference.fasta
453 | fi
454 | 
455 | samtools faidx reference.fasta &> index.log
456 | bwa index reference.fasta >> index.log 2>&1
457 | 
458 | SEQS=$(mawk 'END {print NR}' uniq.k.$CUTOFF.c.$CUTOFF2.seqs)
459 | TIGS=$(grep ">" -c reference.fasta)
460 | 
461 | echo -e "\ndDocent assembled $SEQS sequences (after cutoffs) into $TIGS contigs"
462 | 
463 | }
464 | 
465 | 
466 | ls -S *.F.fq.gz > namelist
467 | sed -i'' -e 's/.F.fq.gz//g' namelist
468 | 
469 | if [[ -z "$8" ]]; then
470 | NUMIND=$(cat namelist | wc -l)
471 | NUM1=$(($NUMIND / 10))
472 | NUM2=$(($NUMIND - $NUM1))
473 | NUM3=$(($NUM2 - $NUM1))
474 | cat namelist | head -$NUM2 | tail -$NUM3 > newlist
475 | NAMESR=( `cat "newlist" `)
476 | LEN=${#NAMESR[@]}
477 | LEN=$(($LEN - 1))
478 | echo "5" >randlist
479 | 	for ((rr = 1; rr<=50; rr++));
480 | 	do
481 | 	INDEX=$[ 1 + $[ RANDOM % $LEN ]]
482 | 		if grep -q ${NAMESR[$INDEX]} randlist;
483 | 		then x=x
484 | 		else
485 | 	echo ${NAMESR[$INDEX]} >> randlist
486 | 		fi
487 | 	done
488 | 
489 | RANDNAMES=( `mawk '!/^5$/' "randlist" | head -21 `)
490 | else
491 | RANDNAMES=( `cat "$8" `)
492 | fi
493 | 
494 | rm lengths.txt &> /dev/null
495 | for k in "${RANDNAMES[@]}";
496 | 	do
497 | 	if [ -f "$k.R.fq.gz" ]; then
498 | 		gunzip -c $k.R.fq.gz | head -2 | tail -1 >> lengths.txt
499 | 	fi
500 | 	done	
501 | 
502 | 
503 | rm rand.proc 2>/dev/null
504 | 
505 | for k in "${RANDNAMES[@]}"
506 | do
507 | 	echo $k >> rand.proc
508 | done
509 | 
510 | echo -e "Mean_Coverage_PerContig\tMeanCoverage_PerContig_NoZero\tContigs\tMean_NumberOfContigs_Mapped\tK1\tK2\tSUM_Mapped\tSUM_Properly\tMean_Mapped\tMean_Properly\tMean_MisMatched" > mapping.results
511 | 
512 | for ((r = $1; r <= $2; r++));
513 | do
514 | 	for ((j = $3; j <= $4; j++));
515 | 	do
516 | 	PP=$(($r + $j))
517 | 	if [ "$PP" != "2" ]; then
518 | 		Reference $r $j $simC
519 |     	#BWA for mapping for all samples
520 |     	rm $r.$j.results 2>/dev/null
521 |     	
522 |     	map_reads(){	
523 |     	r=$2;j=$3
524 | 		if [[ "$ATYPE" == "HYB"  || "$ATYPE" == "ROL" || "$ATYPE" == "RPE" ]]; then
525 | 			if [ -f "$1.R2.fq.gz" ]; then
526 | 				bwa mem -L 20,5 -t 8 -a -M -T 10 -A1 -B 3 -O 5 -R "@RG\tID:$1\tSM:$1\tPL:Illumina" reference.fasta $1.R1.fq.gz $1.R2.fq.gz  2> bwa.$1.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@4 -q 1 -SbT reference.fasta - > $1.bam
527 | 			else
528 | 				bwa mem -L 20,5 -t 8 -a -M -T 10 -A1 -B 3 -O 5 -R "@RG\tID:$1\tSM:$1\tPL:Illumina" reference.fasta $1.R1.fq.gz 2> bwa.$1.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@4 -q 1 -SbT reference.fasta - > $1.bam
529 | 			fi
530 | 		else
531 | 			if [ -f "$1.R2.fq.gz" ]; then
532 | 			
533 |     			INSERT=$(mawk '!/^>/' reference.fasta | mawk '{ print length() }' | mawk '{ sum += $1; n++ } END { if (n > 0) print int(sum / n); }')
534 |     			INSERTH=$(($INSERT + 100 ))
535 |     			INSERTL=$(($INSERT - 100 ))
536 |     			SD=$(($INSERT / 5))
537 | 			
538 |     			bwa mem -L 20,5 -I $INSERT,$SD,$INSERTH,$INSERTL -t 8 -a -M -T 10 -A 1 -B 3 -O 5 -R "@RG\tID:$1\tSM:$1\tPL:Illumina" reference.fasta $1.R1.fq.gz $1.R2.fq.gz 2> bwa.$1.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@4 -q 1 -SbT reference.fasta - > $1.bam
539 |     			
540 | 			else
541 |     			
542 | 			bwa mem -L 20,5 -t 8 -a -M -T 10 -A 1 -B 3 -O 5 -R "@RG\tID:$1\tSM:$1\tPL:Illumina" reference.fasta $1.R1.fq.gz 2> bwa.$1.log | mawk '!/\t[2-9].[SH].*/' | mawk '!/[2-9].[SH]\t/' | samtools view -@4 -q 1 -SbT reference.fasta - > $1.bam
543 |     			fi
544 |     		fi
545 | 		
546 | 		samtools sort -@4 $1.bam -o $1.bam 2> /dev/null
547 | 		samtools index $1.bam
548 |     	Mappings=$(samtools flagstat $1.bam | grep -E 'mapped \(|properly' | cut -f1 -d '+' | tr -d '\n')
549 |     	Contigs_Mapped=$(samtools idxstats $1.bam | mawk '$3 > 0' | wc -l)
550 |     	Mean_Mapped_PerContig=$(samtools idxstats $1.bam | mawk '{ sum += $3; n++ } END { if (n > 0) print sum / n; }')
551 | 	Mean_Mapped_PerContig_NoZero=$(samtools idxstats $1.bam | mawk '$3 >0' | mawk '{ sum += $3; n++ } END { if (n > 0) print sum / n; }')
552 | 	Discordant=$(samtools flagstat $1.bam | grep mapQ | cut -f1 -d ' ')
553 | 	echo -e "$Mappings\t$Contigs_Mapped\t$Mean_Mapped_PerContig\t$Mean_Mapped_PerContig_NoZero\t$Discordant" >> $r.$j.results
554 |     	}
555 |     	export -f map_reads
556 |     	cat rand.proc | parallel --no-notice -j $NUMProc --env map_reads map_reads {} $r $j
557 |     	SUM_Mappings=$(mawk '{ sum+=$1} END {print sum}' $r.$j.results)
558 |     	SUM_Properly_Paired=$(mawk '{ sum+=$2} END {print sum}' $r.$j.results)
559 |     	Mean_Mapping=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
560 |     	Mean_Properly_Paired=$(mawk '{ sum += $2; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
561 |     	Meta_Mean_Mapped_PerContig=$(mawk '{ sum += $3; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
562 |     	Contigs=$(mawk '/>/' reference.fasta | wc -l)
563 | 	Mean_Contigs_Mapped=$(mawk '{ sum += $4; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
564 | 	Meta_Mean_Mapped_PerContig=$(mawk '{ sum += $5; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
565 | 	Meta_Mean_Mapped_PerContig_NoZero=$(mawk '{ sum += $6; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
566 |  	Mean_Discordant=$(mawk '{ sum += $7; n++ } END { if (n > 0) print sum / n; }' $r.$j.results)
567 | 	echo -e "$Meta_Mean_Mapped_PerContig\t$Meta_Mean_Mapped_PerContig_NoZero\t$Contigs\t$Mean_Contigs_Mapped\t$r\t$j\t$SUM_Mappings\t$SUM_Properly_Paired\t$Mean_Mapping\t$Mean_Properly_Paired\t$Mean_Discordant" >> mapping.results
568 | 	fi
569 | 	done
570 |     		
571 | done
572 | 


--------------------------------------------------------------------------------
/scripts/ReferenceOpt.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | export LC_ALL=en_US.UTF-8
  3 | export SHELL=bash
  4 | v="2.9.5"
  5 | 
  6 | if [[ -z "$6" ]]; then
  7 | echo "Usage is sh ReferenceOpt.sh minK1 maxK1 minK2 maxK2 Assembly_Type Number_of_Processors"
  8 | echo -e "\n\n"
  9 | echo "Optionally, a new range of similarities can be entered as well:"
 10 | echo "ReferenceOpt.sh minK1 maxK1 minK2 maxK2 Assembly_Type Number_of_Processors minSim maxSim increment"
 11 | echo -e "\nFor example, to scale between 0.95 and 0.99 using 0.005 increments:\nReferenceOpt.sh minK1 maxK1 minK2 maxK2 Assembly_Type Number_of_Processors 0.95 0.99 0.005"
 12 | exit
 13 | fi
 14 | 
 15 | if ! sort --version | fgrep GNU &>/dev/null; then
 16 | 	sort=gsort
 17 | else
 18 | 	sort=sort
 19 | fi
 20 | 
 21 | DEP=(mawk samtools rainbow gnuplot seqtk cd-hit-est parallel pearRM fastp)
 22 | NUMDEP=0
 23 | for i in "${DEP[@]}"
 24 | do
 25 | 	if which $i &> /dev/null; then
 26 | 		foo=0
 27 | 	else
 28 |     		echo "The dependency" $i "is not installed or is not in your" '$PATH'"."
 29 |     		NUMDEP=$((NUMDEP + 1))
 30 | 	fi
 31 | done
 32 | 
 33 | 
 34 | FASTP=$(fastp -v 2>&1 | cut -f2 -d " ")
 35 | FASTP1=$(echo $FASTP | cut -f1 -d ".")
 36 | FASTP2=$(echo $FASTP | cut -f2 -d ".")
 37 | FASTP3=$(echo $FASTP | cut -f3 -d ".")
 38 | 	if [ "$FASTP1" -lt "2" ]; then
 39 | 		if [ "$FASTP2" -lt "20" ]; then
 40 | 			if [ "$FASTP2" -lt "5" ]; then
 41 | 				echo "The version of fastp installed in your" '$PATH' "is not optimized for dDocent."
 42 | 				echo "Please install version 0.19.5 or above"
 43 | 				exit 1
 44 | 			fi
 45 | 		fi
 46 | 	fi
 47 | 
 48 | 
 49 | ATYPE=$5
 50 | NUMProc=$6
 51 | 
 52 | if [[ -z "$7" ]]; then
 53 | 
 54 | 	minSim=0.8
 55 | 	maxSim=0.98
 56 | 	incSim=0.02
 57 | else
 58 | 
 59 | 	minSim=$7 
 60 | 	maxSim=$8
 61 | 	incSim=$9
 62 | fi
 63 | 
 64 | if [ $NUMDEP -gt 0 ]; then
 65 | 	echo -e "\nPlease install all required software before running ReferenceOpt again."
 66 | 	exit 1
 67 | else
 68 | 	echo -e "\nAll required software is installed!"
 69 | fi
 70 | 
 71 | echo -e "\ndDocent ReferenceOpt version $v"
 72 | 
 73 | ls *.F.fq.gz > namelist
 74 | sed -i'' -e 's/.F.fq.gz//g' namelist
 75 | NAMES=( `cat "namelist" `)
 76 | 
 77 | getAssemblyInfo(){
 78 | echo "nope!"
 79 | }
 80 | 
 81 | 
 82 | Reference(){
 83 | 
 84 | CUTOFF=$1
 85 | CUTOFF2=$2
 86 | simC=$3
 87 | 
 88 | AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
 89 | AWK2='!/>/'
 90 | AWK3='!/NNN/'
 91 | AWK4='{for(i=0;i<$1;i++)print}'
 92 | PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'
 93 | SED1='s/^[ \t]*//'
 94 | SED2='s/\s/\t/g'
 95 | FRL=$(gunzip -c ${NAMES[0]}.F.fq.gz | mawk '{ print length() | "sort -rn" }' | head -1)
 96 | 
 97 | special_uniq(){
 98 | 	mawk -v x=$1 '$1 >= x' $2  |cut -f2 | sed -e 's/NNNNNNNNNN/	/g' | cut -f1 | uniq
 99 | }
100 | export -f special_uniq
101 | 
102 | if [ ${NAMES[@]:(-1)}.F.fq.gz -nt ${NAMES[@]:(-1)}.uniq.seqs ];then
103 | 	if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
104 | 	#If PE assembly, creates a concatenated file of every unique for each individual in parallel
105 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
106 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
107 | 		if [ "$ATYPE" = "RPE" ]; then
108 | 			cat namelist | parallel --no-notice -j $NUMProc "paste {}.forward {}.reverse | $sort -k1 -S 200M > {}.fr"
109 | 			cat namelist | parallel --no-notice -j $NUMProc "cut -f1 {}.fr | uniq -c > {}.f.uniq && cut -f2 {}.fr > {}.r"
110 | 			cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK4' {}.f.uniq > {}.f.uniq.e" 
111 | 			cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.f.uniq.e {}.r | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | sed -e '$SED1' | sed -e '$SED2'> {}.uniq.seqs"
112 | 			rm *.f.uniq.e *.f.uniq *.r *.fr
113 | 		else
114 | 			cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
115 | 		fi
116 | 		rm *.forward
117 | 		rm *.reverse
118 | 	fi
119 | 	
120 | 	if [ "$ATYPE" == "SE" ]; then
121 | 	#if SE assembly, creates files of every unique read for each individual in parallel
122 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
123 | 	fi
124 | 	
125 | 	if [ "$ATYPE" == "OL" ]; then
126 | 	#If OL assembly, dDocent assumes that the marjority of PE reads will overlap, so the software PEAR is used to merge paired reads into single reads
127 | 		for i in "${NAMES[@]}";
128 |         		do
129 |         		gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
130 |         		done	
131 |         	MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
132 | 		LENGTH=$(( $MaxLen / 3))
133 | 		for i in "${NAMES[@]}"
134 | 			do
135 | 			pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH &>kopt.log
136 | 			done
137 | 		cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
138 | 	fi
139 | 	if [ "$ATYPE" == "HYB" ]; then
140 | 	#If HYB assembly, dDocent assumes some PE reads will overlap but that some will not, so the OL method performed and remaining reads are then put through PE method
141 | 		for i in "${NAMES[@]}";
142 |       		do
143 |       		gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
144 |       		done	
145 |     		MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
146 |     		LENGTH=$(( $MaxLen / 3))
147 | 		for i in "${NAMES[@]}"
148 | 			do
149 | 			pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH &>kopt.log
150 | 			done
151 | 		cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
152 | 		
153 | 		cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.forward.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
154 | 		cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.reverse.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
155 | 		cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.ua.seqs"
156 | 		rm *.forward
157 | 		rm *.reverse
158 | 	fi	
159 | 	
160 | fi
161 | 
162 | #Create a data file with the number of unique sequences and the number of occurrences
163 | 
164 | if [ -f "uniq.seqs.gz" ]; then
165 | 	if [ uniq.seqs.gz -nt uniq.seqs ]; then
166 | 	gunzip uniq.seqs.gz 2>/dev/null
167 | 	fi
168 | fi
169 | 
170 | if [ ! -f "uniq.seqs" ]; then
171 | 	cat *.uniq.seqs > uniq.seqs
172 | fi
173 | 	
174 | if [[ -z $CUTOFF || -z $CUTOFF2 ]]; then
175 | getAssemblyInfo
176 | fi
177 | 
178 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
179 |   	parallel --no-notice -j $NUMProc --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs  | $sort --parallel=$NUMProc -S 2G | uniq -c > uniqCperindv
180 | else
181 | 	parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
182 | fi
183 | 
184 | #Now that data cutoffs have been chosen, reduce data set to specified set of unique reads, convert to FASTA format,
185 | #and remove reads with substantial amounts of adapters
186 | 
187 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
188 |   parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | sed -e 's/NNNNNNNNNN/-/' >  total.uniqs
189 |   cut -f 1 -d "-" total.uniqs > total.u.F
190 |   cut -f 2 -d "-" total.uniqs > total.u.R
191 |   paste total.u.F total.u.R | $sort -k1 --parallel=$NUMProc -S 2G > total.fr
192 |  
193 |   parallel --no-notice --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs  | $sort --parallel=$NUMProc -S 2G | uniq -c > total.f.uniq
194 |   join -1 2 -2 1 -o 1.1,1.2,2.2 total.f.uniq total.fr | mawk '{print $1 "\t" $2 "NNNNNNNNNN" $3}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
195 |   rm total.uniqs total.u.* total.fr total.f.uniq* 
196 |   
197 | else
198 | 	parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
199 | fi
200 | $sort -k1 -r -n --parallel=$NUMProc -S 2G uniq.k.$CUTOFF.c.$CUTOFF2.seqs |cut -f2 > totaluniqseq
201 | #$sort -k1 -r -n uniq.k.$CUTOFF.c.$CUTOFF2.seqs | cut -f 2 > totaluniqseq
202 | mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq > uniq.full.fasta
203 | LENGTH=$(mawk '!/>/' uniq.full.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
204 | LENGTH=$(($LENGTH * 3 / 4))
205 | seqtk seq -F I uniq.full.fasta > uniq.fq
206 | if [ "$NUMProc" -gt 8 ]; then
207 | 	NP=8
208 | else
209 | 	NP=$NUMProc
210 | fi
211 | MaxLen=$(mawk '!/>/' uniq.full.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
212 | fastp -i uniq.fq -o uniq.fq1 -w $NP -Q -l $MaxLen &>/dev/null
213 | mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.fq1 | paste - - | sort -k1,1 -V | tr "\t" "\n" > uniq.fasta
214 | mawk '!/>/' uniq.fasta > totaluniqseq
215 | rm uniq.fq*
216 | 
217 | if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
218 | 	pmerge(){
219 | 		num=$( echo $1 | sed -e 's/^0*//g')
220 | 		if [ "$num" -le 100 ]; then
221 | 			j=$num
222 | 			k=$(($num -1))
223 | 		else
224 | 			num=$(($num - 99))
225 |            		j=$(python -c "print ("$num" * 100)")
226 |                 	k=$(python -c "print ("$j" - 100)")
227 | 		fi
228 |                 mawk -v x="$j" -v y="$k" '$5 <= x && $5 > y'  rbdiv.out > rbdiv.out.$1
229 | 	   
230 | 	   	if [ -s "rbdiv.out.$1" ]; then
231 |            		rainbow merge -o rbasm.out.$1 -a -i rbdiv.out.$1 -r 2 -N10000 -R10000 -l 20 -f 0.75
232 |            	fi
233 |         }
234 | 	
235 | 	export -f pmerge
236 | 	
237 |         #Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
238 |         if [ "$ATYPE" == "PE" ]; then
239 | 		sed -e 's/NNNNNNNNNN/	/g' uniq.fasta | cut -f1 > uniq.F.fasta
240 | 	  	CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
241 | 	  	cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
242 | 	  	mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
243 | 	  	paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
244 |           
245 |      	else
246 |         	sed -e 's/NNNNNNNNNN/	/g' totaluniqseq | cut -f1 | $sort --parallel=$NUMProc -S 2G| uniq | mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' > uniq.F.fasta
247 | 		CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
248 | 		cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
249 |   		mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
250 |   		paste sort.contig.cluster.ids <(mawk '!/>/' uniq.F.fasta) > contig.cluster.Funiq
251 |   		sed -e 's/NNNNNNNNNN/	/g' totaluniqseq | $sort --parallel=$NUMProc -k1 -S 2G | mawk '{print $0 "\t" NR}'  > totaluniqseq.CN
252 |   		join -t $'\t' -1 3 -2 1 contig.cluster.Funiq totaluniqseq.CN -o 2.3,1.2,2.1,2.2 > contig.cluster.totaluniqseq
253 | 	fi	
254 | 	
255 | 	#CD-hit output is converted to rainbow format
256 | 	$sort -k2,2 -g contig.cluster.totaluniqseq -S 2G --parallel=$NUMProc | sed -e 's/NNNNNNNNNN/	/g' > rcluster
257 | 	rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
258 |         CLUST=(`tail -1 rbdiv.out | cut -f5`)
259 | 	CLUST1=$(( $CLUST / 100 + 1))
260 | 	CLUST2=$(( $CLUST1 + 100 ))
261 | 	
262 | 	seq -w 1 $CLUST2 | parallel --no-notice -j $NUMProc --env pmerge pmerge {}
263 | 	
264 |         cat rbasm.out.[0-9]* > rbasm.out
265 |         rm rbasm.out.[0-9]* rbdiv.out.[0-9]*
266 | 
267 | 	#This AWK code replaces rainbow's contig selection perl script
268 | 	LENGTH=$(cut -f3 rbdiv.out |mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
269 | 
270 | 	LENGTH=$(( $LENGTH * 11 / 10 ))
271 | 
272 | 	cat rbasm.out <(echo "E") |sed -e 's/[0-9]*:[0-9]*://g' | mawk -v mlen=$LENGTH  '{
273 |                 if (NR == 1) e=$2;
274 |                 else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_A_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
275 |                 else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
276 |                 else if ($1 ~/C/) clus=$2;
277 |                 else if ($1 ~/L/) len=$2;
278 |                 else if ($1 ~/S/) seq=$2;
279 |                 else if ($1 ~/N/) freq=$2;
280 |                 else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
281 |                 else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~/^R 0/ && len <= mlen) {seq1 = seq; fclus=clus;lenf=len}
282 |                 else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~!/^R 0/ && len > mlen) {seq1 = seq; fclus=clus; len1=len}
283 |                 else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~!/^R 0/ && len <= mlen) {seq1 = seq; fclus=clus; lenf=len}
284 |                 else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
285 |                 }' > rainbow.fasta
286 | 
287 | 
288 |         seqtk seq -r rainbow.fasta > rainbow.RC.fasta
289 |         mv rainbow.RC.fasta rainbow.fasta
290 | 
291 |         #The rainbow assembly is checked for overlap between newly assembled Forward and Reverse reads using the software PEAR
292 | 
293 |         grep -A1 "dDocent_A_Contig_" rainbow.fasta | mawk '!/^--/' | sed -e 's/dDocent_A_Contig_/dDocent_Contig_/g' > rainbow.asm.fasta
294 |         grep -A1 "dDocent_Contig_" rainbow.fasta | mawk '!/^--/' > rainbow.n.fasta
295 | 
296 |         sed -e 's/NNNNNNNNNN/	/g' rainbow.asm.fasta | cut -f1 | seqtk seq -F I - > ref.F.fq
297 |         sed -e 's/NNNNNNNNNN/	/g' rainbow.asm.fasta | cut -f2 | seqtk seq -F I - > ref.R.fq
298 | 
299 |         seqtk seq -r ref.R.fq > ref.RC.fq
300 |         mv ref.RC.fq ref.R.fq
301 |         LENGTH=$(mawk '!/>/' rainbow.fasta | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
302 |         LENGTH=$(( $LENGTH * 5 / 4))
303 | 
304 |         pearRM -f ref.F.fq -r ref.R.fq -o overlap -p 0.001 -j 20 -n $LENGTH &>kopt.log
305 | 
306 |         rm ref.F.fq ref.R.fq
307 | 
308 |         mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.assembled.fastq > overlap.fasta
309 |         mawk '/>/' overlap.fasta > overlap.loci.names
310 |         mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.forward.fastq > other.F
311 |         mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.reverse.fastq > other.R
312 |         paste other.F other.R | mawk '{if ($1 ~ />/) print $1; else print $0}' | sed -e 's/	/NNNNNNNNNN/g' > other.FR
313 | 
314 |         cat other.FR overlap.fasta rainbow.n.fasta > totalover.fasta
315 | 	paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/	/\'$'\n/g' > totalover.s.fasta
316 | 	mv totalover.s.fasta totalover.fasta
317 |         rm *.F *.R
318 | fi
319 | 
320 | if [[ "$ATYPE" == "HYB" ]];then
321 | 	parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.ua.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs
322 | 	AS=$(cat uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs | wc -l)
323 | 	if [ "$AS" -gt 1 ]; then
324 | 		cut -f2 uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs > totaluniqseq.ua
325 | 		mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq.ua > uniq.full.ua.fasta
326 | 		LENGTH=$(mawk '!/>/' uniq.full.ua.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
327 | 		LENGTH=$(($LENGTH * 3 / 4))
328 | 		seqtk seq -F I uniq.full.ua.fasta > uniq.ua.fq
329 | 		if [ "$NUMProc" -gt 8 ]; then
330 | 			NP=8
331 | 		else
332 | 			NP=$NUMProc
333 | 		fi
334 | 		MaxLen=$(mawk '!/>/' uniq.full.ua.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
335 | 		fastp -i uniq.ua.fq -o uniq.ua.fq1 -w $NP -Q -l $MaxLen &>/dev/null
336 | 		mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.ua.fq1 > uniq.ua.fasta
337 | 		mawk '!/>/' uniq.ua.fasta > totaluniqseq.ua
338 | 		rm uniq.ua.fq*
339 | 		#Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
340 | 		sed -e 's/NNNNNNNNNN/	/g' uniq.ua.fasta | cut -f1 > uniq.F.ua.fasta
341 | 		CDHIT=$(python -c "print(max("$simC" - 0.1,0.8))")
342 | 		cd-hit-est -i uniq.F.ua.fasta -o xxx -c $CDHIT -T 0 -M 0 -g 1 -d 100 &>cdhit.log
343 | 		mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids.ua
344 | 		paste sort.contig.cluster.ids.ua totaluniqseq.ua > contig.cluster.totaluniqseq.ua
345 | 		$sort -k2,2 -g -S 2G --parallel=$NUMProc contig.cluster.totaluniqseq.ua | sed -e 's/NNNNNNNNNN/	/g' > rcluster.ua
346 | 		#CD-hit output is converted to rainbow format
347 | 		rainbow div -i rcluster.ua -o rbdiv.ua.out -f 0.5 -K 10
348 | 		if [ "$ATYPE" == "PE" ]; then
349 | 			rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
350 | 		else
351 | 			rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
352 | 		fi
353 | 		
354 | 		#This AWK code replaces rainbow's contig selection perl script
355 | 		cat rbasm.ua.out <(echo "E") |sed -e 's/[0-9]*:[0-9]*://g' | mawk ' {
356 | 			if (NR == 1) e=$2;
357 | 			else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
358 | 			else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
359 | 			else if ($1 ~/C/) clus=$2;
360 | 			else if ($1 ~/L/) len=$2;
361 | 			else if ($1 ~/S/) seq=$2;
362 | 			else if ($1 ~/N/) freq=$2;
363 | 			else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
364 | 			else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
365 | 			else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
366 | 			}' > rainbow.ua.fasta
367 | 	
368 | 		seqtk seq -r rainbow.ua.fasta > rainbow.RC.fasta
369 | 		mv rainbow.RC.fasta rainbow.ua.fasta
370 | 	
371 | 		cat rainbow.ua.fasta uniq.fasta > totalover.fasta
372 | 		paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/	/\'$'\n/g' > totalover.s.fasta
373 | 		mv totalover.s.fasta totalover.fasta
374 | 	fi
375 | fi
376 | 
377 | if [[ "$ATYPE" != "PE" && "$ATYPE" != "RPE" && "$ATYPE" != "HYB" ]]; then
378 | 	cp uniq.fasta totalover.fasta
379 | 	paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/	/\'$'\n/g' > totalover.s.fasta
380 | 	mv totalover.s.fasta totalover.fasta
381 | fi
382 | cd-hit-est -i totalover.fasta -o reference.fasta.original -M 0 -T 0 -c $simC &>cdhit2.log
383 | 
384 | sed -e 's/^C/NC/g' -e 's/^A/NA/g' -e 's/^G/NG/g' -e 's/^T/NT/g' -e 's/T$/TN/g' -e 's/A$/AN/g' -e 's/C$/CN/g' -e 's/G$/GN/g' reference.fasta.original > reference.fasta
385 | 
386 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
387 | 	sed -i 's/dDocent/dDocentR/g' reference.fasta
388 | fi
389 | 
390 | samtools faidx reference.fasta &> index.log
391 | bwa index reference.fasta >> index.log 2>&1
392 | 
393 | SEQS=$(mawk 'END {print NR}' uniq.k.$CUTOFF.c.$CUTOFF2.seqs)
394 | TIGS=$(grep ">" -c reference.fasta)
395 | 
396 | #echo -e "\ndDocent assembled $SEQS sequences (after cutoffs) into $TIGS contigs"
397 | echo $TIGS
398 | }
399 | 
400 | rm kopt.data &>/dev/null
401 | 
402 | for ((P = $1; P <= $2; P++))
403 | 	do
404 | 	for ((i = $3; i <= $4; i++))
405 | 	do
406 | 	X=$(($P + $i))
407 | 	if [ "$X" != "2" ]; then
408 | 		for j in $(seq $minSim $incSim $maxSim)
409 | 		do
410 | 		echo "K1 is $P" "K2 is $i" "c is $j"
411 | 		SEQS=$(Reference $P $i $j)
412 | 		echo $P $i $j $SEQS >> kopt.data
413 | 		done
414 | 	fi	
415 | 	done
416 | done
417 | 
418 | cut -f4 -d " " kopt.data > plot.kopt.data
419 | gnuplot << \EOF
420 | set terminal dumb size 120, 30
421 | set autoscale
422 | unset label
423 | set title "Histogram of number of reference contigs"
424 | set ylabel "Number of Occurrences"
425 | set xlabel "Number of reference contigs"
426 | max = `sort -g plot.kopt.data | tail -1`
427 | binwidth = max/250.0
428 | bin(x,width)=width*floor(x/width) + binwidth/2.0
429 | #set xtics 10
430 | plot 'plot.kopt.data' using (bin($1,binwidth)):(1.0) smooth freq with boxes
431 | pause -1
432 | EOF
433 | 
434 | AF=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' plot.kopt.data)
435 | echo "Average contig number = $AF"
436 | echo "The top three most common number of contigs"
437 | echo -e "X\tContig number"
438 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' plot.kopt.data | sort -k1 -g -r | head -3
439 | echo "The top three most common number of contigs (with values rounded)"
440 | echo -e "X\tContig number"
441 | while read NAME; do python -c "print(round($NAME,-2))"; done < plot.kopt.data | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | sort -g -r | head -3 | sed "s/^[ \t]*//"
442 | 


--------------------------------------------------------------------------------
/scripts/Rename_SequenceFiles.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | export LC_ALL=en_US.UTF-8
 3 | 
 4 | #This script can quickly rename filess within the dDocent naming convention. It needs to be passed a tab delimited
 5 | #file with the old name in one column and the new name in the other
 6 | #Example#
 7 | #PopA_001  NewPop_001
 8 | #
 9 | # This will renmae PopA_001.F.fq.gz and PopA_001.R.fq.gz to NewPop_001.F.fq.gz and NewPop_001.R.fq.gz
10 | 
11 | if [ -z "$1" ]
12 | then
13 | echo "No file with old names and new names specified."
14 | echo "Correct usage: Rename_for_dDocent.sh namesfile"
15 | exit 1
16 | else
17 | NAMES=( `cut -f2  $1 `)
18 | BARCODES=( `cut -f1 $1 `)
19 | LEN=( `wc -l $1 `)
20 | LEN=$(($LEN - 1))
21 | 
22 | echo ${NAMES[0]}
23 | echo ${BARCODES[0]}
24 | 
25 | for ((i = 0; i <= $LEN; i++));
26 | do
27 | mv ${BARCODES[$i]}.F.fq.gz ${NAMES[$i]}.F.fq.gz
28 | mv ${BARCODES[$i]}.R.fq.gz ${NAMES[$i]}.R.fq.gz &>/dev/null
29 | done
30 | fi
31 | 


--------------------------------------------------------------------------------
/scripts/dDocent_filters:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | export LC_ALL=en_US.UTF-8
  3 | 
  4 | echo "This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth," 
  5 | echo "quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation."
  6 | echo -e "The script assumes that loci and individuals with low call rates (or depth) have already been removed. \n"
  7 | echo -e "Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters \n"
  8 | 
  9 | #Checks for correct ususage
 10 | 
 11 | if [[ -z "$2" ]]; then
 12 | echo "Usage is bash dDocent_filters.sh VCF_file Output_prefix"
 13 | exit 1
 14 | fi
 15 | 
 16 | #Filteres out sites with that on average have heterzygotes with less than a 0.28 allele balance between reads from each allele and Quality / Depth < 0.5 and ratio of mapping quality for reference and alternate alleles
 17 | 
 18 | vcffilter -s -f "AB > 0.2 & AB < 0.8 | AB < 0.01 | AB > 0.99" -s -g "QR > 0 | QA > 0 " $1 | vcffilter -s -f "QUAL / DP > 0.1" | vcffilter -s -f "MQM / MQMR > 0.25 & MQM / MQMR < 1.75" > $2
 19 | FILTERED=$(mawk '!/#/' $2 | wc -l)
 20 | OLD=$(mawk '!/#/' $1 | wc -l)
 21 | NUMFIL=$(($OLD - $FILTERED))
 22 | echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n"
 23 | echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n" > $2.filterstats
 24 | 
 25 | 
 26 | #Asks about filting SNPs with reads from both strands filters out loci that have reads from both strands, with some leeway for a bad individual or two
 27 | 
 28 | 
 29 | if [[ -z "$3" ]]; then
 30 | echo -e "Are reads expected to overlap?  In other words, is fragment size less than 2X the read length?  Enter yes or no."
 31 | read OL
 32 | else
 33 | OL=$3
 34 | fi
 35 | 
 36 | if [ "$OL" != "yes" ]; then
 37 | vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 50" -s $2 > $2.filAB.vcf
 38 | FILTERED=$(mawk '!/#/' $2.filAB.vcf | wc -l)
 39 | OLD=$(mawk '!/#/' $2 | wc -l)
 40 | NUMFIL=$(($OLD - $FILTERED))
 41 | echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n"
 42 | echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
 43 | else
 44 | cp $2 $2.filAB.vcf
 45 | fi
 46 | 
 47 | #Filters out loci that have reads from both paired and unpaired reads
 48 | 
 49 | if [[ -z "$4" ]]; then
 50 | echo -e "Is this from a mixture of SE and PE libraries? Enter yes or no."
 51 | read PE
 52 | else
 53 | PE=$4
 54 | fi
 55 | 
 56 | if [ "$PE" != "yes" ]; then
 57 | vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s $2.filAB.vcf > $2.fil.vcf
 58 | FILTERED=$(mawk '!/#/' $2.fil.vcf  | wc -l)
 59 | OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
 60 | NUMFIL=$(($OLD - $FILTERED))
 61 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
 62 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
 63 | 
 64 | else
 65 | vcffilter -f "PAIRED < 0.005 & PAIREDR > 0.005 | PAIRED > 0.005 & PAIREDR < 0.005" -t NP -F PASS -A $2.filAB.vcf | mawk '!/NP/' > $2.fil.vcf
 66 | FILTERED=$(mawk '!/#/' $2.fil.vcf  | wc -l)
 67 | OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
 68 | NUMFIL=$(($OLD - $FILTERED))
 69 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
 70 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
 71 | 
 72 | fi
 73 | 
 74 | #Uses the VCF file to estimate the original number of individuals in the VCF file
 75 | #This is important because the INFO flags are based on this number
 76 | IND=$(grep -o -e 'NS=[0-9]*' $1 | sed s/NS=//g | sort | tail -1)
 77 | IND=$(($IND - 0 ))
 78 | 
 79 | #Creates a file with the original site depth and qual for each locus
 80 | cut -f8 $1 | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > $1.DEPTH
 81 | mawk '!/#/' $1 | cut -f1,2,6 > $1.loci.qual
 82 | 
 83 | #Calculates the average depth and standard deviation
 84 | DEPTH=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.DEPTH)
 85 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.DEPTH)
 86 | DEPTH=$(perl -e "print int("$DEPTH") + int("$SD")")
 87 | #DEPTH=$(perl -e "print int("$DEPTH"+100*("$DEPTH"**0.5))")
 88 | 
 89 | #Filters loci above the mean depth + 1 standard deviation that have quality scores that are less than 2*DEPTH
 90 | paste $1.loci.qual $1.DEPTH | mawk -v x=$DEPTH '$4 > x'| mawk '$3 < 2 * $4' > $1.lowQDloci
 91 | LQDL=$(cat $1.lowQDloci | wc -l)
 92 | OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
 93 | echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n"  
 94 | echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n" >> $2.filterstats
 95 | 
 96 | #Recalculates site depth for sites that have not been previously filtered
 97 | vcftools --vcf $2.fil.vcf --remove-filtered NP --exclude-positions $1.lowQDloci --site-depth --out $1 2> /dev/null
 98 | cut -f3 $1.ldepth > $1.site.depth
 99 | 
100 | DP=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.site.depth)
101 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.site.depth)
102 | 
103 | #Calculates actual number of individuals in VCF file
104 | #This is important because loci will now be filtered by mean depth calculated with individuals present in VCF
105 | IND=$(mawk '/#/' $1 | tail -1 | wc -w)
106 | IND=$(($IND - 9))
107 | 
108 | mawk '!/D/' $1.site.depth | mawk -v x=$IND '{print $1/x}' > meandepthpersite
109 | 
110 | #Calculates a mean depth cutoff to use for filtering
111 | DP=$(perl -e "print ($DP+ 1.645*$SD) / $IND")
112 | PP=$(mawk '!/SUM/' $1.site.depth | sort -rn | perl -e '$d=.05;@l=<>;print $l[int($d*$#l)]' )
113 | PP=$(perl -e "print int($PP / $IND)")
114 | GP=$(perl -e "print int($PP * 1.10)")
115 | export GP
116 | 
117 | gnuplot << \EOF >> $2.filterstats
118 | set terminal dumb size 120, 30
119 | set autoscale
120 | high=system("echo $GP")
121 | set xrange [10:high] 
122 | unset label
123 | set title "Histogram of mean depth per site"
124 | set ylabel "Number of Occurrences"
125 | set xlabel "Mean Depth"
126 | #set yr [0:100000]
127 | binwidth=1
128 | bin(x,width)=width*floor(x/width) + binwidth/2.0
129 | set xtics floor(high/20)
130 | set lmargin 10
131 | plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
132 | pause -1
133 | EOF
134 | 
135 | gnuplot << \EOF 
136 | set terminal dumb size 120, 30
137 | set autoscale
138 | high=system("echo $GP")
139 | set xrange [10:high] 
140 | unset label
141 | set title "Histogram of mean depth per site"
142 | set ylabel "Number of Occurrences"
143 | set xlabel "Mean Depth"
144 | #set yr [0:100000]
145 | binwidth=1
146 | bin(x,width)=width*floor(x/width) + binwidth/2.0
147 | set xtics floor(high/20)
148 | set lmargin 10
149 | plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
150 | pause -1
151 | EOF
152 | 
153 | 
154 | if [[ -z "$5" ]]; then
155 | echo "If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be" $DP
156 | echo "The 95% cutoff would be" $PP
157 | echo "Would you like to use a different maximum mean depth cutoff than "$PP", yes or no"
158 | 
159 | read NEWCUTOFF
160 | else
161 | NEWCUTOFF=$5
162 | fi
163 | 
164 | if [ "$NEWCUTOFF" != "yes" ]; then
165 | echo -e "Maximum mean depth cutoff is" $PP >> $2.filterstats
166 | 
167 | #Combines all filters to create a final filtered VCF file
168 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $PP --recode 2> /dev/null
169 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
170 | OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
171 | NUMFIL=$(($OLD - $FILTERED))
172 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
173 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
174 | 
175 | else
176 | 	if [[ -z "$6" ]]; then
177 | 	echo "Please enter new cutoff"
178 | 	read PP
179 | 	else
180 | 	PP=$6
181 | 	fi
182 | echo -e "Maximum mean depth cutoff is" $PP >> $2.filterstats
183 | #Combines all filters to create a final filtered VCF file
184 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $PP --recode 2> /dev/null
185 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
186 | OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
187 | NUMFIL=$(($OLD - $FILTERED))
188 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
189 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
190 | fi
191 | 
192 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL1 --max-meanDP $PP --exclude-positions $1.lowQDloci --recode 2> /dev/null
193 | vcftools --vcf $2.FIL1.recode.vcf --site-depth --out $2.FIL1 2> /dev/null
194 | mawk '!/CHR/' $2.FIL1.ldepth | mawk '{
195 | if (NR == 1) {chrom=$1;i=1;pos[i]=$2;dp[i]=$3}
196 | else if ($1 == chrom) {i++;pos[i]=$2;dp[i]=$3}
197 | else if ($1 != chrom) {for (x in dp) dpp= dpp + dp[x]; adp=dpp / i; adp= adp / 10 * 6; for (j = 1; j <= i; j++) if (dp[j] < adp) print chrom"\t"pos[j];chrom=$1;i=1;delete pos;pos[i]=$2;delete dp;dpp=0;dp[i]=$3}
198 | }' > $2.dpmismatch.loci
199 | 
200 | OLD=$(mawk '!/#/' $2.FIL1.recode.vcf | wc -l)
201 | DPMM=$(cat $2.dpmismatch.loci | wc -l)
202 | echo -e "Number of sites filtered based on within locus depth mismatch\n" $DPMM "of" $OLD "\n"
203 | echo -e "Number of sites filtered based on within locus depth mismatch\n" $DPMM "of" $OLD "\n" >> $2.filterstats
204 | 
205 | vcftools --vcf $2.FIL1.recode.vcf --exclude-positions $2.dpmismatch.loci --recode --recode-INFO-all --out $2.FIL 2> /dev/null
206 | 
207 | OLD=$(mawk '!/#/' $1 | wc -l)
208 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
209 | NUMFIL=$(($OLD - $FILTERED))
210 | 
211 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n"
212 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
213 | 
214 | echo -e "Remaining sites\n" $FILTERED "\n"
215 | echo -e "Remaining sites\n" $FILTERED "\n" >> $2.filterstats
216 | 
217 | echo -e "Filtered VCF file is called $2.FIL.recode.vcf\n"
218 | echo "Filter stats stored in $2.filterstats"
219 | 
220 | 


--------------------------------------------------------------------------------
/scripts/filter_hwe_by_pop.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w
  2 | 
  3 | #Script written by Chris Hollenbeck.  Contact him at chollenbeck07@neo.tamu.edu
  4 | 
  5 | use strict;
  6 | use Getopt::Long;
  7 | use Pod::Usage;
  8 | 
  9 | pod2usage(-verbose => 1) if @ARGV == 0;
 10 | 
 11 | my $vcffile = '';
 12 | my $outfile = 'new.hwe';
 13 | my $popmap = '';
 14 | my $hwe = 0.001;
 15 | my $cutoff = 0.25;
 16 | 
 17 | GetOptions(	'vcf|v=s' => \$vcffile,
 18 | 			'out|o=s' => \$outfile,
 19 | 			'popmap|p=s' => \$popmap,
 20 | 			'hwe|h=s' => \$hwe,
 21 | 			'cutoff|c=s' => \$cutoff,
 22 | 			);
 23 | 
 24 | unless ($vcffile) {
 25 | 	print "\nNeed to specify a VCF file (-v) for input\n\n";
 26 | 	pod2usage(-verbose => 1);
 27 | }
 28 | 
 29 | unless ($popmap) {
 30 | 	print "\nNeed to specify a population map (-p) for input\n\n";
 31 | 	pod2usage(-verbose => 1);
 32 | }
 33 | 
 34 | 
 35 | open(POP, "<", $popmap) or die $!;
 36 | my %pops;
 37 | while(<POP>) {
 38 | 	next if $_ =~ /^\s/;
 39 | 	chomp;
 40 | 	my ($sample, $pop) = split;
 41 | 	$pops{$pop} = [] unless $pops{$pop};
 42 | 	push @{$pops{$pop}}, $sample;
 43 | }
 44 | close POP;
 45 | 
 46 | my %exclude_count;
 47 | foreach my $pop (sort keys %pops) {
 48 | 	open(INDO, ">", $pop . '.inds') or die $!;
 49 | 	foreach my $ind (@{$pops{$pop}}) {
 50 | 		print INDO $ind, "\n";
 51 | 	}
 52 | 	close INDO;
 53 | 
 54 | 	my $indfile = $pop . '.inds';
 55 | 
 56 | 	print "Processing population: $pop (" , scalar(@{$pops{$pop}}) , " inds)", "\n";
 57 | 
 58 | 	my $ouput = `vcftools --vcf $vcffile --keep $indfile --hardy --out $pop 2>&1`;
 59 | 
 60 | 	open(HWEI, "<", $pop . '.hwe') or die $!;
 61 | 
 62 | 	<HWEI>; 	# process the header
 63 | 	while(<HWEI>) {
 64 | 		last if $_ =~ /^\s/;
 65 | 		chomp;
 66 | 		my ($locus, $pos, $obs, $exp, $chisq, $pvalue, @rest) = split;
 67 | 		$exclude_count{"$locus-$pos"}++ if $pvalue < $hwe;
 68 | 
 69 | 	}
 70 | 	close HWEI;
 71 | 
 72 | }
 73 | 
 74 | open(HWEO, ">", 'exclude.hwe') or die $!;
 75 | foreach my $snp (keys %exclude_count) {
 76 | 	my ($locus, $site) = split('-', $snp);
 77 | 	if ($exclude_count{$snp} / scalar(keys %pops) > $cutoff) {
 78 | 		print HWEO join("\t", $locus, $site), "\n";
 79 | 	}
 80 | }
 81 | close HWEO;
 82 | 
 83 | my $output = `vcftools --vcf $vcffile --exclude-positions exclude.hwe --recode --recode-INFO-all --out $outfile 2>&1`;
 84 | my $filt_output = `vcftools --vcf $vcffile --positions exclude.hwe --hardy --out filtered 2>&1`;
 85 | 
 86 | print "Outputting results of HWE test for filtered loci to 'filtered.hwe'\n";
 87 | 
 88 | my $kept;
 89 | my $total;
 90 | if ($output =~ /kept (\d+) out of a possible (\d+) Sites/) {
 91 | 	$kept = $1;
 92 | 	$total = $2;
 93 | }
 94 | 
 95 | print "Kept $kept of a possible $total loci (filtered " , $total - $kept , ' loci)', "\n";
 96 | 
 97 | __END__
 98 | 
 99 | =head1 NAME
100 | 
101 | filter_hwe_by_pop.pl
102 | 
103 | =head1 SYNOPSIS
104 | 
105 | filter_hwe_by_pop.pl -v <vcffile> -p <popmap> [options]
106 | 
107 | Options:
108 |      -v     <vcffile>	input vcf file
109 |      -p		<popmap>	tab-separated file of samples and population designations
110 | 	 -h		[hwe]	minimum Hardy-Weinberg p-value cutoff for SNPs
111 | 	 -c		[cutoff]	proportion of all populations that a locus can be below HWE cutoff without being filtered
112 |      -o		[out]	name of outfile
113 | 
114 | 
115 | =head1 OPTIONS
116 | 
117 | =over 8
118 | 
119 | =item B<-v, --vcffile>
120 | 
121 | VCF input file
122 | 
123 | =item B<-p, --popmap>
124 | 
125 | File with names of individuals and population designations, one per line
126 | 
127 | =item B<-h, --hwe>
128 | 
129 | Minimum cutoff for Hardy-Weinberg p-value (for test as implemented in vcftools) [Default: 0.001]
130 | 
131 | =item B<-c, --cutoff>
132 | 
133 | Proportion of all populations that a locus can be below HWE cutoff without being filtered. For example, choosing 0.5 will filter SNPs that are below the p-value threshold in 50% or more of the populations. [Default: 0.25]
134 | 
135 | =item B<-o, --out>
136 | 
137 | Name of outfile, by vcftools conventions (will be named X.recode.vcf)
138 | 
139 | =back
140 | 
141 | =head1 DESCRIPTION
142 | 
143 | B<filter_hwe_by_pop.pl> is a Perl wrapper for vcftools, designed to run tests for HWE on multiple populations and exclude loci that fall beneath a given threshold from the entire dataset
144 | 
145 | =cut
146 | 


--------------------------------------------------------------------------------
/scripts/filter_missing_ind.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | export LC_ALL=en_US.UTF-8
 3 | 
 4 | #check for vcftools version
 5 | VCFTV=$(vcftools | grep VCF | grep -oh '[0-9]*[a-z]*)$' | sed 's/[a-z)]//')
 6 |         if [ "$VCFTV" -lt "10" ]; then
 7 |                 echo "The version of VCFtools installed in your" '$PATH' "is not optimized for dDocent."
 8 |                 echo "Please install at least version 0.1.11"
 9 |                 exit 1
10 |         elif [ "$VCFTV" -lt "13" ]; then
11 |                 VCFMISSINGFLAG="--missing"
12 |         elif [ "$VCFTV" -ge "13" ]; then
13 |                 VCFMISSINGFLAG="--missing-indv"
14 |         fi
15 | 
16 | if [[ -z "$2" ]]; then
17 | echo "Usuage is filter_missing_ind.sh vcf_file name_prefix_for_new_vcf_file"
18 | exit 1
19 | fi
20 | 
21 | vcftools --vcf $1 $VCFMISSINGFLAG --out $2
22 | 
23 | CUTOFF=$(mawk '!/IN/' $2.imiss | cut -f5 | sort -rn | perl -e '$d=.14;@l=<>;print $l[int($d*$#l)]')
24 | 
25 | mawk '!/IN/' $2.imiss | cut -f5 > totalmissing
26 | 
27 | gnuplot << \EOF
28 | set terminal dumb size 120, 30
29 | set autoscale
30 | unset label
31 | set title "Histogram of % missing data per individual"
32 | set ylabel "Number of Occurrences"
33 | set xlabel "% of missing data"
34 | #set yr [0:100000]
35 | binwidth=0.01
36 | bin(x,width)=width*floor(x/width) + binwidth/2.0
37 | set lmargin 10
38 | plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes
39 | pause -1
40 | EOF
41 | 
42 | if [[ -z "$3" ]]; then
43 | 	echo "The 85% cutoff would be" $CUTOFF
44 | 	echo "Would you like to set a different cutoff, yes or no"
45 | 
46 | 	read NEWCUTOFF
47 | else
48 | 	NEWCUTOFF=$3
49 | fi
50 | 
51 | if [ "$NEWCUTOFF" != "yes" ]; then
52 | 
53 | 	mawk -v x=$CUTOFF '$5 > x' $2.imiss | cut -f1 > lowDP.indv
54 | 
55 | 	vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2
56 | 
57 | else
58 | 	if [[ -z "$4" ]]; then
59 | 		echo "Please enter new cutoff"
60 | 		read CUTOFF2
61 | 
62 | 	else
63 | 		CUTOFF2=$4
64 | 	fi
65 | 	CUTPRINT=$(python -c "print($CUTOFF2 * 100)")
66 | 	echo "All individuals with more than" $CUTPRINT"% missing data will be removed."
67 | 	mawk -v x=$CUTOFF2 '$5 > x' $2.imiss | cut -f1 > lowDP.indv
68 | 
69 | vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2
70 | fi
71 | 


--------------------------------------------------------------------------------
/scripts/pop_missing_filter.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | export LC_ALL=en_US.UTF-8
 3 | 
 4 | #check for vcftools version
 5 | VCFTV=$(vcftools | grep VCF | grep -oh '[0-9]*[a-z]*)$' | sed 's/[a-z)]//')
 6 |         if [ "$VCFTV" -lt "10" ]; then
 7 |                 echo "The version of VCFtools installed in your" '$PATH' "is not optimized for dDocent."
 8 |                 echo "Please install at least version 0.1.11"
 9 |                 exit 1
10 |         elif [ "$VCFTV" -lt "13" ]; then
11 |                 VCFMISSINGFLAG="--missing"
12 |         elif [ "$VCFTV" -ge "13" ]; then
13 |                 VCFMISSINGFLAG="--missing-site"
14 |         fi
15 | 
16 | if [[ -z "$2" ]]; then
17 | echo "Usage is pop_missing_filter vcffile popmap proportion_missing_per_pop number_of_pops_for_cutoff name_for_output"
18 | exit 1
19 | fi
20 | 
21 | POPS=( `cut -f2 $2 | sort | uniq `)
22 | rm badloci
23 | 
24 | for i in "${POPS[@]}"
25 | do
26 | grep -w $i $2 | cut -f1 > keep.$i
27 | vcftools --vcf $1 --keep keep.$i $VCFMISSINGFLAG --out $i 
28 | mawk '!/CHROM/' $i.lmiss | mawk -v x=$3 '$6 > x' | cut -f1,2 >> badloci
29 | done
30 | 
31 | mawk '!/CH/' badloci | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$4 '$1 >= x' | cut -f2,3  > loci.to.remove
32 | 
33 | #sort badloci | uniq > loci.to.remove
34 | 
35 | vcftools --vcf $1 --exclude-positions loci.to.remove --recode --recode-INFO-all --out $5
36 | 
37 | 


--------------------------------------------------------------------------------
/scripts/remake_reference.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | export LC_ALL=en_US.UTF-8
  4 | export SHELL=bash
  5 | v="2.8.7"
  6 | 
  7 | if [[ -z "$5" ]]; then
  8 | echo "Usage is sh remake_reference.sh K1 K2 similarity% Assembly_Type Number_of_Processors"
  9 | exit 1
 10 | fi
 11 | 
 12 | if ! sort --version | fgrep GNU &>/dev/null; then
 13 | 	sort=gsort
 14 | else
 15 | 	sort=sort
 16 | fi
 17 | 
 18 | DEP=(mawk samtools rainbow gnuplot seqtk cd-hit-est parallel pearRM fastp)
 19 | NUMDEP=0
 20 | for i in "${DEP[@]}"
 21 | do
 22 | 	if which $i &> /dev/null; then
 23 | 		foo=0
 24 | 	else
 25 |     		echo "The dependency" $i "is not installed or is not in your" '$PATH'"."
 26 |     		NUMDEP=$((NUMDEP + 1))
 27 | 	fi
 28 | done
 29 | 
 30 | 
 31 | FASTP=$(fastp -v 2>&1 | cut -f2 -d " ")
 32 | FASTP1=$(echo $FASTP | cut -f1 -d ".")
 33 | FASTP2=$(echo $FASTP | cut -f2 -d ".")
 34 | FASTP3=$(echo $FASTP | cut -f3 -d ".")
 35 | 	if [ "$FASTP1" -lt "2" ]; then
 36 | 		if [ "$FASTP2" -lt "20" ]; then
 37 | 			if [ "$FASTP2" -lt "5" ]; then
 38 | 				echo "The version of fastp installed in your" '$PATH' "is not optimized for dDocent."
 39 | 				echo "Please install version 0.19.5 or above"
 40 | 				exit 1
 41 | 			fi
 42 | 		fi
 43 | 	fi
 44 | 	
 45 | if [ $NUMDEP -gt 0 ]; then
 46 | 	echo -e "\nPlease install all required software before running ReferenceOpt again."
 47 | 	exit 1
 48 | else
 49 | 	echo -e "\nAll required software is installed!"
 50 | fi
 51 | 
 52 | echo -e "\ndDocent remake_reference version $v"
 53 | 
 54 | ATYPE=$4
 55 | NUMProc=$5
 56 | CUTOFF=$1
 57 | CUTOFF2=$2
 58 | simC=$3
 59 | 
 60 | 
 61 | ls *.F.fq.gz > namelist
 62 | sed -i'' -e 's/.F.fq.gz//g' namelist
 63 | NAMES=( `cat "namelist" `)
 64 | 
 65 | getAssemblyInfo(){
 66 | echo "nope!"
 67 | }
 68 | 
 69 | Reference(){
 70 | 
 71 | CUTOFF=$1
 72 | CUTOFF2=$2
 73 | simC=$3
 74 | 
 75 | AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
 76 | AWK2='!/>/'
 77 | AWK3='!/NNN/'
 78 | AWK4='{for(i=0;i<$1;i++)print}'
 79 | PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'
 80 | SED1='s/^[ \t]*//'
 81 | SED2='s/\s/\t/g'
 82 | FRL=$(gunzip -c ${NAMES[0]}.F.fq.gz | mawk '{ print length() | "sort -rn" }' | head -1)
 83 | 
 84 | special_uniq(){
 85 | 	mawk -v x=$1 '$1 >= x' $2  |cut -f2 | sed -e 's/NNNNNNNNNN/	/g' | cut -f1 | uniq
 86 | }
 87 | export -f special_uniq
 88 | 
 89 | if [ ${NAMES[@]:(-1)}.F.fq.gz -nt ${NAMES[@]:(-1)}.uniq.seqs ];then
 90 | 	if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
 91 | 	#If PE assembly, creates a concatenated file of every unique for each individual in parallel
 92 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
 93 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
 94 | 		if [ "$ATYPE" = "RPE" ]; then
 95 | 			cat namelist | parallel --no-notice -j $NUMProc "paste {}.forward {}.reverse | $sort -k1 -S 200M > {}.fr"
 96 | 			cat namelist | parallel --no-notice -j $NUMProc "cut -f1 {}.fr | uniq -c > {}.f.uniq && cut -f2 {}.fr > {}.r"
 97 | 			cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK4' {}.f.uniq > {}.f.uniq.e" 
 98 | 			cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.f.uniq.e {}.r | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | sed -e '$SED1' | sed -e '$SED2'> {}.uniq.seqs"
 99 | 			rm *.f.uniq.e *.f.uniq *.r *.fr
100 | 		else
101 | 			cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
102 | 		fi
103 | 		rm *.forward
104 | 		rm *.reverse
105 | 	fi
106 | 	
107 | 	if [ "$ATYPE" == "SE" ]; then
108 | 	#if SE assembly, creates files of every unique read for each individual in parallel
109 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
110 | 	fi
111 | 	
112 | 	if [ "$ATYPE" == "OL" ]; then
113 | 	#If OL assembly, dDocent assumes that the marjority of PE reads will overlap, so the software PEAR is used to merge paired reads into single reads
114 | 		for i in "${NAMES[@]}";
115 |         		do
116 |         		gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
117 |         		done	
118 |         	MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
119 | 		LENGTH=$(( $MaxLen / 3))
120 | 		for i in "${NAMES[@]}"
121 | 			do
122 | 			pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH 
123 | 			done
124 | 		cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
125 | 	fi
126 | 	if [ "$ATYPE" == "HYB" ]; then
127 | 	#If HYB assembly, dDocent assumes some PE reads will overlap but that some will not, so the OL method performed and remaining reads are then put through PE method
128 | 		for i in "${NAMES[@]}";
129 |       		do
130 |       		gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
131 |       		done	
132 |     		MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
133 |     		LENGTH=$(( $MaxLen / 3))
134 | 		for i in "${NAMES[@]}"
135 | 			do
136 | 			pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH &>kopt.log
137 | 			done
138 | 		cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
139 | 		
140 | 		cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.forward.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
141 | 		cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.reverse.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
142 | 		cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed -e 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.ua.seqs"
143 | 		rm *.forward
144 | 		rm *.reverse
145 | 	fi	
146 | 	
147 | fi
148 | 
149 | #Create a data file with the number of unique sequences and the number of occurrences
150 | 
151 | if [ -f "uniq.seqs.gz" ]; then
152 | 	if [ uniq.seqs.gz -nt uniq.seqs ]; then
153 | 	gunzip uniq.seqs.gz 2>/dev/null
154 | 	fi
155 | fi
156 | 
157 | if [ ! -f "uniq.seqs" ]; then
158 | 	cat *.uniq.seqs > uniq.seqs
159 | fi
160 | 	
161 | if [[ -z $CUTOFF || -z $CUTOFF2 ]]; then
162 | getAssemblyInfo
163 | fi
164 | 
165 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
166 |   	parallel --no-notice -j $NUMProc --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs  | $sort --parallel=$NUMProc -S 2G | uniq -c > uniqCperindv
167 | else
168 | 	parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
169 | fi
170 | 
171 | #Now that data cutoffs have been chosen, reduce data set to specified set of unique reads, convert to FASTA format,
172 | #and remove reads with substantial amounts of adapters
173 | 
174 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
175 |   parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | sed -e 's/NNNNNNNNNN/-/' >  total.uniqs
176 |   cut -f 1 -d "-" total.uniqs > total.u.F
177 |   cut -f 2 -d "-" total.uniqs > total.u.R
178 |   paste total.u.F total.u.R | $sort -k1 --parallel=$NUMProc -S 2G > total.fr
179 |  
180 |   parallel --no-notice --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs  | $sort --parallel=$NUMProc -S 2G | uniq -c > total.f.uniq
181 |   join -1 2 -2 1 -o 1.1,1.2,2.2 total.f.uniq total.fr | mawk '{print $1 "\t" $2 "NNNNNNNNNN" $3}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
182 |   rm total.uniqs total.u.* total.fr total.f.uniq* 
183 |   
184 | else
185 | 	parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
186 | fi
187 | #$sort -k1 -r -n uniq.k.$CUTOFF.c.$CUTOFF2.seqs | cut -f 2 > totaluniqseq
188 | $sort -k1 -r -n --parallel=$NUMProc -S 2G uniq.k.$CUTOFF.c.$CUTOFF2.seqs |cut -f2 > totaluniqseq
189 | mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq > uniq.full.fasta
190 | LENGTH=$(mawk '!/>/' uniq.full.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
191 | LENGTH=$(($LENGTH * 3 / 4))
192 | seqtk seq -F I uniq.full.fasta > uniq.fq
193 | if [ "$NUMProc" -gt 8 ]; then
194 | 	NP=8
195 | else
196 | 	NP=$NUMProc
197 | fi
198 | fastp -i uniq.fq -o uniq.fq1 -w $NP -Q &> assemble.trim.log
199 | mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.fq1 | paste - - | sort -k1,1 -V | tr "\t" "\n" > uniq.fasta
200 | mawk '!/>/' uniq.fasta > totaluniqseq
201 | rm uniq.fq*
202 | 
203 | if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
204 | 	pmerge(){
205 | 		num=$( echo $1 | sed -e 's/^0*//g')
206 | 		if [ "$num" -le 100 ]; then
207 | 			j=$num
208 | 			k=$(($num -1))
209 | 		else
210 | 			num=$(($num - 99))
211 |            		j=$(python -c "print ("$num" * 100)")
212 |                 	k=$(python -c "print ("$j" - 100)")
213 | 		fi
214 |                 mawk -v x="$j" -v y="$k" '$5 <= x && $5 > y'  rbdiv.out > rbdiv.out.$1
215 | 	   
216 | 	   	if [ -s "rbdiv.out.$1" ]; then
217 |            		rainbow merge -o rbasm.out.$1 -a -i rbdiv.out.$1 -r 2 -N10000 -R10000 -l 20 -f 0.75
218 |            	fi
219 |         }
220 | 	
221 | 	export -f pmerge
222 | 	
223 |         #Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
224 |         if [ "$ATYPE" == "PE" ]; then
225 | 		sed -e 's/NNNNNNNNNN/	/g' uniq.fasta | cut -f1 > uniq.F.fasta
226 | 	  	CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
227 | 	  	cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
228 | 	  	mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
229 | 	  	paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
230 |           
231 |      	else
232 |         	sed -e 's/NNNNNNNNNN/	/g' totaluniqseq | cut -f1 | $sort --parallel=$NUMProc -S 2G| uniq | mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' > uniq.F.fasta
233 | 		CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
234 | 		cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
235 |   		mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
236 |   		paste sort.contig.cluster.ids <(mawk '!/>/' uniq.F.fasta) > contig.cluster.Funiq
237 |   		sed -e 's/NNNNNNNNNN/	/g' totaluniqseq | $sort --parallel=$NUMProc -k1 -S 2G | mawk '{print $0 "\t" NR}'  > totaluniqseq.CN
238 |   		join -t $'\t' -1 3 -2 1 contig.cluster.Funiq totaluniqseq.CN -o 2.3,1.2,2.1,2.2 > contig.cluster.totaluniqseq
239 | 	fi	
240 | 	
241 | 	#CD-hit output is converted to rainbow format
242 | 	$sort -k2,2 -g contig.cluster.totaluniqseq -S 2G --parallel=$NUMProc | sed -e 's/NNNNNNNNNN/	/g' > rcluster
243 | 	rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
244 |         CLUST=(`tail -1 rbdiv.out | cut -f5`)
245 | 	CLUST1=$(( $CLUST / 100 + 1))
246 | 	CLUST2=$(( $CLUST1 + 100 ))
247 | 	
248 | 	seq -w 1 $CLUST2 | parallel --no-notice -j $NUMProc --env pmerge pmerge {}
249 | 	
250 |         cat rbasm.out.[0-9]* > rbasm.out
251 |         rm rbasm.out.[0-9]* rbdiv.out.[0-9]*
252 | 
253 | 	#This AWK code replaces rainbow's contig selection perl script
254 | 	LENGTH=$(cut -f3 rbdiv.out |mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
255 | 
256 | 	LENGTH=$(( $LENGTH * 11 / 10 ))
257 | 
258 | 	cat rbasm.out <(echo "E") |sed -e 's/[0-9]*:[0-9]*://g' | mawk -v mlen=$LENGTH  '{
259 |                 if (NR == 1) e=$2;
260 |                 else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_A_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
261 |                 else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
262 |                 else if ($1 ~/C/) clus=$2;
263 |                 else if ($1 ~/L/) len=$2;
264 |                 else if ($1 ~/S/) seq=$2;
265 |                 else if ($1 ~/N/) freq=$2;
266 |                 else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
267 |                 else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~/^R 0/ && len <= mlen) {seq1 = seq; fclus=clus;lenf=len}
268 |                 else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~!/^R 0/ && len > mlen) {seq1 = seq; fclus=clus; len1=len}
269 |                 else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/ && $0 ~!/^R 0/ && len <= mlen) {seq1 = seq; fclus=clus; lenf=len}
270 |                 else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
271 |                 }' > rainbow.fasta
272 | 
273 | 
274 |         seqtk seq -r rainbow.fasta > rainbow.RC.fasta
275 |         mv rainbow.RC.fasta rainbow.fasta
276 | 
277 |         #The rainbow assembly is checked for overlap between newly assembled Forward and Reverse reads using the software PEAR
278 | 
279 |         grep -A1 "dDocent_A_Contig_" rainbow.fasta | mawk '!/^--/' | sed -e 's/dDocent_A_Contig_/dDocent_Contig_/g' > rainbow.asm.fasta
280 |         grep -A1 "dDocent_Contig_" rainbow.fasta | mawk '!/^--/' > rainbow.n.fasta
281 | 
282 |         sed -e 's/NNNNNNNNNN/	/g' rainbow.asm.fasta | cut -f1 | seqtk seq -F I - > ref.F.fq
283 |         sed -e 's/NNNNNNNNNN/	/g' rainbow.asm.fasta | cut -f2 | seqtk seq -F I - > ref.R.fq
284 | 
285 |         seqtk seq -r ref.R.fq > ref.RC.fq
286 |         mv ref.RC.fq ref.R.fq
287 |         LENGTH=$(mawk '!/>/' rainbow.fasta | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
288 |         LENGTH=$(( $LENGTH * 5 / 4))
289 | 
290 |         pearRM -f ref.F.fq -r ref.R.fq -o overlap -p 0.001 -j 20 -n $LENGTH &>kopt.log
291 | 
292 |         rm ref.F.fq ref.R.fq
293 | 
294 |         mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.assembled.fastq > overlap.fasta
295 |         mawk '/>/' overlap.fasta > overlap.loci.names
296 |         mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.forward.fastq > other.F
297 |         mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.reverse.fastq > other.R
298 |         paste other.F other.R | mawk '{if ($1 ~ />/) print $1; else print $0}' | sed -e 's/	/NNNNNNNNNN/g' > other.FR
299 | 
300 |         cat other.FR overlap.fasta rainbow.n.fasta > totalover.fasta
301 | 	paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/	/\'$'\n/g' > totalover.s.fasta
302 | 	mv totalover.s.fasta totalover.fasta
303 |         rm *.F *.R
304 | fi
305 | 
306 | if [[ "$ATYPE" == "HYB" ]];then
307 | 	parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.ua.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs
308 | 	AS=$(cat uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs | wc -l)
309 | 	if [ "$AS" -gt 1 ]; then
310 | 		cut -f2 uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs > totaluniqseq.ua
311 | 		mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq.ua > uniq.full.ua.fasta
312 | 		LENGTH=$(mawk '!/>/' uniq.full.ua.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
313 | 		LENGTH=$(($LENGTH * 3 / 4))
314 | 		seqtk seq -F I uniq.full.ua.fasta > uniq.ua.fq
315 | 		if [ "$NUMProc" -gt 8 ]; then
316 | 			NP=8
317 | 		else
318 | 			NP=$NUMProc
319 | 		fi
320 | 		fastp -i uniq.ua.fq -o uniq.ua.fq1 -w $NP -Q &>/dev/null
321 | 		mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.ua.fq1 > uniq.ua.fasta
322 | 		mawk '!/>/' uniq.ua.fasta > totaluniqseq.ua
323 | 		rm uniq.ua.fq*
324 | 		#Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
325 | 		sed -e 's/NNNNNNNNNN/	/g' uniq.ua.fasta | cut -f1 > uniq.F.ua.fasta
326 | 		CDHIT=$(python -c "print(max("$simC" - 0.1,0.8))")
327 | 		cd-hit-est -i uniq.F.ua.fasta -o xxx -c $CDHIT -T 0 -M 0 -g 1 -d 100 &>cdhit.log
328 | 		mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed -e 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids.ua
329 | 		paste sort.contig.cluster.ids.ua totaluniqseq.ua > contig.cluster.totaluniqseq.ua
330 | 		$sort -k2,2 -g -S 2G --parallel=$NUMProc contig.cluster.totaluniqseq.ua | sed -e 's/NNNNNNNNNN/	/g' > rcluster.ua
331 | 		#CD-hit output is converted to rainbow format
332 | 		rainbow div -i rcluster.ua -o rbdiv.ua.out -f 0.5 -K 10
333 | 		if [ "$ATYPE" == "PE" ]; then
334 | 			rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
335 | 		else
336 | 			rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
337 | 		fi
338 | 		
339 | 		#This AWK code replaces rainbow's contig selection perl script
340 | 		cat rbasm.ua.out <(echo "E") |sed -e 's/[0-9]*:[0-9]*://g' | mawk ' {
341 | 			if (NR == 1) e=$2;
342 | 			else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
343 | 			else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
344 | 			else if ($1 ~/C/) clus=$2;
345 | 			else if ($1 ~/L/) len=$2;
346 | 			else if ($1 ~/S/) seq=$2;
347 | 			else if ($1 ~/N/) freq=$2;
348 | 			else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
349 | 			else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
350 | 			else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
351 | 			}' > rainbow.ua.fasta
352 | 	
353 | 		seqtk seq -r rainbow.ua.fasta > rainbow.RC.fasta
354 | 		mv rainbow.RC.fasta rainbow.ua.fasta
355 | 	
356 | 		cat rainbow.ua.fasta uniq.fasta > totalover.fasta
357 | 		paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/	/\'$'\n/g' > totalover.s.fasta
358 | 		mv totalover.s.fasta totalover.fasta
359 | 	fi
360 | fi
361 | 
362 | if [[ "$ATYPE" != "PE" && "$ATYPE" != "RPE" && "$ATYPE" != "HYB" ]]; then
363 | 	cp uniq.fasta totalover.fasta
364 | 	paste <(mawk '{if (NR % 2) print $0}' totalover.fasta) <(mawk '{if (NR % 2 == 0) print $0}' totalover.fasta) | sort -V | sed -e 's/	/\'$'\n/g' > totalover.s.fasta
365 | 	mv totalover.s.fasta totalover.fasta
366 | fi
367 | cd-hit-est -i totalover.fasta -o reference.fasta.original -M 0 -T 0 -c $simC &>cdhit2.log
368 | 
369 | sed -e 's/^C/NC/g' -e 's/^A/NA/g' -e 's/^G/NG/g' -e 's/^T/NT/g' -e 's/T$/TN/g' -e 's/A$/AN/g' -e 's/C$/CN/g' -e 's/G$/GN/g' reference.fasta.original > reference.fasta
370 | 
371 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
372 | 	sed -i 's/dDocent/dDocentR/g' reference.fasta
373 | fi
374 | 
375 | samtools faidx reference.fasta &> index.log
376 | bwa index reference.fasta >> index.log 2>&1
377 | 
378 | SEQS=$(mawk 'END {print NR}' uniq.k.$CUTOFF.c.$CUTOFF2.seqs)
379 | TIGS=$(grep ">" -c reference.fasta)
380 | 
381 | echo -e "\ndDocent assembled $SEQS sequences (after cutoffs) into $TIGS contigs"
382 | 
383 | }
384 | 
385 | Reference $CUTOFF $CUTOFF2 $simC
386 | 
387 | 


--------------------------------------------------------------------------------
/scripts/remove.bad.hap.loci.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | export LC_ALL=en_US.UTF-8
 3 | 
 4 | if [ -z "$2" ]; then
 5 | echo "Correct usage is sh remove.bad.hap.loci.sh file_with_bad_Loci vcf_file"
 6 | exit 1
 7 | fi
 8 | 
 9 | NAME=$(echo $2 | sed -e 's/\.recode.*//g') 
10 | 
11 | grep -vwf <(cut -f1 $1) $2 > $NAME.filtered.vcf
12 | 


--------------------------------------------------------------------------------
/scripts/test:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | export LC_ALL=en_US.UTF-8
  3 | export SHELL=bash
  4 | v="2.7.8"
  5 | 
  6 | if [[ -z "$6" ]]; then
  7 | echo "Usage is sh ReferenceOpt.sh minK1 maxK1 minK2 maxK2 Assembly_Type Number_of_Processors"
  8 | echo -e "\n\n"
  9 | echo "Optionally, a new range of similarities can be entered as well:"
 10 | echo "ReferenceOpt.sh minK1 maxK1 minK2 maxK2 Assembly_Type Number_of_Processors minSim maxSim increment"
 11 | echo -e "\nFor example, to scale between 0.95 and 0.99 using 0.005 increments:\nReferenceOpt.sh minK1 maxK1 minK2 maxK2 Assembly_Type Number_of_Processors 0.95 0.99 0.005"
 12 | exit
 13 | fi
 14 | 
 15 | if ! sort --version | fgrep GNU &>/dev/null; then
 16 | 	sort=gsort
 17 | else
 18 | 	sort=sort
 19 | fi
 20 | 
 21 | DEP=(mawk samtools rainbow gnuplot seqtk cd-hit-est parallel pearRM fastp)
 22 | NUMDEP=0
 23 | for i in "${DEP[@]}"
 24 | do
 25 | 	if which $i &> /dev/null; then
 26 | 		foo=0
 27 | 	else
 28 |     		echo "The dependency" $i "is not installed or is not in your" '$PATH'"."
 29 |     		NUMDEP=$((NUMDEP + 1))
 30 | 	fi
 31 | done
 32 | 
 33 | 
 34 | FASTP=$(fastp -v 2>&1 | cut -f2 -d " ")
 35 | FASTP1=$(echo $FASTP | cut -f1 -d ".")
 36 | FASTP2=$(echo $FASTP | cut -f2 -d ".")
 37 | FASTP3=$(echo $FASTP | cut -f3 -d ".")
 38 | 	if [ "$FASTP1" -lt "2" ]; then
 39 | 		if [ "$FASTP2" -lt "20" ]; then
 40 | 			if [ "$FASTP2" -lt "5" ]; then
 41 | 				echo "The version of fastp installed in your" '$PATH' "is not optimized for dDocent."
 42 | 				echo "Please install version 0.19.5 or above"
 43 | 				exit 1
 44 | 			fi
 45 | 		fi
 46 | 	fi
 47 | 
 48 | 
 49 | ATYPE=$5
 50 | NUMProc=$6
 51 | 
 52 | if [[ -z "$7" ]]; then
 53 | 
 54 | 	minSim=0.8
 55 | 	maxSim=0.98
 56 | 	incSim=0.02
 57 | else
 58 | 
 59 | 	minSim=$7 
 60 | 	maxSim=$8
 61 | 	incSim=$9
 62 | fi
 63 | 
 64 | if [ $NUMDEP -gt 0 ]; then
 65 | 	echo -e "\nPlease install all required software before running ReferenceOpt again."
 66 | 	exit 1
 67 | else
 68 | 	echo -e "\nAll required software is installed!"
 69 | fi
 70 | 
 71 | echo "dDocent ReferenceOpt version $v"
 72 | 
 73 | ls *.F.fq.gz > namelist
 74 | sed -i'' -e 's/.F.fq.gz//g' namelist
 75 | NAMES=( `cat "namelist" `)
 76 | 
 77 | getAssemblyInfo(){
 78 | echo "nope!"
 79 | }
 80 | 
 81 | 
 82 | Reference(){
 83 | 
 84 | CUTOFF=$1
 85 | CUTOFF2=$2
 86 | simC=$3
 87 | 
 88 | AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
 89 | AWK2='!/>/'
 90 | AWK3='!/NNN/'
 91 | AWK4='{for(i=0;i<$1;i++)print}'
 92 | PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'
 93 | SED1='s/^[ \t]*//'
 94 | SED2='s/\s/\t/g'
 95 | FRL=$(gunzip -c ${NAMES[0]}.F.fq.gz | mawk '{ print length() | "sort -rn" }' | head -1)
 96 | 
 97 | special_uniq(){
 98 | 	mawk -v x=$1 '$1 >= x' $2  |cut -f2 | sed -e 's/NNNNNNNNNN/	/g' | cut -f1 | uniq
 99 | }
100 | export -f special_uniq
101 | 
102 | if [ ${NAMES[@]:(-1)}.F.fq.gz -nt ${NAMES[@]:(-1)}.uniq.seqs ];then
103 | 	if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
104 | 	#If PE assembly, creates a concatenated file of every unique for each individual in parallel
105 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
106 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
107 | 		if [ "$ATYPE" = "RPE" ]; then
108 | 			cat namelist | parallel --no-notice -j $NUMProc "paste {}.forward {}.reverse | $sort -k1 -S 200M > {}.fr"
109 | 			cat namelist | parallel --no-notice -j $NUMProc "cut -f1 {}.fr | uniq -c > {}.f.uniq && cut -f2 {}.fr > {}.r"
110 | 			cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK4' {}.f.uniq > {}.f.uniq.e" 
111 | 			cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.f.uniq.e {}.r | mawk '$AWK3'| sed 's/-/NNNNNNNNNN/' | sed -e '$SED1' | sed -e '$SED2'> {}.uniq.seqs"
112 | 			rm *.f.uniq.e *.f.uniq *.r *.fr
113 | 		else
114 | 			cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
115 | 		fi
116 | 		rm *.forward
117 | 		rm *.reverse
118 | 	fi
119 | 	
120 | 	if [ "$ATYPE" == "SE" ]; then
121 | 	#if SE assembly, creates files of every unique read for each individual in parallel
122 | 		cat namelist | parallel --no-notice -j $NUMProc "gunzip -c {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
123 | 	fi
124 | 	
125 | 	if [ "$ATYPE" == "OL" ]; then
126 | 	#If OL assembly, dDocent assumes that the marjority of PE reads will overlap, so the software PEAR is used to merge paired reads into single reads
127 | 		for i in "${NAMES[@]}";
128 |         		do
129 |         		gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
130 |         		done	
131 |         	MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
132 | 		LENGTH=$(( $MaxLen / 3))
133 | 		for i in "${NAMES[@]}"
134 | 			do
135 | 			pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH 
136 | 			done
137 | 		cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
138 | 	fi
139 | 	if [ "$ATYPE" == "HYB" ]; then
140 | 	#If HYB assembly, dDocent assumes some PE reads will overlap but that some will not, so the OL method performed and remaining reads are then put through PE method
141 | 		for i in "${NAMES[@]}";
142 |       		do
143 |       		gunzip -c $i.R.fq.gz | head -2 | tail -1 >> lengths.txt
144 |       		done	
145 |     		MaxLen=$(mawk '{ print length() | "sort -rn" }' lengths.txt| head -1)
146 |     		LENGTH=$(( $MaxLen / 3))
147 | 		for i in "${NAMES[@]}"
148 | 			do
149 | 			pearRM -f $i.F.fq.gz -r $i.R.fq.gz -o $i -j $NUMProc -n $LENGTH &>kopt.log
150 | 			done
151 | 		cat namelist | parallel --no-notice -j $NUMProc "mawk '$AWK1' {}.assembled.fastq | mawk '$AWK2' | perl -e '$PERLT' > {}.uniq.seqs"
152 | 		
153 | 		cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.forward.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
154 | 		cat namelist | parallel --no-notice -j $NUMProc "cat {}.unassembled.reverse.fastq | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
155 | 		cat namelist | parallel --no-notice -j $NUMProc "paste -d '-' {}.forward {}.reverse | mawk '$AWK3'| sed 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.ua.seqs"
156 | 		rm *.forward
157 | 		rm *.reverse
158 | 	fi	
159 | 	
160 | fi
161 | 
162 | #Create a data file with the number of unique sequences and the number of occurrences
163 | 
164 | if [ -f "uniq.seqs.gz" ]; then
165 | 	if [ uniq.seqs.gz -nt uniq.seqs ]; then
166 | 	gunzip uniq.seqs.gz 2>/dev/null
167 | 	fi
168 | fi
169 | 
170 | if [ ! -f "uniq.seqs" ]; then
171 | 	cat *.uniq.seqs > uniq.seqs
172 | fi
173 | 	
174 | if [[ -z $CUTOFF || -z $CUTOFF2 ]]; then
175 | getAssemblyInfo
176 | fi
177 | 
178 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
179 |   	parallel --no-notice -j $NUMProc --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs  | $sort --parallel=$NUMProc -S 2G | uniq -c > uniqCperindv
180 | else
181 | 	parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
182 | fi
183 | 
184 | #Now that data cutoffs have been chosen, reduce data set to specified set of unique reads, convert to FASTA format,
185 | #and remove reads with substantial amounts of adapters
186 | 
187 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
188 |   parallel --no-notice -j $NUMProc mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | sed 's/NNNNNNNNNN/-/' >  total.uniqs
189 |   cut -f 1 -d "-" total.uniqs > total.u.F
190 |   cut -f 2 -d "-" total.uniqs > total.u.R
191 |   paste total.u.F total.u.R | $sort -k1 --parallel=$NUMProc -S 2G > total.fr
192 |  
193 |   parallel --no-notice --env special_uniq special_uniq $CUTOFF {} ::: *.uniq.seqs  | $sort --parallel=$NUMProc -S 2G | uniq -c > total.f.uniq
194 |   join -1 2 -2 1 -o 1.1,1.2,2.2 total.f.uniq total.fr | mawk '{print $1 "\t" $2 "NNNNNNNNNN" $3}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
195 |   rm total.uniqs total.u.* total.fr total.f.uniq* 
196 |   
197 | else
198 | 	parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$CUTOFF2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.seqs
199 | fi
200 | $sort -k1 -r -n uniq.k.$CUTOFF.c.$CUTOFF2.seqs | cut -f 2 > totaluniqseq
201 | mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq > uniq.full.fasta
202 | LENGTH=$(mawk '!/>/' uniq.full.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
203 | LENGTH=$(($LENGTH * 3 / 4))
204 | seqtk seq -F I uniq.full.fasta > uniq.fq
205 | if [ "$NUMProc" -gt 8 ]; then
206 | 	NP=8
207 | else
208 | 	NP=$NUMProc
209 | fi
210 | fastp -i uniq.fq -o uniq.fq1 -w $NP -Q &> assemble.trim.log
211 | mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.fq1 | paste - - | sort -k1,1 -V | tr "\t" "\n" > uniq.fasta
212 | mawk '!/>/' uniq.fasta > totaluniqseq
213 | rm uniq.fq*
214 | 
215 | if [[ "$ATYPE" == "PE" || "$ATYPE" == "RPE" ]]; then
216 | 	pmerge(){
217 | 		num=$( echo $1 | sed 's/^0*//g')
218 | 		if [ "$num" -le 100 ]; then
219 | 			j=$num
220 | 			k=$(($num -1))
221 | 		else
222 | 			num=$(($num - 99))
223 |            		j=$(python -c "print ("$num" * 100)")
224 |                 	k=$(python -c "print ("$j" - 100)")
225 | 		fi
226 |                 mawk -v x="$j" -v y="$k" '$5 <= x && $5 > y'  rbdiv.out > rbdiv.out.$1
227 | 	   
228 | 	   	if [ -s "rbdiv.out.$1" ]; then
229 |            		rainbow merge -o rbasm.out.$1 -a -i rbdiv.out.$1 -r 2 -N10000 -R10000 -l 20 -f 0.75
230 |            	fi
231 |         }
232 | 	
233 | 	export -f pmerge
234 | 	
235 |         #Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
236 |         if [ "$ATYPE" == "PE" ]; then
237 | 		sed -e 's/NNNNNNNNNN/	/g' uniq.fasta | cut -f1 > uniq.F.fasta
238 | 	  	CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
239 | 	  	cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
240 | 	  	mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
241 | 	  	paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
242 |           
243 |      	else
244 |         	sed -e 's/NNNNNNNNNN/	/g' totaluniqseq | cut -f1 | $sort --parallel=$NUMProc -S 2G| uniq | mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' > uniq.F.fasta
245 | 		CDHIT=$(python -c "print (max("$simC" - 0.1,0.8))")
246 | 		cd-hit-est -i uniq.F.fasta -o xxx -c $CDHIT -T $NUMProc -M 0 -g 1 -d 100 &>cdhit.log
247 |   		mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids
248 |   		paste sort.contig.cluster.ids <(mawk '!/>/' uniq.F.fasta) > contig.cluster.Funiq
249 |   		sed -e 's/NNNNNNNNNN/	/g' totaluniqseq | $sort --parallel=$NUMProc -k1 -S 2G | mawk '{print $0 "\t" NR}'  > totaluniqseq.CN
250 |   		join -t $'\t' -1 3 -2 1 contig.cluster.Funiq totaluniqseq.CN -o 2.3,1.2,2.1,2.2 > contig.cluster.totaluniqseq
251 | 	fi	
252 | 	
253 | 	#CD-hit output is converted to rainbow format
254 | 	$sort -k2,2 -g contig.cluster.totaluniqseq -S 2G --parallel=$NUMProc | sed -e 's/NNNNNNNNNN/	/g' > rcluster
255 | 	rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
256 |         CLUST=(`tail -1 rbdiv.out | cut -f5`)
257 | 	CLUST1=$(( $CLUST / 100 + 1))
258 | 	CLUST2=$(( $CLUST1 + 100 ))
259 | 	
260 | 	seq -w 1 $CLUST2 | parallel --no-notice -j $NUMProc --env pmerge pmerge {}
261 | 	
262 |         cat rbasm.out.[0-9]* > rbasm.out
263 |         rm rbasm.out.[0-9]* rbdiv.out.[0-9]*
264 | 
265 | 	#This AWK code replaces rainbow's contig selection perl script
266 |   	cat rbasm.out <(echo "E") |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
267 | 		if (NR == 1) e=$2;
268 | 		else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
269 | 		else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
270 | 		else if ($1 ~/C/) clus=$2;
271 | 		else if ($1 ~/L/) len=$2;
272 | 		else if ($1 ~/S/) seq=$2;
273 | 		else if ($1 ~/N/) freq=$2;
274 | 		else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
275 | 		else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
276 | 		else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
277 | 		}' > rainbow.fasta
278 | 
279 | 	seqtk seq -r rainbow.fasta > rainbow.RC.fasta
280 | 	mv rainbow.RC.fasta rainbow.fasta
281 | 
282 | 	#The rainbow assembly is checked for overlap between newly assembled Forward and Reverse reads using the software PEAR
283 | 	sed -e 's/NNNNNNNNNN/	/g' rainbow.fasta | cut -f1 | seqtk seq -F I - > ref.F.fq
284 | 	sed -e 's/NNNNNNNNNN/	/g' rainbow.fasta | cut -f2 | seqtk seq -F I - > ref.R.fq
285 | 
286 | 	seqtk seq -r ref.R.fq > ref.RC.fq
287 | 	mv ref.RC.fq ref.R.fq
288 | 	LENGTH=$(mawk '!/>/' rainbow.fasta | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
289 | 	LENGTH=$(( $LENGTH * 5 / 4))
290 | 	
291 | 	pearRM -f ref.F.fq -r ref.R.fq -o overlap -p 0.001 -j $NUMProc -n $LENGTH &>kopt.log
292 | 
293 | 	rm ref.F.fq ref.R.fq
294 | 
295 | 	mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.assembled.fastq > overlap.fasta
296 | 	mawk '/>/' overlap.fasta > overlap.loci.names
297 | 	mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.forward.fastq > other.F
298 | 	mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' overlap.unassembled.reverse.fastq > other.R
299 | 	paste other.F other.R | mawk '{if ($1 ~ />/) print $1; else print $0}' | sed 's/	/NNNNNNNNNN/g' > other.FR
300 | 
301 | 	cat other.FR overlap.fasta > totalover.fasta
302 | 
303 | 	rm *.F *.R
304 | fi
305 | 
306 | if [[ "$ATYPE" == "HYB" ]];then
307 | 	parallel --no-notice mawk -v x=$CUTOFF \''$1 >= x'\' ::: *.uniq.ua.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$2 '$1 >= x' > uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs
308 | 	AS=$(cat uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs | wc -l)
309 | 	if [ "$AS" -gt 1 ]; then
310 | 		cut -f2 uniq.k.$CUTOFF.c.$CUTOFF2.ua.seqs > totaluniqseq.ua
311 | 		mawk '{c= c + 1; print ">dDocent_Contig_" c "\n" $1}' totaluniqseq.ua > uniq.full.ua.fasta
312 | 		LENGTH=$(mawk '!/>/' uniq.full.ua.fasta  | mawk '(NR==1||length<shortest){shortest=length} END {print shortest}')
313 | 		LENGTH=$(($LENGTH * 3 / 4))
314 | 		seqtk seq -F I uniq.full.ua.fasta > uniq.ua.fq
315 | 		if [ "$NUMProc" -gt 8 ]; then
316 | 			NP=8
317 | 		else
318 | 			NP=$NumProc
319 | 		fi
320 | 		fastp -i uniq.ua.fq -o uniq.ua.fq1 -w $NP -Q &>/dev/null
321 | 		mawk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' uniq.ua.fq1 > uniq.ua.fasta
322 | 		mawk '!/>/' uniq.ua.fasta > totaluniqseq.ua
323 | 		rm uniq.ua.fq*
324 | 		#Reads are first clustered using only the Forward reads using CD-hit instead of rainbow
325 | 		sed -e 's/NNNNNNNNNN/	/g' uniq.ua.fasta | cut -f1 > uniq.F.ua.fasta
326 | 		CDHIT=$(python -c "print(max("$simC" - 0.1,0.8))")
327 | 		cd-hit-est -i uniq.F.ua.fasta -o xxx -c $CDHIT -T 0 -M 0 -g 1 -d 100 &>cdhit.log
328 | 		mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed 's/[>dDocent_Contig_,...]//g' | $sort -g -k1 -S 2G --parallel=$NUMProc > sort.contig.cluster.ids.ua
329 | 		paste sort.contig.cluster.ids.ua totaluniqseq.ua > contig.cluster.totaluniqseq.ua
330 | 		$sort -k2,2 -g -S 2G --parallel=$NUMProc contig.cluster.totaluniqseq.ua | sed -e 's/NNNNNNNNNN/	/g' > rcluster.ua
331 | 		#CD-hit output is converted to rainbow format
332 | 		rainbow div -i rcluster.ua -o rbdiv.ua.out -f 0.5 -K 10
333 | 		if [ "$ATYPE" == "PE" ]; then
334 | 			rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
335 | 		else
336 | 			rainbow merge -o rbasm.ua.out -a -i rbdiv.ua.out -r 2 -N10000 -R10000 -l 20 -f 0.75
337 | 		fi
338 | 		
339 | 		#This AWK code replaces rainbow's contig selection perl script
340 | 		cat rbasm.ua.out <(echo "E") |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
341 | 			if (NR == 1) e=$2;
342 | 			else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
343 | 			else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_UA_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
344 | 			else if ($1 ~/C/) clus=$2;
345 | 			else if ($1 ~/L/) len=$2;
346 | 			else if ($1 ~/S/) seq=$2;
347 | 			else if ($1 ~/N/) freq=$2;
348 | 			else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
349 | 			else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
350 | 			else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
351 | 			}' > rainbow.ua.fasta
352 | 	
353 | 		seqtk seq -r rainbow.ua.fasta > rainbow.RC.fasta
354 | 		mv rainbow.RC.fasta rainbow.ua.fasta
355 | 	
356 | 		cat rainbow.ua.fasta uniq.fasta > totalover.fasta
357 | 
358 | 	fi
359 | fi
360 | 
361 | if [[ "$ATYPE" != "PE" && "$ATYPE" != "RPE" && "$ATYPE" != "HYB" ]]; then
362 | 	cp uniq.fasta totalover.fasta
363 | fi
364 | cd-hit-est -i totalover.fasta -o reference.fasta.original -M 0 -T 0 -c $simC &>cdhit2.log
365 | 
366 | sed -e 's/^C/NC/g' -e 's/^A/NA/g' -e 's/^G/NG/g' -e 's/^T/NT/g' -e 's/T$/TN/g' -e 's/A$/AN/g' -e 's/C$/CN/g' -e 's/G$/GN/g' reference.fasta.original > reference.fasta
367 | 
368 | if [[ "$ATYPE" == "RPE" || "$ATYPE" == "ROL" ]]; then
369 | 	sed -i 's/dDocent/dDocentR/g' reference.fasta
370 | fi
371 | 
372 | samtools faidx reference.fasta &> index.log
373 | bwa index reference.fasta >> index.log 2>&1
374 | 
375 | SEQS=$(mawk 'END {print NR}' uniq.k.$CUTOFF.c.$CUTOFF2.seqs)
376 | TIGS=$(grep ">" -c reference.fasta)
377 | 
378 | #echo -e "\ndDocent assembled $SEQS sequences (after cutoffs) into $TIGS contigs"
379 | echo $SEQS
380 | }
381 | 
382 | rm kopt.data &>/dev/null
383 | 
384 | for ((P = $1; P <= $2; P++))
385 | 	do
386 | 	for ((i = $3; i <= $4; i++))
387 | 	do
388 | 	X=$(($P + $i))
389 | 	if [ "$X" != "2" ]; then
390 | 		for j in $(seq $minSim $incSim $maxSim)
391 | 		do
392 | 		echo "K1 is $P" "K2 is $i" "c is $j"
393 | 		SEQS=$(Reference $P $i $j)
394 | 		echo $P $i $j $SEQS >> kopt.data
395 | 		done
396 | 	fi	
397 | 	done
398 | done
399 | 
400 | cut -f4 -d " " kopt.data > plot.kopt.data
401 | gnuplot << \EOF
402 | set terminal dumb size 120, 30
403 | set autoscale
404 | unset label
405 | set title "Histogram of number of reference contigs"
406 | set ylabel "Number of Occurrences"
407 | set xlabel "Number of reference contigs"
408 | max = `sort -g plot.kopt.data | tail -1`
409 | binwidth = max/250.0
410 | bin(x,width)=width*floor(x/width) + binwidth/2.0
411 | #set xtics 10
412 | plot 'plot.kopt.data' using (bin($1,binwidth)):(1.0) smooth freq with boxes
413 | pause -1
414 | EOF
415 | 
416 | AF=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' plot.kopt.data)
417 | echo "Average contig number = $AF"
418 | echo "The top three most common number of contigs"
419 | echo -e "X\tContig number"
420 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' plot.kopt.data | sort -k1 -g -r | head -3
421 | echo "The top three most common number of contigs (with values rounded)"
422 | echo -e "X\tContig number"
423 | while read NAME; do python -c "print(round($NAME,-2))"; done < plot.kopt.data | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | sort -g -r | head -3 | sed "s/^[ \t]*//"
424 | 


--------------------------------------------------------------------------------
/scripts/untested/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpuritz/dDocent/414d6f3f3fba0be75d01da2797629c3808571113/scripts/untested/.DS_Store


--------------------------------------------------------------------------------
/scripts/untested/BS_reference_to_fasta.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #sed -i 's/\..*//g' $1
 4 | 
 5 | NAMES=( `mawk '!/#/' $1 | cut -f1 | sort | uniq `)
 6 | 
 7 | rm outliers.fasta
 8 | 
 9 | for i in "${NAMES[@]}"
10 | do
11 | grep -wA1 $i reference.fasta >> outliers.fasta 
12 | done
13 | 
14 | 


--------------------------------------------------------------------------------
/scripts/untested/ErrorCount.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "This script counts the number of potential genotyping errors due to low read depth"
 4 | echo "It report a low range, based on a 50% binomial probability of observing the second allele in a heterozygote and a high range based on a 25% probability."
 5 | 
 6 | R1H01S0=$(grep -oh '0[/|]1:1:' $1 | wc -l) 
 7 | R1H01S1=$(grep -oh '1[/|]0:1:' $1 | wc -l) 
 8 | R1H0S1=$(grep -oh '0[/|]0:1:' $1 | wc -l) 
 9 | R1H1S1=$(grep -oh '1[/|]1:1:' $1 | wc -l) 
10 | 
11 | R1GEN=$(python -c "print $R1H01S0+$R1H01S1+$R1H0S1+$R1H1S1")
12 | R1ERR1=$(python -c "print $R1GEN/2")
13 | R1ERR2=$(python -c "print $R1GEN*0.75")
14 | 
15 | echo "Potential genotyping errors from genotypes from only 1 read range from $R1ERR1 to $R1ERR2" 
16 | 
17 | R2H01S0=$(grep -oh '0[/|]1:2:[02]' $1 | wc -l) 
18 | R2H01S2=$(grep -oh '1[/|]0:2:[02]' $1 | wc -l) 
19 | R2H0S1=$(grep -oh '0[/|]0:2:' $1 | wc -l) 
20 | R2H1S1=$(grep -oh '1[/|]1:2:' $1 | wc -l)
21 | 
22 | R2GEN=$(python -c "print $R2H01S0+$R2H01S2+$R2H1S1+$R2H0S1")
23 | R2ERR1=$(python -c "print $R2GEN/4")
24 | R2ERR2=$(python -c "print $R2GEN* 0.5625")
25 | 
26 | echo "Potential genotyping errors from genotypes from only 2 reads range from $R2ERR1 to $R2ERR2" 
27 | 
28 | R3H01S0=$(grep -oh '0[/|]1:3:[03]' $1 | wc -l) 
29 | R3H01S2=$(grep -oh '1[/|]0:3:[03]' $1 | wc -l) 
30 | R3H0S1=$(grep -oh '0[/|]0:3:' $1 | wc -l)
31 | R3H1S3=$(grep -oh '1[/|]1:3:' $1 | wc -l)
32 | 
33 | R3GEN=$(python -c "print $R3H01S0+$R3H01S2+$R3H0S1+$R3H1S3")
34 | R3ERR1=$(python -c "print $R3GEN/8")
35 | R3ERR2=$(python -c "print $R3GEN*0.42")
36 | 
37 | echo "Potential genotyping errors from genotypes from only 3 reads range from $R3ERR1 to $R3ERR2"
38 | 
39 | R4H0=$(grep -oh '0[/|]0:4:' $1 | wc -l)
40 | R4H1=$(grep -oh '1[/|]1:4:' $1 | wc -l)
41 | 
42 | R4GEN=$(python -c "print $R4H0+$R4H1")
43 | R4ERR1=$(python -c "print $R4GEN/16")
44 | R4ERR2=$(python -c "print $R4GEN*0.316")
45 | 
46 | echo "Potential genotyping errors from genotypes from only 4 reads range from $R4ERR1 to $R4ERR2"
47 | 
48 | R5H0=$(grep -oh '0[/|]0:5:' $1 | wc -l)
49 | R5H1=$(grep -oh '1[/|]1:5:' $1 | wc -l)
50 | 
51 | R5GEN=$(python -c "print $R5H0+$R5H1")
52 | R5ERR1=$(python -c "print $R5GEN/32")
53 | R5ERR2=$(python -c "print int(	$R5GEN*0.237)")
54 | 
55 | echo "Potential genotyping errors from genotypes from only 5 reads range from $R5ERR1 to $R5ERR2"
56 | 
57 | IND=$(mawk '/#/' $1 | tail -1 | wc -w)
58 | IND=$(($IND - 9))
59 | LOCI=$(mawk '!/#/' $1 | wc -l)
60 | MISSING=$(grep -Fwo ./.:. $1 | wc -l)
61 | GENO=$(( $IND * $LOCI  ))
62 | 
63 | echo $IND "number of individuals and" $LOCI "equals" $GENO "total genotypes"
64 | GENO=$(( $GENO - $MISSING))
65 | echo Total genotypes not counting missing data $GENO
66 | 
67 | TOTERR1=$(python -c "print $R1ERR1+$R2ERR1+$R3ERR1+$R4ERR1+$R5ERR1")
68 | TOTERR2=$(python -c "print $R1ERR2+$R2ERR2+$R3ERR2+$R4ERR2+$R5ERR2")
69 | ERRRATEL=$(python -c "print $TOTERR1/float($GENO)")
70 | ERRRATEH=$(python -c "print $TOTERR2/float($GENO)")
71 | 
72 | echo "Total potential error rate is between $ERRRATEL and $ERRRATEH"
73 | 
74 | ALL=$(($R1GEN+$R2GEN+$R3GEN+$R4GEN+$R5GEN))
75 | ERRALL=$(python -c "print $ALL/float($GENO)")
76 | 
77 | echo "SCORCHED EARTH SCENARIO"
78 | echo "WHAT IF ALL LOW DEPTH HOMOZYGOTE GENOTYPES ARE ERRORS?????"
79 | echo "The total SCORCHED EARTH error rate is $ERRALL."
80 | 


--------------------------------------------------------------------------------
/scripts/untested/Filter_VCF_best_SNP_per_contig.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #Script to take the best SNP from every contig in a vcffile
 4 | 
 5 | if [[ -z "$1" ]]; then
 6 | echo "Usage is bash Filter_VCF_best_SNP_per_contig.sh vcffile"
 7 | exit 1
 8 | fi
 9 | 
10 | NAME=$(echo $1 | sed -e 's/\.recode.*//g') 
11 | 
12 | cat $1 | mawk 'BEGIN{last_loc = 0} { 
13 | 		if ($1 ~/#/) print $0;
14 | 		else if ($1 ~!/#/ && last_loc == 0) {last_contig=$0; last_loc=$1; last_qual=$6}
15 | 		else if ($1 == last_loc && $6 > last_qual) {last_contig=$0; last_loc=$1; last_qual=$6}
16 | 		else if ($1 != last_loc) {print last_contig; last_contig=$0; last_loc=$1; last_qual=$6}
17 | 		} END{print last_contig}' > $NAME.filtered1SNPper.vcf
18 | 
19 | 
20 | echo "Filtered VCF file is saved under name" $NAME.filtered1SNPper.vcf
21 | 
22 | 


--------------------------------------------------------------------------------
/scripts/untested/Filter_one_random_snp_per_contig.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | #Script to take random SNP from every contig in a vcffile
 4 | 
 5 | if [[ -z "$1" ]]; then
 6 | echo "Usage is bash Filter_one_random_snp_per_contig.sh vcffile"
 7 | exit 1
 8 | fi
 9 | 
10 | #Calculate number of SNPs
11 | Loci=(`mawk '!/#/' $1 | wc -l `)
12 | 
13 | #Generate list of random numbers
14 | seq 1 500000 | shuf | head -$Loci > nq
15 | 
16 | #create temporary file that has a random number assigned to each SNP in first column
17 | cat <(mawk '/^#/' $1) <(paste <(mawk '!/#/' $1 | cut -f1-5) nq <(mawk '!/#/' $1 | cut -f7- ) )> temp
18 | 
19 | #Get name of VCF file
20 | NAME=$(echo $1 | sed -e 's/\.recode.*//g' | sed -e 's/.vcf//g' ) 
21 | 
22 | #Use awk (mawk) to parse file and select one snp per contig (one with largest random number)
23 | cat temp | mawk 'BEGIN{last_loc = 0} { 
24 | 		if ($1 ~/#/) print $0;
25 | 		else if ($1 ~!/#/ && last_loc == 0) {last_contig=$0; last_loc=$1; last_qual=$6}
26 | 		else if ($1 == last_loc && $6 > last_qual) {last_contig=$0; last_loc=$1; last_qual=$6}
27 | 		else if ($1 != last_loc) {print last_contig; last_contig=$0; last_loc=$1; last_qual=$6}
28 | 		} END{print last_contig}' | mawk 'NF > 0' > $NAME.filtered1SNPper.vcf
29 | 
30 | #Remove temp file
31 | rm temp
32 | 
33 | #Announce triumphant completion
34 | echo "Filtered VCF file is saved under name" $NAME.filtered1SNPper.vcf
35 | 


--------------------------------------------------------------------------------
/scripts/untested/ReferenceOpt.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ -z "$2" ]]; then
 4 | echo "Usage is sh ReferenceOpt.sh lowK highK"
 5 | exit 1
 6 | fi
 7 | 
 8 | Reference(){
 9 | 
10 | 
11 | if [ -f "uniq.seqs.gz" ]; then
12 | 	if [ uniq.seqs.gz -nt uniq.seqs ]; then
13 | 	gunzip uniq.seqs.gz 2>/dev/null
14 | 	fi
15 | fi
16 | 
17 | mawk -v x=$1 '$1 >= x' uniq.seqs | cut -f 2 > totaluniqseq
18 | rm uniq.fasta &>/dev/null
19 | cat totaluniqseq | while read line
20 | 	do
21 | 	echo ">Contig"$i >>uniq.fasta
22 | 	echo $line >> uniq.fasta
23 | 	i=$(($i + 1))
24 | 	done
25 | 
26 | sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f1 > uniq.F.fasta
27 | sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f2 > uniq.R.fasta
28 | 
29 | seqtk seq -r uniq.R.fasta > uniq.RC.fasta
30 | rm uniq.R.fasta
31 | 
32 | #Now use rainbow to cluster and assemble reads into longer contigs
33 | rainbow cluster -m 6 -1 uniq.F.fasta -2 uniq.RC.fasta > rcluster 2> rainbow.log
34 | rainbow div -i rcluster -o rbdiv.out -f 0.01
35 | rainbow merge -o rbasm.out -a -i rbdiv.out -r 2
36 | select_best_rbcontig_plus_read1.pl rbasm.out rbdiv.out >rainbow.fasta
37 | 
38 | #cd-hit to cluster reads based on sequence similarity
39 | cd-hit-est -i rainbow.fasta -o reference.fasta -M 0 -T 0 -c $2 &>cdhit.log
40 | 
41 | SEQS=$(cat reference.fasta | wc -l)
42 | SEQS=$(($SEQS / 2 ))
43 | echo $SEQS
44 | }
45 | 
46 | rm kopt.data &>/dev/null
47 | 
48 | for ((i = $1; i <= $2; i++))
49 | do
50 | echo "K is $i" "c is 0.80"
51 | SEQS=$(Reference $i 0.80)
52 | echo $i 0.80 $SEQS >> kopt.data
53 | 	for j in {0.82,0.84,0.86,0.88,0.9,0.92,0.94,0.96,0.98}
54 | 	do
55 | 	echo "K is $i" "c is $j"
56 | 	cd-hit-est -i rainbow.fasta -o reference.fasta -M 0 -T 0 -c $j &>cdhit.log
57 | 	SEQS=$(cat reference.fasta | wc -l)
58 | 	SEQS=$(($SEQS / 2 ))
59 | 	echo $i $j $SEQS >> kopt.data
60 | 	done
61 | done
62 | 
63 | cut -f3 -d " " kopt.data > plot.kopt.data
64 | gnuplot << \EOF
65 | set terminal dumb size 120, 30
66 | set autoscale
67 | unset label
68 | set title "Histogram of number of reference contigs"
69 | set ylabel "Number of Occurrences"
70 | set xlabel "Number of reference contigs"
71 | max = `sort -g plot.kopt.data | tail -1`
72 | binwidth = max/250.0
73 | bin(x,width)=width*floor(x/width) + binwidth/2.0
74 | #set xtics 10
75 | plot 'plot.kopt.data' using (bin($1,binwidth)):(1.0) smooth freq with boxes
76 | pause -1
77 | EOF
78 | 
79 | AF=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' plot.kopt.data)
80 | echo "Average contig number = $AF"
81 | echo "The top three most common number of contigs"
82 | echo -e "X\tContig number"
83 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' plot.kopt.data | sort -k1 -g -r | head -3
84 | echo "The top three most common number of contigs (with values rounded)"
85 | echo -e "X\tContig number"
86 | while read NAME; do python -c "print round($NAME,-2)"; done < plot.kopt.data | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | sort -g -r | head -3 | sed "s/^[ \t]*//"
87 | 


--------------------------------------------------------------------------------
/scripts/untested/VCFtoOutlierOnly.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ -z "$1" ]]; then
 4 | echo "Usage: VCFtoOutlierOnly.sh Vcffile BayescanOutput FDR Prefix_for_output"
 5 | exit 1
 6 | fi
 7 | 
 8 | mawk '!/#/' $1 | cut -f1,2 > totalloci
 9 | mawk '!/qval/' $2 | cut -f2-6 > BS.noheader
10 | paste totalloci BS.noheader | mawk -v x=$3 '$6 <= x' | cut -f1,2 > Outlier.list
11 | vcftools --vcf $1 --positions Outlier.list --recode --recode-INFO-all --out $4.outlieronly
12 | vcftools --vcf $1 --exclude-positions Outlier.list --recode --recode-INFO-all --out $4.neutralonly
13 | 


--------------------------------------------------------------------------------
/scripts/untested/count_hets.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | vcftools --vcf $1 --max-alleles 2 --recode --out out --site-depth &>/dev/null
 3 | HET=( `mawk '!/#/ {print $1,gsub(/0[\/\|]1/,"")}' out.recode.vcf | cut -f2 -d " "` )
 4 | HETT=( `mawk '!/#/ {print $1,gsub(/1[\/\|]0/,"")}' out.recode.vcf | cut -f2 -d " "` )
 5 | DEPTH=( `mawk '!/SUM/' "out.ldepth" | cut -f3`)
 6 | 
 7 | LEN=${#HET[@]}
 8 | LEN=$((LEN - 1))
 9 | rm hapcounts 2>/dev/null
10 | 
11 | for ((i = 0; i <= $LEN; i++));
12 | do
13 | HETTT=$((${HET[$i]} + ${HETT[$i]} ))
14 | echo $HETTT ${DEPTH[$i]} >> hapcounts
15 | done
16 | 
17 | gnuplot << \EOF
18 | set terminal dumb size 120, 30
19 | set autoscale
20 | f(x) = a * x + b
21 | fit f(x) "hapcounts" using 2:1 via b, a
22 | unset label
23 | set title "Number of Unique Sequences with More than X Occurrences"
24 | set xlabel "Number of Occurrences"
25 | set ylabel "Number of Unique Sequences
26 | plot 'hapcounts' using 2:1 with dots, f(x) title "Model Fit" with points
27 | pause -1
28 | EOF
29 | 


--------------------------------------------------------------------------------
/scripts/untested/count_loci:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | mawk '!/#/' $1 | cut -f1 | sort | uniq | wc -l
4 | 


--------------------------------------------------------------------------------
/scripts/untested/dDocent_filters:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | echo "This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth," 
  4 | echo "quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation."
  5 | echo -e "The script assumes that loci and individuals with low call rates (or depth) have already been removed. \n"
  6 | echo -e "Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters \n"
  7 | 
  8 | #Checks for correct ususage
  9 | 
 10 | if [[ -z "$2" ]]; then
 11 | echo "Usage is sh FB_filters.sh VCF_file Output_prefix"
 12 | exit 1
 13 | fi
 14 | 
 15 | #Filteres out sites with that on average have heterzygotes with less than a 0.28 allele balance between reads from each allele and Quality / Depth < 0.5 and ratio of mapping quality for reference and alternate alleles
 16 | 
 17 | vcffilter -f "AB > 0.28 | AB < 0.01" -s -g "QR > 0 | QA > 0 " $1 | vcffilter -f "QUAL / DP > 0.25" | vcffilter -f "MQM / MQMR > 0.9 & MQM / MQMR < 1.05" > $2
 18 | FILTERED=$(mawk '!/#/' $2 | wc -l)
 19 | OLD=$(mawk '!/#/' $1 | wc -l)
 20 | NUMFIL=$(($OLD - $FILTERED))
 21 | echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n"
 22 | echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n" > $2.filterstats
 23 | 
 24 | #Filters out loci that have reads from both strands, with some leeway for a bad individual or two
 25 | 
 26 | vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 50" -s $2 > $2.filAB.vcf
 27 | FILTERED=$(mawk '!/#/' $2.filAB.vcf | wc -l)
 28 | OLD=$(mawk '!/#/' $2 | wc -l)
 29 | NUMFIL=$(($OLD - $FILTERED))
 30 | echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n"
 31 | echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
 32 | 
 33 | #Filters out loci that have reads from both paired and unpaired reads
 34 | 
 35 | echo -e "Is this from a mixture of SE and PE libraries? Enter yes or no."
 36 | 
 37 | read PE
 38 | 
 39 | if [ "$PE" != "yes" ]; then
 40 | vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s $2.filAB.vcf > $2.fil.vcf
 41 | FILTERED=$(mawk '!/#/' $2.fil.vcf  | wc -l)
 42 | OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
 43 | NUMFIL=$(($OLD - $FILTERED))
 44 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
 45 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
 46 | 
 47 | else
 48 | vcffilter -f "PAIRED < 0.005 & PAIREDR > 0.005 | PAIRED > 0.005 & PAIREDR < 0.005" -t NP -F PASS -A $2.filAB.vcf | mawk '!/NP/' > $2.fil.vcf
 49 | FILTERED=$(mawk '!/#/' $2.fil.vcf  | wc -l)
 50 | OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
 51 | NUMFIL=$(($OLD - $FILTERED))
 52 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
 53 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
 54 | 
 55 | fi
 56 | 
 57 | #Uses the VCF file to estimate the original number of individuals in the VCF file
 58 | #This is important because the INFO flags are based on this number
 59 | IND=$(grep -o -e 'NS=[0-9]*' $1 | sed s/NS=//g | sort | tail -1)
 60 | IND=$(($IND - 0 ))
 61 | 
 62 | #Creates a file with the original site depth and qual for each locus
 63 | cut -f8 $1 | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > $1.DEPTH
 64 | mawk '!/#/' $1 | cut -f1,2,6 > $1.loci.qual
 65 | 
 66 | #Calculates the average depth and standard deviation
 67 | DEPTH=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.DEPTH)
 68 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.DEPTH)
 69 | DEPTH=$(python -c "print int("$DEPTH") + int("$SD")")
 70 | #DEPTH=$(python -c "print int("$DEPTH"+100*("$DEPTH"**0.5))")
 71 | 
 72 | #Filters loci above the mean depth + 1 standard deviation that have quality scores that are less than 2*DEPTH
 73 | paste $1.loci.qual $1.DEPTH | mawk -v x=$DEPTH '$4 > x'| mawk '$3 < 2 * $4' > $1.lowQDloci
 74 | LQDL=$(cat $1.lowQDloci | wc -l)
 75 | OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
 76 | echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n"  
 77 | echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n" >> $2.filterstats
 78 | 
 79 | #Recalculates site depth for sites that have not been previously filtered
 80 | vcftools --vcf $2.fil.vcf --remove-filtered NP --exclude-positions $1.lowQDloci --site-depth --out $1 > /dev/null
 81 | cut -f3 $1.ldepth > $1.site.depth
 82 | 
 83 | DP=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.site.depth)
 84 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.site.depth)
 85 | 
 86 | #Calculates actual number of individuals in VCF file
 87 | #This is important because loci will now be filtered by mean depth calculated with individuals present in VCF
 88 | IND=$(mawk '/#/' $1 | tail -1 | wc -w)
 89 | IND=$(($IND - 9))
 90 | 
 91 | mawk '!/D/' $1.site.depth | mawk -v x=$IND '{print $1/x}' > meandepthpersite
 92 | 
 93 | gnuplot << \EOF >> $2.filterstats
 94 | set terminal dumb size 120, 30
 95 | set autoscale
 96 | set xrange [10:150] 
 97 | unset label
 98 | set title "Histogram of mean depth per site"
 99 | set ylabel "Number of Occurrences"
100 | set xlabel "Mean Depth"
101 | #set yr [0:100000]
102 | binwidth=1
103 | bin(x,width)=width*floor(x/width) + binwidth/2.0
104 | set xtics 5
105 | plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
106 | pause -1
107 | EOF
108 | 
109 | gnuplot << \EOF 
110 | set terminal dumb size 120, 30
111 | set autoscale
112 | set xrange [10:150] 
113 | unset label
114 | set title "Histogram of mean depth per site"
115 | set ylabel "Number of Occurrences"
116 | set xlabel "Mean Depth"
117 | #set yr [0:100000]
118 | binwidth=1
119 | bin(x,width)=width*floor(x/width) + binwidth/2.0
120 | set xtics 5
121 | plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
122 | pause -1
123 | EOF
124 | 
125 | 
126 | #Calculates a mean depth cutoff to use for filtering
127 | DP=$(python -c "print ($DP+ 1.645*$SD) / $IND")
128 | echo "If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be" $DP
129 | echo "Would you like to use a different maximum mean depth cutoff, yes or no"
130 | 
131 | read NEWCUTOFF
132 | 
133 | if [ "$NEWCUTOFF" != "yes" ]; then
134 | 
135 | echo -e "Maximum mean depth cutoff is" $DP >> $2.filterstats
136 | 
137 | #Combines all filters to create a final filtered VCF file
138 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $DP --recode > /dev/null
139 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
140 | OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
141 | NUMFIL=$(($OLD - $FILTERED))
142 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
143 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
144 | 
145 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $DP --exclude-positions $1.lowQDloci --recode > /dev/null
146 | 
147 | OLD=$(mawk '!/#/' $1 | wc -l)
148 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
149 | NUMFIL=$(($OLD - $FILTERED))
150 | 
151 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n"
152 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
153 | 
154 | echo -e "Remaining sites\n" $FILTERED "\n"
155 | echo -e "Remaining sites\n" $FILTERED "\n" >> $2.filterstats
156 | 
157 | echo -e "Filtered VCF file is called Output_prefix.FIL.recode.vcf\n"
158 | echo "Filter stats stored in $2.filterstats"
159 | 
160 | else
161 | echo "Please enter new cutoff"
162 | read DP
163 | echo -e "Maximum mean depth cutoff is" $DP >> $2.filterstats
164 | #Combines all filters to create a final filtered VCF file
165 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $DP --recode > /dev/null
166 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
167 | OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
168 | NUMFIL=$(($OLD - $FILTERED))
169 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
170 | echo -e "Number of sites filtered based on maximum mean depth\nn" $NUMFIL "of" $OLD "\n" >> $2.filterstats
171 | 
172 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $DP --exclude-positions $1.lowQDloci --recode > /dev/null
173 | 
174 | OLD=$(mawk '!/#/' $1 | wc -l)
175 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
176 | NUMFIL=$(($OLD - $FILTERED))
177 | 
178 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n"
179 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
180 | 
181 | echo -e "Remaining sites\n" $FILTERED "\n"
182 | echo -e "Remaining sites\n" $FILTERED "\n" >> $2.filterstats
183 | 
184 | echo -e "Filtered VCF file is called Output_prefix.FIL.recode.vcf\n"
185 | echo "Filter stats stored in $2.filterstats"
186 | 
187 | fi
188 | 


--------------------------------------------------------------------------------
/scripts/untested/dDocent_filters_lite:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | echo "This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth," 
  4 | echo "quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation."
  5 | echo -e "The script assumes that loci and individuals with low call rates (or depth) have already been removed. \n"
  6 | echo -e "Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters \n"
  7 | 
  8 | #Checks for correct ususage
  9 | 
 10 | if [[ -z "$2" ]]; then
 11 | echo "Usage is sh FB_filters.sh VCF_file Output_prefix"
 12 | exit 1
 13 | fi
 14 | 
 15 | #Filteres out sites with that on average have heterzygotes with less than a 0.28 allele balance between reads from each allele and Quality / Depth < 0.5 and ratio of mapping quality for reference and alternate alleles
 16 | 
 17 | vcffilter -f "AB > 0.25 | AB < 0.01" -s -g "QR > 0 | QA > 0 " $1 | vcffilter -f "QUAL / DP > 0.25" | vcffilter -f "MQM / MQMR > 0.75 & MQM / MQMR < 1.25" > $2
 18 | FILTERED=$(mawk '!/#/' $2 | wc -l)
 19 | OLD=$(mawk '!/#/' $1 | wc -l)
 20 | NUMFIL=$(($OLD - $FILTERED))
 21 | echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n"
 22 | echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n" > $2.filterstats
 23 | 
 24 | #Filters out loci that have reads from both strands, with some leeway for a bad individual or two
 25 | 
 26 | vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 50" -s $2 > $2.filAB.vcf
 27 | FILTERED=$(mawk '!/#/' $2.filAB.vcf | wc -l)
 28 | OLD=$(mawk '!/#/' $2 | wc -l)
 29 | NUMFIL=$(($OLD - $FILTERED))
 30 | echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n"
 31 | echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
 32 | 
 33 | #Filters out loci that have reads from both paired and unpaired reads
 34 | 
 35 | echo -e "Is this from a mixture of SE and PE libraries? Enter yes or no."
 36 | 
 37 | read PE
 38 | 
 39 | if [ "$PE" != "yes" ]; then
 40 | vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s $2.filAB.vcf > $2.fil.vcf
 41 | FILTERED=$(mawk '!/#/' $2.fil.vcf  | wc -l)
 42 | OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
 43 | NUMFIL=$(($OLD - $FILTERED))
 44 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
 45 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
 46 | 
 47 | else
 48 | vcffilter -f "PAIRED < 0.005 & PAIREDR > 0.005 | PAIRED > 0.005 & PAIREDR < 0.005" -t NP -F PASS -A $2.filAB.vcf | mawk '!/NP/' > $2.fil.vcf
 49 | FILTERED=$(mawk '!/#/' $2.fil.vcf  | wc -l)
 50 | OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
 51 | NUMFIL=$(($OLD - $FILTERED))
 52 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
 53 | echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
 54 | 
 55 | fi
 56 | 
 57 | #Uses the VCF file to estimate the original number of individuals in the VCF file
 58 | #This is important because the INFO flags are based on this number
 59 | IND=$(grep -o -e 'NS=[0-9]*' $1 | sed s/NS=//g | sort | tail -1)
 60 | IND=$(($IND - 0 ))
 61 | 
 62 | #Creates a file with the original site depth and qual for each locus
 63 | cut -f8 $1 | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > $1.DEPTH
 64 | mawk '!/#/' $1 | cut -f1,2,6 > $1.loci.qual
 65 | 
 66 | #Calculates the average depth and standard deviation
 67 | DEPTH=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.DEPTH)
 68 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.DEPTH)
 69 | DEPTH=$(python -c "print int("$DEPTH") + int("$SD")")
 70 | #DEPTH=$(python -c "print int("$DEPTH"+100*("$DEPTH"**0.5))")
 71 | 
 72 | #Filters loci above the mean depth + 1 standard deviation that have quality scores that are less than 2*DEPTH
 73 | paste $1.loci.qual $1.DEPTH | mawk -v x=$DEPTH '$4 > x'| mawk '$3 < 2 * $4' > $1.lowQDloci
 74 | LQDL=$(cat $1.lowQDloci | wc -l)
 75 | OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
 76 | echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n"  
 77 | echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n" >> $2.filterstats
 78 | 
 79 | #Recalculates site depth for sites that have not been previously filtered
 80 | vcftools --vcf $2.fil.vcf --remove-filtered NP --exclude-positions $1.lowQDloci --site-depth --out $1 > /dev/null
 81 | cut -f3 $1.ldepth > $1.site.depth
 82 | 
 83 | DP=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.site.depth)
 84 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.site.depth)
 85 | 
 86 | #Calculates actual number of individuals in VCF file
 87 | #This is important because loci will now be filtered by mean depth calculated with individuals present in VCF
 88 | IND=$(mawk '/#/' $1 | tail -1 | wc -w)
 89 | IND=$(($IND - 9))
 90 | 
 91 | mawk '!/D/' $1.site.depth | mawk -v x=$IND '{print $1/x}' > meandepthpersite
 92 | 
 93 | gnuplot << \EOF >> $2.filterstats
 94 | set terminal dumb size 120, 30
 95 | set autoscale
 96 | set xrange [10:150] 
 97 | unset label
 98 | set title "Histogram of mean depth per site"
 99 | set ylabel "Number of Occurrences"
100 | set xlabel "Mean Depth"
101 | #set yr [0:100000]
102 | binwidth=1
103 | bin(x,width)=width*floor(x/width) + binwidth/2.0
104 | set xtics 5
105 | plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
106 | pause -1
107 | EOF
108 | 
109 | gnuplot << \EOF 
110 | set terminal dumb size 120, 30
111 | set autoscale
112 | set xrange [10:150] 
113 | unset label
114 | set title "Histogram of mean depth per site"
115 | set ylabel "Number of Occurrences"
116 | set xlabel "Mean Depth"
117 | #set yr [0:100000]
118 | binwidth=1
119 | bin(x,width)=width*floor(x/width) + binwidth/2.0
120 | set xtics 5
121 | plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
122 | pause -1
123 | EOF
124 | 
125 | 
126 | #Calculates a mean depth cutoff to use for filtering
127 | DP=$(python -c "print ($DP+ 1.645*$SD) / $IND")
128 | echo "If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be" $DP
129 | echo "Would you like to use a different maximum mean depth cutoff, yes or no"
130 | 
131 | read NEWCUTOFF
132 | 
133 | if [ "$NEWCUTOFF" != "yes" ]; then
134 | 
135 | echo -e "Maximum mean depth cutoff is" $DP >> $2.filterstats
136 | 
137 | #Combines all filters to create a final filtered VCF file
138 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $DP --recode > /dev/null
139 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
140 | OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
141 | NUMFIL=$(($OLD - $FILTERED))
142 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
143 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
144 | 
145 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $DP --exclude-positions $1.lowQDloci --recode > /dev/null
146 | 
147 | OLD=$(mawk '!/#/' $1 | wc -l)
148 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
149 | NUMFIL=$(($OLD - $FILTERED))
150 | 
151 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n"
152 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
153 | 
154 | echo -e "Remaining sites\n" $FILTERED "\n"
155 | echo -e "Remaining sites\n" $FILTERED "\n" >> $2.filterstats
156 | 
157 | echo -e "Filtered VCF file is called Output_prefix.FIL.recode.vcf\n"
158 | echo "Filter stats stored in $2.filterstats"
159 | 
160 | else
161 | echo "Please enter new cutoff"
162 | read DP
163 | echo -e "Maximum mean depth cutoff is" $DP >> $2.filterstats
164 | #Combines all filters to create a final filtered VCF file
165 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $DP --recode > /dev/null
166 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
167 | OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
168 | NUMFIL=$(($OLD - $FILTERED))
169 | echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
170 | echo -e "Number of sites filtered based on maximum mean depth\nn" $NUMFIL "of" $OLD "\n" >> $2.filterstats
171 | 
172 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $DP --exclude-positions $1.lowQDloci --recode > /dev/null
173 | 
174 | OLD=$(mawk '!/#/' $1 | wc -l)
175 | FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf  | wc -l)
176 | NUMFIL=$(($OLD - $FILTERED))
177 | 
178 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n"
179 | echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
180 | 
181 | echo -e "Remaining sites\n" $FILTERED "\n"
182 | echo -e "Remaining sites\n" $FILTERED "\n" >> $2.filterstats
183 | 
184 | echo -e "Filtered VCF file is called Output_prefix.FIL.recode.vcf\n"
185 | echo "Filter stats stored in $2.filterstats"
186 | 
187 | fi
188 | 


--------------------------------------------------------------------------------
/scripts/untested/dup_sample_filter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/env bash
 2 | 
 3 | #This script will automatically remove sites in VCF files that do not have congruent genotypes across duplicate individuals
 4 | #It will automatically only consider genotypes that have at least 5 reads
 5 | 
 6 | #
 7 | 
 8 | if [[ -z "$2" ]]; then
 9 | echo "Usage is bash dup_sam_filter.sh VCF_file [File with duplicate sample names]"
10 | echo "The list of names should have one line per pair of duplicate samples with tab separating the two names for the same individual"
11 | exit 1
12 | fi
13 | 
14 | echo "This script assumes that duplicate samples are named in the convention of PopA_001 and PopA_001a"
15 | 
16 | NAMES=( `cut -f1  $2 `)
17 | NAM=( `cut -f2 $2 `)
18 | LEN=( `wc -l $2 `)
19 | LEN=$(($LEN - 1))
20 | 
21 | 
22 | for ((i = 0; i <= $LEN; i++));
23 | do
24 | echo "${NAMES[$i]}" > keep.${NAMES[$i]}
25 | echo "${NAM[$i]}" > keep.${NAM[$i]}
26 | 
27 | vcftools --vcf $1 --keep keep.${NAMES[$i]} --recode --recode-INFO-all --minDP 5 --out ${NAMES[$i]}
28 | vcftools --vcf $1 --keep keep.${NAM[$i]} --recode --recode-INFO-all --minDP 5 --out ${NAM[$i]}
29 | 
30 | paste <(mawk '!/#/' ${NAMES[$i]}.recode.vcf | cut -f1,2,10 | cut -f1 -d ":") <(mawk '!/#/' ${NAM[$i]}.recode.vcf | cut -f1,2,10 | cut -f1 -d ":") | mawk '$3 != $6' | mawk '!/\.\//' | cut -f1,2 > bad.loci.${NAMES[$i]}.${NAM[$i]}
31 | 
32 | done
33 | 
34 | 
35 | NAMES=( `cut -f1  $2 | sort | uniq `)
36 | LEN=( `cut -f1 $2 | sort | uniq | wc -l `)
37 | LEN=$(($LEN - 1))
38 | 
39 | 
40 | cat bad.loci.${NAMES[0]}.* > total.bad.loci
41 | rm ${NAMES[0]}.recode.vcf ${NAM[0]}.recode.vcf keep.${NAMES[0]} keep.${NAM[0]}
42 | 
43 | for ((i = 1; i <= $LEN; i++));
44 | do
45 | cat bad.loci.${NAMES[$i]}.* >> total.bad.loci 
46 | rm ${NAMES[$i]}.recode.vcf ${NAM[$i]}.recode.vcf keep.${NAMES[$i]} keep.${NAM[$i]}
47 | 
48 | done
49 | 
50 | cat total.bad.loci | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > mismatched.loci
51 | rm total.bad.loci
52 | 


--------------------------------------------------------------------------------
/scripts/untested/filter_missing_ind.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | vcftools --vcf $1 --missing --out $2
 4 | 
 5 | CUTOFF=$(mawk '!/IN/' $2.imiss | cut -f5 | sort -rn | perl -e '$d=.14;@l=<>;print $l[int($d*$#l)]')
 6 | #echo $CUTOFF
 7 | 
 8 | mawk '!/IN/' $2.imiss | cut -f5 > totalmissing
 9 | 
10 | gnuplot << \EOF 
11 | set terminal dumb size 120, 30
12 | set autoscale 
13 | unset label
14 | set title "Histogram of % missing data per individual"
15 | set ylabel "Number of Occurrences"
16 | set xlabel "% of missing data"
17 | #set yr [0:100000]
18 | binwidth=0.01
19 | bin(x,width)=width*floor(x/width) + binwidth/2.0
20 | plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes
21 | pause -1
22 | EOF
23 | 
24 | echo "The 85% cutoff would be" $CUTOFF
25 | echo "Would you like to set a different cutoff, yes or no"
26 | 
27 | read NEWCUTOFF
28 | 
29 | if [ "$NEWCUTOFF" != "yes" ]; then
30 | 
31 | mawk -v x=$CUTOFF '$5 > x' $2.imiss | cut -f1 > lowDP.indv
32 | 
33 | vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2
34 | 
35 | else
36 | 
37 | echo "Please enter new cutoff"
38 | 
39 | read CUTOFF2
40 | 
41 | mawk -v x=$CUTOFF2 '$5 > x' $2.imiss | cut -f1 > lowDP.indv
42 | 
43 | vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2
44 | fi
45 | 


--------------------------------------------------------------------------------
/scripts/untested/filter_paralog.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | NAME=$(echo $1 | sed -e 's/\.recode.*//g') 
 4 | 
 5 | vcfallelicprimitives --keep-info --keep-geno $1 > $NAME.prim.vcf
 6 | vcftools --vcf $NAME.prim.vcf --remove-indels --recode --recode-INFO-all --out SNP.$NAME >/dev/null
 7 | 
 8 | bgzip SNP.$NAME.recode.vcf
 9 | tabix -p vcf SNP.$NAME.recode.vcf.gz
10 | vcf-annotate --filter c=3,25 SNP.$NAME.recode.vcf.gz > SNP.$NAME.vcf
11 | vcftools --vcf SNP.$NAME.vcf --keep-filtered SnpCluster --recode --recode-INFO-all --out SNP.$NAME.1
12 | mawk '!/#/' SNP.$NAME.1.recode.vcf | cut -f1,2 > SNP.$NAME.LOC
13 | HET=( `mawk '!/#/ {print $1,gsub(/0[\/\|]1/,"")}' SNP.$NAME.1.recode.vcf  | cut -f2 -d " "` )
14 | HETT=( `mawk '!/#/ {print $1,gsub(/1[\/\|]0/,"")}' SNP.$NAME.1.recode.vcf  | cut -f2 -d " "` )
15 | 
16 | LEN=${#HET[@]}
17 | LEN=$((LEN - 1))
18 | rm SNP.$NAME.hets 2>/dev/null
19 | 
20 | for ((i = 0; i <= $LEN; i++));
21 | do
22 | HETTT=$((${HET[$i]} + ${HETT[$i]} ))
23 | echo $HETTT >> SNP.$NAME.hets
24 | done
25 | 
26 | paste SNP.$NAME.LOC SNP.$NAME.hets > SNP.$NAME.hhets
27 | CUT=$(cut -f3 SNP.$NAME.hhets | sort -g | perl -e '$d=.9;@l=<>;print $l[int($d*$#l)]')
28 | mawk -v x=$CUT '$3 > x' SNP.$NAME.hhets | cut -f1,2 > snpclusters.to.filter
29 | 
30 | vcftools --gzvcf SNP.$NAME.recode.vcf.gz --exclude-positions snpclusters.to.filter --recode --recode-INFO-all --out SNP.$NAME.SCfiltered
31 | if [[ -n "$2" ]]; then
32 | vcftools --vcf SNP.$NAME.SCfiltered.recode.vcf --exclude-positions $2 --recode --recode-INFO-all --out SNP.$NAME.SCfilteredF
33 | fi
34 | 
35 | #gnuplot << \EOF 
36 | #set terminal dumb size 120, 30
37 | #set autoscale
38 | #set xrange [1:150]
39 | #unset label
40 | #set title "Histogram of number of heterozygotes"
41 | #set ylabel "Number of Occurrences"
42 | #set xlabel "Mean Depth"
43 | #set yr [0:100000]
44 | #binwidth=1
45 | #bin(x,width)=width*floor(x/width) + binwidth/2.0
46 | #set xtics 5
47 | #plot 'SNP.$NAME.hhets' using (bin($1,binwidth)):(1.0) smooth freq with boxes
48 | #pause -1
49 | #EOF
50 | 


--------------------------------------------------------------------------------
/scripts/untested/maf_gp_filter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | paste <(mawk '/Contig.*:/' $1 ) <(mawk '/Over/' $1) | mawk '$4 > 0.95' | cut -f1 | sed 's/://g' > Hap.loc.belowMAF05
 4 | 
 5 | LOC=$(mawk '/Contig/' $2 | wc -l)   ####This gives you the number of loci, use it in the next line of code
 6 | 
 7 | paste <(seq $LOC ) <(mawk '/Contig/' $2) > BS.loci.numbered
 8 | 
 9 | grep -wf Hap.loc.belowMAF05 <(sed 's/://g' BS.loci.numbered) | cut -f1 > bs.loci.lowmaf  
10 | 
11 | 


--------------------------------------------------------------------------------
/scripts/untested/multi.maf.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | if [[ -z "$3" ]]; then
 4 | echo "Usuage is multi.maf.sh [vcffile] [maf] [prefix for outfile]"
 5 | exit 1
 6 | fi
 7 | 
 8 | 
 9 | paste <(cut -f1,2 $1 | mawk '!/#/' ) <(cut -f8 $1 | grep -oh "AF=.*;AN" | sed -e 's/AF=//g' -e 's/;AN//g' -e 's/,/\t/g' | mawk '{ for(i=1; i<=NF;i++) j+=$i; print j; j=0 }') > all.maf.frq
10 | 
11 | 
12 | mawk -v x=$2 '$3 > x && $3 < 1-x' all.maf.frq | cut -f1,2 > maf.loci.to.keep
13 | 
14 | PREFIX=$3
15 | 
16 | vcftools --vcf $1 --positions maf.loci.to.keep --recode --recode-INFO-all --out $PREFIX
17 | 


--------------------------------------------------------------------------------
/scripts/untested/old/BS_reference_to_fasta.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #sed -i 's/\..*//g' $1
 4 | 
 5 | NAMES=( `cat $1 | cut -f1 | sort | uniq `)
 6 | 
 7 | for i in "${NAMES[@]}"
 8 | do
 9 | grep -A1 $i reference.fasta >> outliers.fasta 
10 | done
11 | 
12 | 


--------------------------------------------------------------------------------
/scripts/untested/old/ErrorCount.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "This script counts the number of potential genotyping errors due to low read depth, assuming that the probability of observing either allele in a heterozygote is 0.5"
 4 | 
 5 | R1H01S0=$(grep -wo 0/1:1:0 $1 | wc -l) 
 6 | R1H01S1=$(grep -wo 0/1:1:1 $1 | wc -l) 
 7 | R1H0S1=$(grep -wo 0/0:1:0 $1 | wc -l) 
 8 | R1H0S0=$(grep -wo 0/0:1:1 $1 | wc -l) 
 9 | R1H1S1=$(grep -wo 1/1:1:0 $1 | wc -l) 
10 | R1H1S0=$(grep -wo 1/1:1:1 $1 | wc -l) 
11 | 
12 | R1GEN=$(python -c "print $R1H01S0+$R1H01S1+$R1H0S0+$R1H0S1+$R1H1S0+$R1H1S1")
13 | R1ERR=$(python -c "print $R1GEN/2")
14 | 
15 | echo $R1ERR "maximum genotyping errors from genotypes from only 1 read"
16 | 
17 | R2H01S0=$(grep -wo 0/1:2:0 $1 | wc -l) 
18 | R2H01S2=$(grep -wo 0/1:2:2 $1 | wc -l) 
19 | R2H0S1=$(grep -wo 0/0:2:0 $1 | wc -l) 
20 | R2H0S01=$(grep -wo 0/0:2:1 $1 | wc -l)
21 | R2H0S0=$(grep -wo 0/0:2:2 $1 | wc -l)
22 | R2H1S01=$(grep -wo 1/1:2:2 $1 | wc -l) 
23 | R2H1S0=$(grep -wo 1/1:2:1 $1 | wc -l) 
24 | R2H1S1=$(grep -wo 1/1:2:0 $1 | wc -l)
25 | 
26 | R2GEN=$(python -c "print $R2H01S0+$R2H01S1+$R2H0S0+$R2H0S1+$R2H0S01+$R2H1S0+$R2H1S1+$R2H1S01")
27 | R2ERR=$(python -c "print $R2GEN/4")
28 | 
29 | echo $R2ERR "maximum genotyping errors from genotypes with only 2 reads"
30 | 
31 | R3H01S0=$(grep -wo 0/1:3:0 $1 | wc -l) 
32 | R3H01S2=$(grep -wo 0/1:3:3 $1 | wc -l) 
33 | R3H0S1=$(grep -wo 0/0:3:0 $1 | wc -l) 
34 | R3H0S01=$(grep -wo 0/0:3:1 $1 | wc -l)
35 | R3H0S0=$(grep -wo 0/0:3:2 $1 | wc -l)
36 | R3H0S3=$(grep -wo 0/0:3:3 $1 | wc -l)
37 | R3H1S01=$(grep -wo 1/1:3:2 $1 | wc -l) 
38 | R3H1S0=$(grep -wo 1/1:3:1 $1 | wc -l) 
39 | R3H1S1=$(grep -wo 1/1:3:0 $1 | wc -l)
40 | R3H1S3=$(grep -wo 1/1:3:3 $1 | wc -l)
41 | 
42 | R3GEN=$(python -c "print $R3H01S0+$R3H01S1+$R3H0S0+$R3H0S1+$R3H0S01+$R3H1S0+$R3H1S1+$R3H1S01+$R3H0S3+$R3H1S3")
43 | R3ERR=$(python -c "print $R3GEN/8")
44 | 
45 | echo $R3ERR "maximum genotyping errors from genotypes with only 3 reads"
46 | 
47 | 
48 | IND=$(mawk '/#/' $1 | tail -1 | wc -w)
49 | IND=$(($IND - 9))
50 | LOCI=$(mawk '!/#/' $1 | wc -l)
51 | GENO=$(( $IND * $LOCI ))
52 | 
53 | echo $IND "number of individuals and" $LOCI "equals" $GENO "total genotypes"
54 | 
55 | TOTERR=$(python -c "print $R1ERR+$R2ERR+$R3ERR")
56 | ERRRATE=$(python -c "print $TOTERR/float($GENO)")
57 | 
58 | echo "Total potential error rate equals" $ERRRATE
59 | 


--------------------------------------------------------------------------------
/scripts/untested/old/FB_filters.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #!/bin/bash
 4 | 
 5 | if [[ -z "$2" ]]; then
 6 | echo "Usage is sh FB_filters.sh VCF_file Output_prefix"
 7 | exit 1
 8 | fi
 9 | 
10 | IND=$(mawk '/#/' $1 | tail -1 | wc -w)
11 | IND=$(($IND - 9))
12 | vcffilter -f "AB > 0.28" $1 > $2
13 | vcffilter -f "SAR > 50 & SAF < 50 | SAF > 50 & SAR < 50" -t PASS -F OL $2 > $2.fil1.vcf
14 | vcffilter -f "PAIRED < 0.1 & PAIREDR > 0.1 | PAIRED > 0.1 & PAIREDR < 0.1" -t NP -F PASS -A $2.fil1.vcf > $2.fil2.vcf
15 | vcffilter -f "PAIRED > 0.75" -t PASS -F NP2 -A $2.fil2.vcf > $2.fil3.vcf
16 | vcffilter -f "SRR > 50 & SRF < 50 | SRF > 50 & SRR < 50" -t PASS -F OL2 -A $2.fil3.vcf > $2.fil4.vcf
17 | vcftools --vcf $2.fil4.vcf --keep-filtered PASS,PASS,PASS,PASS --keep-filtered PASS,PASS,NP2,PASS --site-depth --out $1 
18 | cut -f3 $1.ldepth > $1.site.depth
19 | DP=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.site.depth)
20 | #DP=$(mawk '{print $1}' $1.site.depth | sort -rn | perl -e '$d=.025;@l=<>;print $l[int($d*$#l)]')
21 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.site.depth)
22 | #SD=$(awk '{sum+=$1; sumsq+=$1*$1} END {print sqrt(sumsq/NR - (sum/NR)**2)}' $1.site.depth)
23 | DP=$(python -c "print ($DP+ $SD) / $IND")
24 | echo $DP
25 | vcftools --vcf $2.fil4.vcf --keep-filtered PASS,PASS,PASS,PASS --keep-filtered PASS,PASS,NP2,PASS --recode-INFO-all --out $2.FIL --max-meanDP $DP --recode 
26 | 


--------------------------------------------------------------------------------
/scripts/untested/old/FB_filters4:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth," 
 4 | echo "quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation."
 5 | echo "The script assumes that loci and individuals with low call rates (or depth) have already been removed."
 6 | 
 7 | echo "Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters"
 8 | 
 9 | #Checks for correct ususage
10 | 
11 | if [[ -z "$2" ]]; then
12 | echo "Usage is sh FB_filters.sh VCF_file Output_prefix"
13 | exit 1
14 | fi
15 | 
16 | #Filteres out sites with that on average have heterzygotes with less than a 0.28 allele balance between reads from each allele and Quality / Depth < 0.5
17 | vcffilter -f "AB > 0.28" -s -g "QR > 0 | QA > 0 " $1 | vcffilter -f "QUAL / DP > 0.5" > $2
18 | #Filters out loci that have reads from both strands, with some leeway for a bad individual or two
19 | vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 50" $2 > $2.filAB.vcf
20 | #Filters out loci that have reads from both paired and unpaired reads
21 | vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" $2.filAB.vcf > $2.fil.vcf
22 | 
23 | 
24 | #Uses the VCF file to estimate the original number of individuals in the VCF file
25 | #This is important because the INFO flags are based on this number
26 | IND=$(grep -o -e 'NS=[0-9]*' $1 | sed s/NS=//g | sort | tail -1)
27 | IND=$(($IND - 0 ))
28 | 
29 | #Creates a file with the original site depth and qual for each locus
30 | cut -f8 $1 | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > $1.DEPTH
31 | mawk '!/#/' $1 | cut -f1,2,6 > $1.loci.qual
32 | 
33 | #Calculates the average depth and standard deviation
34 | DEPTH=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.DEPTH)
35 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.DEPTH)
36 | DEPTH=$(python -c "print int("$DEPTH") + int("$SD")")
37 | 
38 | #Filters loci above the mean depth + 1 standard deviation that have quality scores that are less than 2*DEPTH
39 | paste $1.loci.qual $1.DEPTH | mawk -v x=$DEPTH '$4 > x'| mawk '$3 < 2 * $4' > $1.lowQDloci
40 | 
41 | #Recalculates site depth for sites that have not been previously filtered
42 | vcftools --vcf $2.fil.vcf --remove-filtered NP --site-depth --exclude-positions $1.lowQDloci --out $1 
43 | cut -f3 $1.ldepth > $1.site.depth
44 | DP=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.site.depth)
45 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.site.depth)
46 | #Calculates actual number of individuals in VCF file
47 | #This is important because loci will now be filtered by mean depth calculated with individuals present in VCF
48 | IND=$(mawk '/#/' $1 | tail -1 | wc -w)
49 | IND=$(($IND - 9))
50 | #Calculates a mean depth cutoff to use for filtering
51 | DP=$(python -c "print ($DP+ 1.5*$SD) / $IND")
52 | #Combines all filters to create a final filtered VCF file
53 | vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $DP --exclude-positions $1.lowQDloci --recode 
54 | echo "Filtered VCF file is called Output_prefix.FIL.recode.vcf"
55 | 


--------------------------------------------------------------------------------
/scripts/untested/old/LositantoOutlierVCF.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ -z "$1" ]]; then
 4 | echo "Usage: LositantoOutlierVCF.sh Vcffile LositanFile FDR Prefix_for_output"
 5 | exit 1
 6 | fi
 7 | 
 8 | PVALUE=$(echo "1-$3/2" | bc -l)
 9 | 
10 | mawk '!/#/' $1 | cut -f1,2 > totalloci
11 | mawk '!/Het/' $2 | cut -f2-5 > LS.noheader
12 | paste totalloci LS.noheader | awk -v x=$PVALUE '$5 >= x' | cut -f1,2 > Outlier.list
13 | vcftools --vcf $1 --positions Outlier.list --recode --recode-INFO-all --out $4.outlieronly
14 | vcftools --vcf $1 --exclude-positions Outlier.list --recode --recode-INFO-all --out $4.neutralonly
15 | 
16 | rm totalloci
17 | rm LS.noheader
18 | mv Outlier.list $4.Outlier.list
19 | 


--------------------------------------------------------------------------------
/scripts/untested/old/NI.dDocent.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #########dDocent 1.0##################################################################################
  4 | 
  5 | 
  6 | #This script serves as a bash wrapper to QC, assemble, map, and call SNPs from double digest RAD data.
  7 | #It requires that your raw data are split up by tagged individual and follow the naming convenction of:
  8 | 
  9 | #Sample1.F.fq and Sample1.R.fq
 10 | 
 11 | #This is the non-interactive version, you must enter 5 variables with the shell script.
 12 | 
 13 | #Correct usage is: sh NI.dDocent.sh [c] [A] [B] [O] [email address]
 14 | 
 15 | #All five parameters must be specificed
 16 | 
 17 | #The first parameter is the c parameter:
 18 | 
 19 | #This paramter changes the similarity threshhold to cluster sequences.  
 20 | #This is sensitive to the amount of natural variation in your population.
 21 | #Set it too high and you will have homologous loci that will stay seperate in your analysis and most likely be lost.
 22 | #Set it too low and you will possibly cluster paralogs 
 23 | 
 24 | #The other parameters control mapping
 25 | 
 26 | #-A controls the match score, -B controls the mismatch score, and -O controls the Gap Penalty
 27 | #Optimal mapping will likely involved tweaking these parameters, especially -0
 28 | 
 29 | #I have commonly used -A 2 -B 4 -O 2, -A 4 -B 7 -O 4, -A 3, -B 5, -O 4 for examples
 30 | 
 31 | ###################################################################################################
 32 | 
 33 | if [ ! $# == 5 ]; then
 34 | 	echo "You must enter all 5 parameters."
 35 | 	echo -e "Correct usage is: sh NI.dDocent.sh [c] [A] [B] [O] [email address] \nAll five parameters must be specificed\n\nThe first parameter is the c parameter: \nThis paramter changes the similarity threshhold to cluster sequences. \nThis is sensitive to the amount of natural variation in your population. \nSet it too high and you will have homologous loci that will stay seperate in your analysis and most likely be lost.\nSet it too low and you will possibly cluster paralogs. \n" 
 36 | 	echo -e "The next 3  parameters control mapping: \n-A controls the match score, -B controls the mismatch score, and -O controls the Gap Penalty\nOptimal mapping will likely involved tweaking these parameters, especially -O \n"
 37 | 	echo -e "I have commonly used -A 2 -B 4 -O 2, -A 4 -B 7 -O 4, -A 3, -B 5, -O 4 \n"
 38 | 	echo -e "The last parameter is your email address.  dDocent will email you when your analysis is complete. \n"
 39 | 	echo -e "Default usage would be sh NI.dDocent.sh 0.9 1 4 6 jpuritz@gmail.com"
 40 | 	exit 1
 41 | fi
 42 | 
 43 | NumInd=$(ls *.F.fq | wc -l)
 44 | NumInd=$(($NumInd - 0))
 45 | 
 46 | echo -e "dDocent 1.0 (Non-interactive Version) by J. Puritz for Gold lab \n"
 47 | echo -e "Contact jpuritz@gmail.com with any problems \n\n "
 48 | if [ $NumInd -gt 9 ]
 49 |         then
 50 | 	MinAll=0.05
 51 | 	MaxSize=9
 52 |         else
 53 | 	MinAll=$(echo "scale=2; 1 / (2 * $NumInd) " | bc)
 54 | 	MaxSize=$(( $NumInd - 1 ))
 55 | fi
 56 | 
 57 | 
 58 | ls *.F.fq > namelist
 59 | sed -i 's/.F.fq//g' namelist
 60 | NAMES=( `cat "namelist" `)
 61 | 
 62 | ls -S *.F.fq > sizelist
 63 | sed -i 's/.F.fq//g' sizelist
 64 | SIZE=( `cat "sizelist" `)
 65 | 
 66 | 	echo "Removing the _1 character and replacing with /1 in the name of every sequence"
 67 | 	for i in "${NAMES[@]}"
 68 | 	do	
 69 | 	sed -e 's:_2$:/2:g' $i.R.fq > $i.Ra.fq
 70 | 	sed -e 's:_1$:/1:g' $i.F.fq > $i.Fa.fq
 71 | 	mv $i.Ra.fq $i.R.fq
 72 | 	mv $i.Fa.fq $i.F.fq
 73 | 	done
 74 | 
 75 | TrimReads () 
 76 | { for i in "${NAMES[@]}"
 77 | do
 78 | echo "Trimming $i"
 79 | trim_galore --paired -q 10 --length 20 -a GATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATATCGTATGCCGTCTTCTGCTTG -a2 GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCG --stringency 10 $i.F.fq $i.R.fq 2> $i.trim.log
 80 | mv $i.F_val_1.fq $i.R1.fq
 81 | mv $i.R_val_2.fq $i.R2.fq
 82 | done
 83 | }
 84 | 
 85 | #Use Rainbow to cluster and assemble reads
 86 | 
 87 | setupRainbow ()
 88 | { echo "Concatenating F and R reads of up to 10 individuals for assembly"
 89 | cat ${SIZE[0]}.F.fq > forward
 90 | cat ${SIZE[0]}.R.fq > reverse
 91 | 
 92 | for ((i = 1; i <= $MaxSize; i++));
 93 | do
 94 | cat ${SIZE[$i]}.F.fq >> forward
 95 | cat ${SIZE[$i]}.R.fq >> reverse
 96 | done
 97 | 
 98 | seqtk seq -r forward > forwardRC
 99 | mergefq.pl reverse forwardRC concat.fq
100 | 
101 | #Use Rainbow to cluster and assemble reads
102 | echo "Using rainbow to cluster and assemble reads"
103 | rainbow cluster -1 concat.fq -m 6 > cat.rbcluster.out 2> log
104 | rainbow div -i cat.rbcluster.out -o cat.rbdiv.out -f $MinAll
105 | rainbow merge -a -i cat.rbdiv.out -o cat.rbasm.out
106 | select_best_rbcontig.pl cat.rbasm.out > rainbow
107 | cat rainbow | sed s/./N/96 | sed s/./N/97 | sed s/./N/98 | sed s/./N/99 | sed s/./N/100 | sed s/./N/101 | sed s/./N/102 | sed s/./N/103 | sed s/./N/104 | sed s/./N/105 > rainbowN
108 | echo "Now using cd-hit to cluster reference sequences by similarity. The -c parameter (% similarity to cluster) may need to be changed for your taxa"
109 | cd-hit-est -i rainbowN -o referencegenome -T 0 -c $1 -M 0 -l 30 &>cdhit.log
110 | }
111 | 
112 | 	echo "Trimming reads and simultaneously assemblying reference sequences"	
113 | 	TrimReads & 2> trim.log; setupRainbow $1 2> rainbow.log
114 | 	wait
115 | 
116 | 
117 | ##Use BWA to map reads to assembly
118 | echo "Using BWA to map reads.  You may need to adjust -A -B and -O parameters for your taxa."
119 | bwa0.7 index -a bwtsw referencegenome &> index.log
120 | 
121 | for i in "${NAMES[@]}"
122 | do
123 | bwa0.7 mem referencegenome $i.R1.fq $i.R2.fq -t 32 -a -T 10 -A $2 -B $3 -O $4 > $i.sam 2> bwa.$i.log
124 | done
125 | 
126 | ##Convert Sam to Bam and remove low quality, ambiguous mapping
127 | for i in "${NAMES[@]}"
128 | do
129 | samtools view -bT referencegenome -q1 $i.sam > $i.bam 2>$i.bam.log
130 | samtools sort $i.bam $i
131 | done
132 | 
133 | samtools faidx referencegenome
134 | 
135 | #Calling of SNPs from two samples must have a minimum read of depth of 10 and below 200 with a minimum quality score of 20
136 | echo "Using samtools to pileup reads"
137 | samtools mpileup -D -f referencegenome *.bam >mpileup 2> mpileup.log
138 | echo "Using VarScan2 to call SNPs with at least 5 reads (within 1 individual), 95% probability, and at least 2 reads for the minor allele"
139 | java -jar /usr/local/bin/VarScan.v2.3.5.jar mpileup2snp mpileup --output-vcf --min-coverage 5 --strand-filter 0 --min-var-freq 0.1 --p-value 0.05 >SNPS.vcf 2>varscan.log
140 | echo "Using VCFtools to parse SNPS.vcf for SNPS that are not indels and are called in at least 90% of individuals"
141 | vcftools --vcf SNPS.vcf --geno 0.9 --out Final --counts --recode --non-ref-af 0.001 --remove-indels &>VCFtools.log
142 | 
143 | tail Final.log	
144 | 
145 | if [ ! -d "logfiles" ];	then
146 | mkdir logfiles
147 | fi
148 | 
149 | mv *.log *.txt log ./logfiles
150 | echo `pwd` `date` | mailx -s "It's Finished" $5
151 | 


--------------------------------------------------------------------------------
/scripts/untested/old/Rename_for_dDocent.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -z "$1"]
 4 | then
 5 | echo "No file with barcodes and sample names specified."
 6 | echo "Correct usage: Rename_for_dDocent.sh barcodefile"
 7 | exit 1
 8 | else
 9 | NAMES=( `cut -f1  $1 `)
10 | BARCODES=( `cut -f2 $1 `)
11 | LEN=( `wc -l $1 `)
12 | LEN=$(($LEN - 0))
13 | 
14 | echo ${NAMES[0]}
15 | echo ${BARCODES[0]}
16 | 
17 | for ((i = 0; i <= $LEN; i++));
18 | do
19 | mv sample_${BARCODES[$i]}.1.fq ${NAMES[$i]}.F.fq
20 | mv sample_${BARCODES[$i]}.2.fq ${NAMES[$i]}.R.fq
21 | done
22 | fi
23 | 
24 | 


--------------------------------------------------------------------------------
/scripts/untested/old/ToSNP.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | NAME=$(echo $1 | sed -e 's/\.recode.*//g')
4 | 
5 | vcfallelicprimitives --keep-info --keep-geno $1 > $NAME.prim.vcf
6 | vcftools --vcf $NAME.prim.vcf --remove-indels --recode --recode-INFO-all --out SNP.$NAME
7 | 


--------------------------------------------------------------------------------
/scripts/untested/old/VCFtoOutlierOnly.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ -z "$1" ]]; then
 4 | echo "Usage: VCFtoOutlierOnly.sh Vcffile BayescanOutput FDR Prefix_for_output"
 5 | exit 1
 6 | fi
 7 | 
 8 | mawk '!/#/' $1 | cut -f1,2 > totalloci
 9 | mawk '!/qval/' $2 | cut -f2-6 > BS.noheader
10 | paste totalloci BS.noheader | mawk -v x=$3 '$6 <= x' | cut -f1,2 > Outlier.list
11 | vcftools --vcf $1 --positions Outlier.list --recode --recode-INFO-all --out $4.outlieronly
12 | vcftools --vcf $1 --exclude-positions Outlier.list --recode --recode-INFO-all --out $4.neutralonly
13 | 


--------------------------------------------------------------------------------
/scripts/untested/old/coverage.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ls *-RG.bam >bamlist.list; bamtools merge -list bamlist.list > full.bam
 4 | samtools index full.bam &>/dev/null
 5 | samtools idxstats full.bam 2>/dev/null | cut -f3 > coverage
 6 | rm full.bam*
 7 | 
 8 | gnuplot << \EOF 
 9 | set terminal dumb size 120, 30
10 | set autoscale
11 | #set xrange [10:150] 
12 | unset label
13 | set title "Histogram of coverage per contig"
14 | set ylabel "Number of Occurrences"
15 | set xlabel "Coverage"
16 | #set yr [0:100000]
17 | binwidth=10
18 | bin(x,width)=width*floor(x/width) + binwidth/2.0
19 | set xtics 100
20 | plot 'coverage' using (bin($1,binwidth)):(1.0) smooth freq with boxes
21 | pause -1
22 | EOF
23 | 
24 | AVE=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' coverage)
25 | NZ=$(mawk '$1 == 0' coverage | wc -l)
26 | NZ=$(($NZ + 0))
27 | 
28 | echo "The average coverage per contig is" $AVE
29 | echo "There are" $NZ "contigs with zero coverage"
30 | 


--------------------------------------------------------------------------------
/scripts/untested/old/dDocent.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #########dDocent 1.0##################################################################################
  4 | 
  5 | 
  6 | #This script serves as an interactive bash wrapper to QC, assemble, map, and call SNPs from double digest RAD data.
  7 | #It requires that your raw data are split up by tagged individual and follow the naming convenction of:
  8 | 
  9 | #Sample1.F.fq and Sample1.R.fq
 10 | 
 11 | #############################################################################################
 12 | 
 13 | NumInd=$(ls *.F.fq | wc -l)
 14 | NumInd=$(($NumInd - 0))
 15 | 
 16 | echo -e "dDocent 1.0 by J. Puritz for Gold lab \n"
 17 | echo -e "Contact jpuritz@gmail.com with any problems \n\n "
 18 | 
 19 | if [ $NumInd -gt 9 ]
 20 |         then
 21 | 	MinAll=0.05
 22 |         MaxSize=9
 23 |         else
 24 | 	MinAll=$(echo "scale=2; 1 / (2 * $NumInd) " | bc)
 25 |         MaxSize=$(( $NumInd - 1 ))
 26 | fi
 27 | 
 28 | 
 29 | ls *.F.fq > namelist
 30 | sed -i 's/.F.fq//g' namelist
 31 | NAMES=( `cat "namelist" `)
 32 | 
 33 | ls -S *.F.fq > sizelist
 34 | sed -i 's/.F.fq//g' sizelist
 35 | SIZE=( `cat "sizelist" `)
 36 | 
 37 | 
 38 | echo "$NumInd individuals are detected is this correct? Enter yes or no and press [ENTER]"
 39 | 
 40 | read Indcorrect
 41 | 
 42 | if [ "$Indcorrect" == "no" ]; then
 43 |         echo "Please double check that all fastq files are named Ind01.F.fq and Ind01.R.fq"
 44 |         exit 1
 45 | elif [ "$Indcorrect" == "yes" ]; then
 46 |     	echo "Proceeding with $NumInd individuals"
 47 | else
 48 | 	echo "Incorrect Input"
 49 | 	exit 1
 50 | fi
 51 | 
 52 | echo "Were these reads processed with STACKS process_radtags?  Type yes or no and press [Enter]"
 53 | echo "If you don't know, answer yes."
 54 | 
 55 | read STACKS
 56 | 
 57 | if [ "$STACKS" == "yes" ]; then
 58 | 
 59 | 	echo "The _1 character will be replaced with /1 in the name of every sequence"
 60 | 	
 61 | elif [ "$STACKS" == "no" ]; then
 62 |     	echo "Proceeding with out sequence name alteration"
 63 | else
 64 | 	echo "Incorrect input, assuming no."
 65 | fi
 66 | 
 67 | echo "Do you want to quality trim your reads?" 
 68 | echo "Answering yes will simultaneously trim reads and perform reference assembly"
 69 | echo "Type yes or no and press [ENTER]?"
 70 | 
 71 | read TRIM
 72 | 
 73 | if [ "$TRIM" == "yes" ]; then
 74 | 	echo "Reads will be trimmed and simultaneously assembled"
 75 | 	echo "Reads will be assembled with Rainbow"
 76 |     echo "CD-HIT will cluster reference sequences by similarity. The -c parameter (% similarity to cluster) may need to be changed for your taxa."
 77 |     echo "Would you like to enter a new c parameter now? Type yes or no and press [ENTER]"
 78 |     read optC
 79 |     echo $optC
 80 |     if [ "$optC" == "no" ]; then
 81 |             echo "Proceeding with default 0.9 value."
 82 |             simC=0.9
 83 |         elif [ "$optC" == "yes" ]; then
 84 |             echo "Please enter new value for c. Enter in decimal form (For 90%, enter 0.9)"
 85 |             read newC
 86 |             simC=$newC
 87 |         else
 88 |             echo "Incorrect input. Proceeding with the default value."
 89 |             simC=0.9
 90 |         fi
 91 |         
 92 | else
 93 | 	echo "Do you need to perform an assembly? Type no and press [ENTER] if you want to skip to read mapping and SNP calling"
 94 | 	read ASSEMBLY
 95 | 	if [ "$ASSEMBLY" == "yes" ]; then
 96 | 		echo "Reads will be assembled with Rainbow"
 97 | 		echo "CD-HIT will cluster reference sequences by similarity. The -c parameter (% similarity to cluster) may need to be changed for your taxa"
 98 | 		echo "Would you like to enter a new c parameter now? Type yes or no and press [ENTER]"
 99 | 		read optC2
100 | 			if [ "$optC2" == "no" ]; then
101 |         		echo "Proceeding with default 0.9 value."
102 | 				simC=0.9	
103 | 			elif [ "$optC2" == "yes" ]; then
104 |     			echo "Please enter new value for c. Enter in decimal form (For 90%, enter 0.9)"
105 | 				read newC2
106 | 				simC=$newC2
107 | 			else
108 | 				echo "Incorrect input. Proceeding with the default value."
109 | 				simC=0.9
110 | 			fi
111 | 	fi
112 | fi
113 | 
114 | echo "BWA will be used to map reads.  You may need to adjust -A -B and -O parameters for your taxa."
115 | echo "Would you like to enter a new parameters now? Type yes or no and press [ENTER]"
116 | read optq
117 | 
118 | if [ "$optq" == "yes" ]; then
119 |         echo "Please enter new value for A (match score).  It should be an integer.  Default is 1."
120 |         read newA
121 |         optA=$newA
122 |                 echo "Please enter new value for B (mismatch score).  It should be an integer.  Default is 4."
123 |         read newB
124 |         optB=$newB
125 |                 echo "Please enter new value for O (gap penalty).  It should be an integer.  Default is 6."
126 |         read newO
127 |         optO=$newO
128 | else
129 |                 echo "Proceeding with default values for BWA read mapping."
130 |                 optA=1
131 |                 optB=4
132 |                 optO=6
133 | fi
134 | 
135 | echo ""
136 | echo "Please enter your email address.  dDocent will email you when it is finished running."
137 | echo "Don't worry; dDocent has no financial need to sell your email address to spammers."
138 | read MAIL
139 | echo ""
140 | echo ""
141 | echo "At this point, all configuration information has been enter and dDocent may take several hours to run." 
142 | echo "It is recommended that you move this script to a background operation and disable terminal input and output."
143 | echo "All data and logfiles will still be recorded."
144 | echo "To do this:"
145 | echo "Press control and Z simultaneously"
146 | echo "Type 'bg' without the quotes and press enter"
147 | echo "Type 'disown -h' again without the quotes and press enter"
148 | echo ""
149 | echo "Now sit back, relax, and wait for your analysis to finish."
150 | 
151 | main(){
152 | if [ "$STACKS" == "yes" ]; then
153 | 
154 | 	echo "Removing the _1 character and replacing with /1 in the name of every sequence"
155 | 	for i in "${NAMES[@]}"
156 | 	do	
157 | 	sed -e 's:_2$:/2:g' $i.R.fq > $i.Ra.fq
158 | 	sed -e 's:_1$:/1:g' $i.F.fq > $i.Fa.fq
159 | 	mv $i.Ra.fq $i.R.fq
160 | 	mv $i.Fa.fq $i.F.fq
161 | 	done
162 | 
163 | fi
164 | 
165 | 
166 | if [ "$TRIM" == "yes" ]; then
167 | 	echo "Trimming reads and simultaneously assemblying reference sequences"	
168 | 	TrimReads & 2> trim.log
169 | 	setupRainbow 2> rainbow.log
170 | 	wait
171 | else
172 | 	if [ "$ASSEMBLY" == "yes" ]; then
173 | 		setupRainbow 2> rainbow.log
174 | 	fi
175 | fi
176 | 
177 | ##Use BWA to map reads to assembly
178 | 
179 | bwa0.7 index -a bwtsw referencegenome &> index.log
180 | 
181 | for i in "${NAMES[@]}"
182 | do
183 | bwa0.7 mem referencegenome $i.R1.fq $i.R2.fq -t 32 -a -T 10 -A $optA -B $optB -O $optO > $i.sam 2> bwa.$i.log
184 | done
185 | 
186 | ##Convert Sam to Bam and remove low quality, ambiguous mapping
187 | for i in "${NAMES[@]}"
188 | do
189 | samtools view -bT referencegenome -q1 $i.sam > $i.bam 2>$i.bam.log
190 | samtools sort $i.bam $i
191 | done
192 | 
193 | samtools faidx referencegenome
194 | 
195 | #Calling of SNPs from two samples must have a minimum read of depth of 10 and below 200 with a minimum quality score of 20
196 | echo "Using samtools to pileup reads"
197 | samtools mpileup -D -f referencegenome *.bam >mpileup 2> mpileup.log
198 | echo "Using VarScan2 to call SNPs with at least 5 reads (within 1 individual), 95% probability, and at least 2 reads for the minor allele"
199 | java -jar /usr/local/bin/VarScan.v2.3.5.jar mpileup2snp mpileup --output-vcf --min-coverage 5 --strand-filter 0 --min-var-freq 0.1 --p-value 0.05 >SNPS.vcf 2>varscan.log
200 | 
201 | ###Code to rename samples in VCF file
202 | echo "Renaming samples in VCF file."
203 | j=( `wc -l namelist `)
204 | h=1
205 | while [ $h -le $j ]
206 | do
207 | t="Sample"$h
208 | b=$h-1
209 | SS1="$t"
210 | SS2="${NAMES[$b]}"
211 | sed -i 's/'$t'/'$SS2'/' SNPS.vcf 
212 | let h++
213 | done
214 | 
215 | echo "Using VCFtools to parse SNPS.vcf for SNPS that are not indels and are called in at least 90% of individuals"
216 | vcftools --vcf SNPS.vcf --geno 0.9 --out Final --counts --recode --non-ref-af 0.001 --remove-indels &>VCFtools.log
217 | 
218 | tail Final.log	
219 | 
220 | if [ ! -d "logfiles" ]; then
221 | mkdir logfiles
222 | fi
223 | mv *.txt *.log log ./logfiles 2> /dev/null
224 | 
225 | echo -e "dDocent has finished with an analysis in" `pwd` "\n\n"`date` "\n\ndDocent 1.0 \nThe 'd' is silent, hillbilly." | mailx -s "dDocent has finished" $MAIL
226 | }
227 | 
228 | TrimReads () 
229 | { for i in "${NAMES[@]}"
230 | do
231 | echo "Trimming Sample $i"
232 | trim_galore --paired -q 10 --length 20 -a GATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATATCGTATGCCGTCTTCTGCTTG -a2 GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCG --stringency 10 $i.F.fq $i.R.fq 2> $i.trim.log
233 | mv $i.F_val_1.fq $i.R1.fq
234 | mv $i.R_val_2.fq $i.R2.fq
235 | done
236 | }
237 | 
238 | #Use Rainbow to cluster and assemble reads
239 | 
240 | setupRainbow ()
241 | { echo "Concatenating F and R reads of up to 10 individuals for assembly"
242 | cat ${SIZE[0]}.F.fq > forward
243 | cat ${SIZE[0]}.R.fq > reverse
244 | 
245 | for ((i = 1; i <= $MaxSize; i++));
246 | do
247 | cat ${SIZE[$i]}.F.fq >> forward
248 | cat ${SIZE[$i]}.R.fq >> reverse
249 | done
250 | 
251 | seqtk seq -r forward > forwardRC
252 | mergefq.pl reverse forwardRC concat.fq
253 | 
254 | #Use Rainbow to cluster and assemble reads
255 | echo "Using rainbow to cluster and assemble reads"
256 | rainbow cluster -1 concat.fq -m 6 > cat.rbcluster.out 2> log
257 | rainbow div -i cat.rbcluster.out -o cat.rbdiv.out -f $MinAll
258 | rainbow merge -a -i cat.rbdiv.out -o cat.rbasm.out
259 | select_best_rbcontig.pl cat.rbasm.out > rainbow
260 | cat rainbow | sed s/./N/96 | sed s/./N/97 | sed s/./N/98 | sed s/./N/99 | sed s/./N/100 | sed s/./N/101 | sed s/./N/102 | sed s/./N/103 | sed s/./N/104 | sed s/./N/105 > rainbowN
261 | cd-hit-est -i rainbowN -o referencegenome -T 0 -c $simC -M 0 -l 30 &>cdhit.log
262 | }
263 | 
264 | main
265 | 


--------------------------------------------------------------------------------
/scripts/untested/old/dDocent.sh.backup:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | t
  3 | #########dDocent 1.0##################################################################################
  4 | 
  5 | 
  6 | #This script serves as an interactive bash wrapper to QC, assemble, map, and call SNPs from double digest RAD data.
  7 | #It requires that your raw data are split up by tagged individual and follow the naming convenction of:
  8 | 
  9 | #Sample1.F.fq and Sample1.R.fq
 10 | 
 11 | #############################################################################################
 12 | 
 13 | NumInd=$(ls *.F.fq | wc -l)
 14 | NumInd=$(($NumInd - 0))
 15 | 
 16 | echo -e "dDocent 1.0 by J. Puritz for Gold lab \n"
 17 | echo -e "Contact jpuritz@gmail.com with any problems \n\n "
 18 | 
 19 | echo "$NumInd individuals are detected is this correct? Enter yes or no and press [ENTER]"
 20 | 
 21 | read Indcorrect
 22 | 
 23 | if [ "$Indcorrect" == "no" ]; then
 24 |         echo "Please double check that all fastq files are named Ind01.F.fq and Ind01.R.fq"
 25 |         exit 1
 26 | elif [ "$Indcorrect" == "yes" ]; then
 27 |     	echo "Proceeding with $NumInd individuals"
 28 | else
 29 | 	echo "Incorrect Input"
 30 | 	exit 1
 31 | fi
 32 | 
 33 | if [ $NumInd -gt 9 ]
 34 |         then
 35 | 	MinAll=0.05
 36 | 	MaxSize=9
 37 |         else
 38 | 	MinAll=$(echo "scale=2; 1 / (2 * $NumInd) " | bc)
 39 | 	MaxSize=$(( $NumInd - 1 ))
 40 | fi
 41 | 
 42 | 
 43 | ls *.F.fq > namelist
 44 | sed -i 's/.F.fq//g' namelist
 45 | NAMES=( `cat "namelist" `)
 46 | 
 47 | ls -S *.F.fq > sizelist
 48 | sed -i 's/.F.fq//g' sizelist
 49 | SIZE=( `cat "sizelist" `)
 50 | 
 51 | 
 52 | echo "Were these reads processed with STACKS process_radtags?  Type yes or no and press [Enter]"
 53 | echo "If you don't know, answer yes."
 54 | 
 55 | read STACKS
 56 | 
 57 | if [ "$STACKS" == "yes" ]; then
 58 | 
 59 | 	echo "Removing the _1 character and replacing with /1 in the name of every sequence"
 60 | 	for i in "${NAMES[@]}"
 61 | 	do	
 62 | 	sed -e 's:_2$:/2:g' $i.R.fq > $i.Ra.fq
 63 | 	sed -e 's:_1$:/1:g' $i.F.fq > $i.Fa.fq
 64 | 	mv $i.Ra.fq $i.R.fq
 65 | 	mv $i.Fa.fq $i.F.fq
 66 | 	done
 67 | elif [ "$STACKS" == "no" ]; then
 68 |     	echo "Proceeding with out sequence name alteration"
 69 | else
 70 | 	echo "Incorrect input, assuming no."
 71 | fi
 72 | 
 73 | TrimReads () 
 74 | { for i in "${NAMES[@]}"
 75 | do
 76 | echo "Trimming Sample $i"
 77 | trim_galore --paired -q 10 --length 20 -a GATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATATCGTATGCCGTCTTCTGCTTG -a2 GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCG --stringency 10 $i.F.fq $i.R.fq 2> $i.trim.log
 78 | mv $i.F_val_1.fq $i.R1.fq
 79 | mv $i.R_val_2.fq $i.R2.fq
 80 | done
 81 | }
 82 | 
 83 | #Use Rainbow to cluster and assemble reads
 84 | 
 85 | setupRainbow ()
 86 | { echo "Concatenating F and R reads of up to 10 individuals for assembly"
 87 | cat ${SIZE[0]}.F.fq > forward
 88 | cat ${SIZE[0]}.R.fq > reverse
 89 | 
 90 | for ((i = 1; i <= $MaxSize; i++));
 91 | do
 92 | cat ${SIZE[$i]}.F.fq >> forward
 93 | cat ${SIZE[$i]}.R.fq >> reverse
 94 | done
 95 | 
 96 | seqtk seq -r forward > forwardRC
 97 | mergefq.pl reverse forwardRC concat.fq
 98 | 
 99 | #Use Rainbow to cluster and assemble reads
100 | echo "Using rainbow to cluster and assemble reads"
101 | rainbow cluster -1 concat.fq -m 6 > cat.rbcluster.out 2> log
102 | rainbow div -i cat.rbcluster.out -o cat.rbdiv.out -f $MinAll
103 | rainbow merge -a -i cat.rbdiv.out -o cat.rbasm.out
104 | select_best_rbcontig.pl cat.rbasm.out > rainbow
105 | cat rainbow | sed s/./N/96 | sed s/./N/97 | sed s/./N/98 | sed s/./N/99 | sed s/./N/100 | sed s/./N/101 | sed s/./N/102 | sed s/./N/103 | sed s/./N/104 | sed s/./N/105 > rainbowN
106 | simC=0.9
107 | echo "Now using cd-hit to cluster reference sequences by similarity. The -c parameter (% similarity to cluster) may need to be changed for your taxa"
108 | echo "Would you like to enter a new c parameter now? Type yes or no and press [ENTER]"
109 | read optC
110 | if [ "$optC" == "no" ]; then
111 |         echo "Proceeding with default 0.9 value."
112 | 	
113 | elif [ "$optC" == "yes" ]; then
114 |     	echo "Please enter new value for c. Enter in decimal form (For 90%, enter 0.9)"
115 | 	read newC
116 | 	simC=$newC
117 | else
118 | 	echo "Incorrect input. Proceeding with the default value."
119 | fi
120 | 
121 | 
122 | cd-hit-est -i rainbowN -o referencegenome -T 0 -c $simC -M 0 -l 30 &>cdhit.log
123 | }
124 | 
125 | echo "Do you want to quality trim your reads?" 
126 | echo "Answering yes will simultaneously trim reads and perform reference assembly"
127 | echo "Type yes or no and press [ENTER]?"
128 | 
129 | read TRIM
130 | 
131 | if [ "$TRIM" == "yes" ]; then
132 | 	echo "Trimming reads and simultaneously assemblying reference sequences"	
133 | 	TrimReads & 2> trim.log; setupRainbow 2> rainbow.log
134 | 	wait
135 | else
136 | 	echo "Do you need to perform an assembly? Type no and press [ENTER] if you want to skip to read mapping and SNP calling"
137 | 	read ASSEMBLY
138 | 	if [ "$ASSEMBLY" == "yes" ]; then
139 | 		setupRainbow
140 | 	fi
141 | fi
142 | 
143 | 
144 | ##Use BWA to map reads to assembly
145 | 
146 | optA=1
147 | optB=4
148 | optO=6
149 | 
150 | echo "Using BWA to map reads.  You may need to adjust -A -B and -O parameters for your taxa."
151 | echo "Would you like to enter a new parameters now? Type yes or no and press [ENTER]"
152 | read optq
153 | 
154 | if [ "$optq" == "yes" ]; then
155 |         echo "Please enter new value for A (match score).  It should be an integer.  Default is 1."
156 |         read newA
157 |         optA=$newA
158 | 	echo "Please enter new value for B (mismatch score).  It should be an integer.  Default is 4."
159 |         read newB
160 |         optB=$newB
161 | 	echo "Please enter new value for O (gap penalty).  It should be an integer.  Default is 6."
162 |         read newO
163 |         optO=$newO
164 | else
165 | 	echo "Proceeding with default values."
166 | fi
167 | 
168 | 
169 | bwa0.7 index -a bwtsw referencegenome &> index.log
170 | 
171 | for i in "${NAMES[@]}"
172 | do
173 | bwa0.7 mem referencegenome $i.R1.fq $i.R2.fq -t 32 -a -T 10 -A $optA -B $optB -O $optO > $i.sam 2> bwa.$i.log
174 | done
175 | 
176 | ##Convert Sam to Bam and remove low quality, ambiguous mapping
177 | for i in "${NAMES[@]}"
178 | do
179 | samtools view -bT referencegenome -q1 $i.sam > $i.bam 2>$i.bam.log
180 | samtools sort $i.bam $i
181 | done
182 | 
183 | samtools faidx referencegenome
184 | 
185 | #Calling of SNPs from two samples must have a minimum read of depth of 10 and below 200 with a minimum quality score of 20
186 | echo "Using samtools to pileup reads"
187 | samtools mpileup -D -f referencegenome *.bam >mpileup 2> mpileup.log
188 | echo "Using VarScan2 to call SNPs with at least 5 reads (within 1 individual), 95% probability, and at least 2 reads for the minor allele"
189 | java -jar /usr/local/bin/VarScan.v2.3.5.jar mpileup2snp mpileup --output-vcf --min-coverage 5 --strand-filter 0 --min-var-freq 0.1 --p-value 0.05 >SNPS.vcf 2>varscan.log
190 | echo "Using VCFtools to parse SNPS.vcf for SNPS that are not indels and are called in at least 90% of individuals"
191 | vcftools --vcf SNPS.vcf --geno 0.9 --out Final --counts --recode --non-ref-af 0.001 --remove-indels &>VCFtools.log
192 | 
193 | tail Final.log	
194 | 
195 | if [ ! -d "logfiles" ]; then
196 | mkdir logfiles
197 | fi
198 | mv *.txt *.log log ./logfiles
199 | 
200 | echo `date` "It's Finished!" 
201 | 
202 | 


--------------------------------------------------------------------------------
/scripts/untested/old/filter_lowdepth_ind.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | vcftools --vcf $1 --depth --out $2
4 | 
5 | CUTOFF=$(mawk '!/IN/' $2.idepth | cut -f3 | sort -rn | perl -e '$d=.85;@l=<>;print $l[int($d*$#l)]')
6 | mawk -v x=$CUTOFF '$3 < x' $2.idepth | cut -f1 > lowDP.indv
7 | 
8 | vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2
9 | 


--------------------------------------------------------------------------------
/scripts/untested/old/filter_missing_ind.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | vcftools --vcf $1 --missing --out $2
 4 | 
 5 | CUTOFF=$(mawk '!/IN/' $2.imiss | cut -f5 | sort -rn | perl -e '$d=.14;@l=<>;print $l[int($d*$#l)]')
 6 | #echo $CUTOFF
 7 | 
 8 | mawk '!/IN/' $2.imiss | cut -f5 > totalmissing
 9 | 
10 | gnuplot << \EOF 
11 | set terminal dumb size 120, 30
12 | set autoscale 
13 | unset label
14 | set title "Histogram of % missing data per individual"
15 | set ylabel "Number of Occurrences"
16 | set xlabel "% of missing data"
17 | #set yr [0:100000]
18 | binwidth=0.01
19 | bin(x,width)=width*floor(x/width) + binwidth/2.0
20 | plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes
21 | pause -1
22 | EOF
23 | 
24 | echo "The 85% cutoff would be" $CUTOFF
25 | echo "Would you like to set a different cutoff, yes or no"
26 | 
27 | read NEWCUTOFF
28 | 
29 | if [ "$NEWCUTOFF" != "yes" ]; then
30 | 
31 | mawk -v x=$CUTOFF '$5 > x' $2.imiss | cut -f1 > lowDP.indv
32 | 
33 | vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2
34 | 
35 | else
36 | 
37 | echo "Please enter new cutoff"
38 | 
39 | read CUTOFF2
40 | 
41 | mawk -v x=$CUTOFF2 '$5 > x' $2.imiss | cut -f1 > lowDP.indv
42 | 
43 | vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2
44 | fi
45 | 


--------------------------------------------------------------------------------
/scripts/untested/old/filter_paralog.sh:
--------------------------------------------------------------------------------
1 | bgzip $1
2 | tabix -p vcf $1.gz
3 | vcf-annotate --filter c=$2,$3 $1.gz | mawk '!/SnpC/' > $4.vcf
4 | vcf-annotate --filter c=$2,$3 $1.gz | mawk '!/#/' | mawk '/SnpC/' | wc -l
5 | gunzip $1
6 | 


--------------------------------------------------------------------------------
/scripts/untested/old/filter_vcf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | vcftools --vcf SNP-VQSR-PASS.vcf --minQ 20 --geno 0.5 --mac 2 --recode --recode-INFO-all --out TRSg
 4 | vcftools --vcf TRSg.recode.vcf --minDP 4 --recode --recode-INFO-all --out TRSgdp4 &
 5 | vcftools --vcf TRSg.recode.vcf --minDP 3 --recode --recode-INFO-all --out TRSgdp3 &
 6 | vcftools --vcf TRSg.recode.vcf --minDP 5	--recode --recode-INFO-all --out TRSgdp5 &
 7 | vcftools --vcf TRSg.recode.vcf --minDP 10 --recode --recode-INFO-all --out TRSgdp10 &
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/scripts/untested/old/hwe_filter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo -e "This script will take a text file with individual names and population names \n and output a file of loci found to be out of HWE within populations"
 4 | 
 5 | echo "Proper execution is: sh hwe_filter.sh file_with_names_and_pops p-value"
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/scripts/untested/old/jpuritz@gdcws1.ethz.ch:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ls *-RG.bam >bamlist.list; bamtools merge -list bamlist.list > full.bam
 4 | samtools index full.bam &>/dev/null
 5 | samtools idxstats full.bam 2>/dev/null | cut -f3 > coverage
 6 | rm full.bam*
 7 | 
 8 | gnuplot << \EOF 
 9 | set terminal dumb size 120, 30
10 | set autoscale
11 | #set xrange [10:150] 
12 | unset label
13 | set title "Histogram of coverage per contig"
14 | set ylabel "Number of Occurrences"
15 | set xlabel "Coverage"
16 | #set yr [0:100000]
17 | binwidth=10
18 | bin(x,width)=width*floor(x/width) + binwidth/2.0
19 | set xtics 100
20 | plot 'coverage' using (bin($1,binwidth)):(1.0) smooth freq with boxes
21 | pause -1
22 | EOF
23 | 
24 | AVE=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' coverage)
25 | NZ=$(mawk '$1 == 0' coverage | wc -l)
26 | NZ=$(($NZ + 0))
27 | 
28 | echo "The average coverage per contig is" $AVE
29 | echo "There are" $NZ "contigs with zero coverage"
30 | 


--------------------------------------------------------------------------------
/scripts/untested/old/lowQD.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | cut -f8 $1 | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > $1.DEPTH
 5 | mawk '!/#/' $1 | cut -f1,2,6 > $1.loci.qual
 6 | DEPTH=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.DEPTH)
 7 | QUAL=$(python -c "print $DEPTH * 2")
 8 | paste $1.loci.qual $1.DEPTH | mawk -v x=$DEPTH '$4 > x'| mawk '$3 < 1.0 * $4' > $1.lowQDloci
 9 | 
10 | #DEPTH=$(awk '{print $1}' $1.DEPTH | sort -rn | perl -e '$d=.25;@l=<>;print $l[int($d*$#l)]')
11 | DEPTH=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.DEPTH)
12 | SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.DEPTH)
13 | DEPTH=$(python -c "print int("$DEPTH") + int("$SD")")
14 | paste $1.loci.qual $1.DEPTH | mawk -v x=$DEPTH '$4 > x'| mawk '$3 < 2 * $4' >> $1.lowQDloci
15 | 
16 | vcftools --vcf $1 --exclude-positions $1.lowQDloci --recode --recode-INFO-all --out $1.LowQDFIL
17 | echo $DEPTH
18 | echo $QUAL
19 | #vcffilter -f "DP > $DEPTH & QUAL < $QUAL" $1.inter.recode.vcf -t lowQD -F PASS > $1.inter2
20 | #vcftools --vcf $1.inter2 --remove-filtered lowQD --recode --recode-INFO-all --out $1.LQDFIL
21 | 


--------------------------------------------------------------------------------
/scripts/untested/old/pop_missing_filter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ -z "$2" ]]; then
 4 | echo "Usage is pop_missing_filter vcffile popmap percent_missing_per_pop number_of_pops_for_cutoff name_for_output"
 5 | exit 1
 6 | fi
 7 | 
 8 | POPS=( `cut -f2 $2 | sort | uniq `)
 9 | rm badloci
10 | 
11 | for i in "${POPS[@]}"
12 | do
13 | grep -w $i $2 | cut -f1 > keep.$i
14 | vcftools --vcf $1 --keep keep.$i --missing --out $i 
15 | mawk '!/CHROM/' $i.lmiss | mawk -v x=$3 '$6 > x' | cut -f1,2 >> badloci
16 | done
17 | 
18 | mawk '!/CH/' badloci | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$4 '$1 > x' | cut -f2,3  > loci.to.remove
19 | 
20 | #sort badloci | uniq > loci.to.remove
21 | 
22 | vcftools --vcf $1 --exclude-positions loci.to.remove --recode --recode-INFO-all --out $5
23 | 
24 | 


--------------------------------------------------------------------------------
/scripts/untested/old/reference.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | NAMES=( `cat "namelist" `)
 4 | 
 5 | 
 6 | mawk -v x=$1 '$1 >= x' uniq.seqs | cut -f 2 > totaluniqseq
 7 | 
 8 | 
 9 | #Convert reads to fasta
10 | uniq2fasta totaluniqseq > uniq.fasta
11 | 
12 | #Perl function to split contigs by length
13 | 
14 | sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f1 > uniq.F.fasta
15 | sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f2 > uniq.R.fasta
16 | 
17 | #sed -i'' -e 's/_.* (.*)/_1/g' uniq.F.fasta
18 | #sed -i'' -e 's/_.*_.* (.*)/_2/g' uniq.R.fasta 
19 | 
20 | seqtk seq -r uniq.R.fasta > uniq.RC.fasta
21 | rm uniq.R.fasta
22 | 
23 | #Now use rainbow to cluster and assemble reads into longer contigs
24 | rainbow cluster -m 6 -1 uniq.F.fasta -2 uniq.RC.fasta > rcluster
25 | rainbow div -i rcluster -o rbdiv.out -f 0.01
26 | rainbow merge -o rbasm.out -a -i rbdiv.out -r 2
27 | select_best_rbcontig_plus_read1.pl rbasm.out rbdiv.out >rainbow.fasta
28 | 
29 | #cd-hit to cluster reads based on sequence similarity
30 | cd-hit-est -i rainbow.fasta -o referenceRC.fasta -mask N -M 0 -T 0 -c 0.9 &>cdhit.log
31 | 
32 | seqtk seq -r referenceRC.fasta > reference.fasta.original
33 | 
34 | sed -e 's/^C/NC/g' -e 's/^A/NA/g' -e 's/^G/NG/g' -e 's/^T/NT/g' -e 's/T$/TN/g' -e 's/A$/AN/g' -e 's/C$/CN/g' -e 's/G$/GN/g' reference.fasta.original > reference.fasta
35 | 
36 | samtools faidx reference.fasta
37 | bwa index reference.fasta
38 | 
39 | SEQS=$(cat reference.fasta | wc -l)
40 | SEQS=$(($SEQS / 2 ))
41 | 
42 | echo -e "KSEQS\t" $SEQS
43 | 


--------------------------------------------------------------------------------
/scripts/untested/old/reftest.old.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for i in {3..40}
 4 | do
 5 | echo "K is $i" >>test
 6 | sh ./reference.sh $i
 7 | bwa mem reference.fasta JC_1161.R1.fq JC_1161.R2.fq -L 100,5 -t 32 -a -M -T 10 -A 1 -B 3 -O 5 -R "@RG\tID:test\tSM:test\tPL:Illumina" 2> /dev/null | mawk '!/\t[2-9].S.*/' | mawk '!/[2-9].S\t/' | samtools view -@32 -q 1 -SbT reference.fasta - 2>> test | samtools flagstat - 2> /dev/null >> test
 8 | done
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/scripts/untested/old/reftest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | rm kopt.data
 4 | 
 5 | for i in {3..30}
 6 | do
 7 | echo "K is $i"
 8 | SEQS=$(./reference.sh $i 2>/dev/null | mawk '/KSEQS/' | cut -f2)
 9 | echo $i $SEQS >> kopt.data
10 | done
11 | 
12 | 
13 | #Plot graph of above data
14 | gnuplot << \EOF 
15 | set terminal dumb size 120, 30
16 | set autoscale 
17 | unset label
18 | set title "Number of Unique Sequences with More than X Occurrences"
19 | #set xlabel "Number of Occurrences"
20 | set ylabel "Number of Unique Sequences
21 | #set yr [0:100000]
22 | plot 'kopt.data' with dots notitle
23 | pause -1
24 | EOF
25 | 
26 | 


--------------------------------------------------------------------------------
/scripts/untested/old/remove_bad_loci_for_hap.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #NAMES=$(cut -f1 $1 | sort | uniq)
 4 | NAMES=( `cat $1 | cut -f1 | sort | uniq `)
 5 | 
 6 | NAME=$(echo $2 | sed -e 's/\.recode.*//g') 
 7 | LEN=${#NAMES[@]}
 8 | 
 9 | mawk -v x=${NAMES[0]} '!/x/' $NAME.recode.vcf > $NAME.0.t.vcf
10 | 
11 | for ((i = 1; i < $LEN; i++));
12 | do
13 | j=$(($i - 1))
14 | mawk -v x=${NAMES[$i]} '!/x/' $NAME.$j.t.vcf > $NAME.$i.t.vcf
15 | done
16 | 
17 | LAST=$(($LEN - 1))
18 | 
19 | mv $NAME.$LAST.t.vcf $NAME.filtered.vcf
20 | 
21 | rm $NAME.*.t.vcf
22 | 


--------------------------------------------------------------------------------
/scripts/untested/pi_sample.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #Script to randomly sample 18 individuals from populations and calculate pi
 4 | 
 5 | mawk1='{ sum += $5; n++ } END { if (n > 0) print sum / n; }'
 6 | 
 7 | rm sample.pi sample.pi.pop subsample.pi
 8 | 
 9 | for i in {1..100}
10 | do
11 | mawk '/AR_/' kept.popmap | shuf | head -15 | cut -f1 > AR1.keep
12 | mawk '/ARB_/' kept.popmap | shuf | head -15 | cut -f1 > AR2.keep
13 | mawk '/MB_/' kept.popmap | shuf | head -15 | cut -f1 > NR1.keep
14 | mawk '/MBB_/' kept.popmap | shuf | head -15 | cut -f1 > NR2.keep
15 | mawk '$2 =="EL"' kept.popmap | shuf | head -15 | cut -f1 > NR4.keep
16 | mawk '$2 =="ELA"' kept.popmap | shuf | head -15 | cut -f1 > NR3.keep
17 | mawk '$2 =="JC"' kept.popmap | shuf | head -15 | cut -f1 > AP1.keep
18 | mawk '$2 =="PC"' kept.popmap | shuf | head -15 | cut -f1 > AP2.keep
19 | ls *.keep | parallel --no-notice vcftools --vcf SNP.TRSdp5MIp25g9HWEHFv2.neutralmaf025.recode.vcf --keep {} --out {.} --site-pi &> /dev/null
20 | ls *.sites.pi | parallel --no-notice mv {} {.}
21 | ls *.sites | parallel --no-notice "mawk '$mawk1' {} >> sample.pi && echo {.} >> sample.pi.pop"
22 | done
23 | 
24 | paste sample.pi sample.pi.pop > subsample.pi
25 | 


--------------------------------------------------------------------------------
/scripts/untested/pop_missing_filter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ -z "$2" ]]; then
 4 | echo "Usage is pop_missing_filter vcffile popmap percent_missing_per_pop number_of_pops_for_cutoff name_for_output"
 5 | exit 1
 6 | fi
 7 | 
 8 | POPS=( `cut -f2 $2 | sort | uniq `)
 9 | rm badloci
10 | 
11 | for i in "${POPS[@]}"
12 | do
13 | grep -w $i $2 | cut -f1 > keep.$i
14 | vcftools --vcf $1 --keep keep.$i --missing --out $i 
15 | mawk '!/CHROM/' $i.lmiss | mawk -v x=$3 '$6 > x' | cut -f1,2 >> badloci
16 | done
17 | 
18 | mawk '!/CH/' badloci | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' | mawk -v x=$4 '$1 >= x' | cut -f2,3  > loci.to.remove
19 | 
20 | #sort badloci | uniq > loci.to.remove
21 | 
22 | vcftools --vcf $1 --exclude-positions loci.to.remove --recode --recode-INFO-all --out $5
23 | 
24 | 


--------------------------------------------------------------------------------
/scripts/untested/remove.bad.hap.loci.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | 
4 | NAME=$(echo $2 | sed -e 's/\.recode.*//g') 
5 | 
6 | grep -vwf <(cut -f1 $1) $2 > $NAME.filtered.vcf
7 | 
8 | 


--------------------------------------------------------------------------------
/scripts/untested/untested.new:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpuritz/dDocent/414d6f3f3fba0be75d01da2797629c3808571113/scripts/untested/untested.new


--------------------------------------------------------------------------------
/scripts/untested/vardist.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | vcfdistance <$1 | grep -oh 'BasesToClosestVariant=[0-9]*' | sed -e 's/BasesToClosestVariant=//g' > vardist
 4 | 
 5 | gnuplot << \EOF 
 6 | set terminal dumb size 120, 30
 7 | set autoscale
 8 | #set xrange [10:150]
 9 | unset label
10 | set title "Histogram of distance to closest variant"
11 | set ylabel "Number of Occurrences"
12 | set xlabel "Mean Depth"
13 | #set yr [0:100000]
14 | binwidth=1
15 | bin(x,width)=width*floor(x/width) + binwidth/2.0
16 | set xtics 5
17 | plot 'vardist' using (bin($1,binwidth)):(1.0) smooth freq with boxes
18 | pause -1
19 | EOF
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/tutorials/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jpuritz/dDocent/414d6f3f3fba0be75d01da2797629c3808571113/tutorials/.DS_Store


--------------------------------------------------------------------------------
/tutorials/README.md:
--------------------------------------------------------------------------------
 1 | #Welcome to the Tutorial Section
 2 | 
 3 | There are two different tutorials.  One works on assembling an optimal reference and the other works on filtering SNP data.
 4 | 
 5 | In the eternal words of Baz Luhrman:
 6 | 
 7 | > Advice is a form of nostalgia. Dispensing it is a way of fishing the past from the disposal, wiping it off, painting over the ugly parts and recycling it for more than it's worth.
 8 | 
 9 | **The guidelines in here are not intended to be a best practices document**
10 | **Do not blindly apply them to your own data.  Take the time to experiment**
11 | 
12 | In this directory, you will find 4 critical files
13 | 
14 | 1. RefTut
15 | 2. FilterTut
16 | 3. Filtering Tutorial.md
17 | 4. Reference Assembly Tutorial.md
18 | 
19 | The tutorials can be experienced in two different formats:
20 | 
21 | 1.  Executable text files to be view in the terminal
22 | 2.  Github formated .md files to be viewed through github.
23 | 
24 | 


--------------------------------------------------------------------------------