├── 00_build_index_hisat2.sh
├── 01_lastz_prepare_kmers.sh
├── CopyNumberGen.sh
├── ExtractSCOs.sh
├── ExtractSeq.sh
├── Fastq2CA.sh
├── GI2Taxonomy.sh
├── GOseq_auto.sh
├── GuessEncoding.sh
├── JRC.sh
├── JobBLAST.sh
├── JobP128.sh
├── JobQ.sh
├── JobR.sh
├── JobR_BLAST.sh
├── JobR_condo.sh
├── JobTime.sh
├── JobTime2.sh
├── Khmer_interleave.sh
├── L3.ModuleGen.sh
├── LAS-makeSLURMp.py
├── Lines_based_file_deletion.sh
├── ModuleGen.sh
├── N50Calc.pl
├── PcB_filter_stats.sh
├── Quiver_01.sh
├── Quiver_02.sh
├── README.md
├── Rscript_for_plotting_bayescan_results.sh
├── Rscripts
│   ├── ElementScript.R
│   ├── Elementsource.R
│   ├── QS.R
│   └── gff2gtf.R
├── SAM_to_sortedBAM.sh
├── SOYGO
│   ├── ANML
│   │   ├── ANML.4Kevin
│   │   ├── ANML.BP.final
│   │   ├── ANML.MF.final
│   │   ├── ANML.list
│   │   ├── ANML.list_BP.txt
│   │   ├── ANML.list_BP_fisher.txt
│   │   ├── ANML.list_BP_names.txt
│   │   ├── ANML.list_BP_out.txt
│   │   ├── ANML.list_BP_output.txt
│   │   ├── ANML.list_ERROR.txt
│   │   ├── ANML.list_MF.txt
│   │   ├── ANML.list_MF_fisher.txt
│   │   ├── ANML.list_MF_names.txt
│   │   ├── ANML.list_MF_out.txt
│   │   ├── ANML.list_MF_output.txt
│   │   └── ANML.list~
│   ├── ATH_GO_GOSLIM.022714
│   ├── GOinfoKevinupdate2.pl
│   ├── Gmv2_GODb
│   ├── README_SOYBASE
│   ├── SOYGO.pl
│   ├── Script_Fisher.R
│   ├── SoyGO.sh
│   └── combinefilesbyGO.pl
├── SRAfq2FASTQ.sh
├── a_loghistory_func.sh
├── a_longhistory_func.sh
├── abacas.1.3.1.pl
├── acd_func.sh
├── addFastaHeaders.pl
├── backup
├── bayes_script.sh
├── biostar
├── biostar.cpp
├── blastXML2Tab.py
├── blast_job_gen.sh
├── blast_wrapper.sh
├── bowtie2_se_noclip.sh
├── bwa_map_sort.sh
├── cb
├── cdgit
├── cegma.sh
├── checkBlastStatus.sh
├── clean_trinity.sh
├── combineRawFiles.sh
├── configFiles
│   ├── PcB_subreadfiltering_settings.xml
│   └── vcf_structure.spid
├── connected_graphs.awk
├── count.sh
├── count_fastq.sh
├── create_sam.sh
├── decison_tree_for_picking_common_strata_for_orthogroups.sh
├── dos2unix
├── downloadSRA_ebi.sh
├── edit_sam_files.sh
├── end_time.sh
├── extract_seq.sh
├── extract_seq.sh.save
├── fasta-splitter.pl
├── fasta2fastq.py
├── fastaMulti2singleLine.pl
├── fastaSortByName.pl
├── fasta_distribution.pl
├── fasta_length.py
├── fastarange
├── fastarange.c
├── fastasplitn
├── fastasplitn.c
├── fastq-splitter.pl
├── fastq2fasta.sh
├── fastqc_parse.py
├── fastqc_stats.sh
├── fasttrans
├── fasttrans.c
├── filecount
├── filter_parllel_log.sh
├── firstInstanceOf.awk
├── fna_qual2fastq.py
├── formatOut2Tab.sh
├── fq2bam.sh
├── genome-gaps-as-bed.py
├── get_GitHub_file
│   ├── get_GitHub_file.sh
│   └── get_GitHub_folder.sh
├── get_ip
├── get_taxanomy.sh
├── gff2fasta.pl
├── gitgrep
├── gmap_cdna.sh
├── gsize.sh
├── gsnap_pe.sh
├── gsnap_pe2.sh
├── gsnap_pe_clip.sh
├── gsnap_pe_clip_final.sh
├── gsnap_pe_noclip.sh
├── gsnap_pe_noclip_final.sh
├── gsnap_se.sh
├── gsnap_stats.sh
├── gtf2gff3.pl
├── guess_encoding.py
├── header_replace_fasta.pl
├── histogram.awk
├── htseq_count.sh
├── interleave_PE_fastq.sh
├── intervalBins.awk
├── join.pl
├── join_files.sh
├── joinr.sh
├── khmer_pe.sh
├── mac2unix
├── makeLocalSLURMp.py
├── makeLocalSLURMs.py
├── makeNovaSLURMs.py
├── makePBSp.py
├── makePBSs.py
├── makeSLURM_bridges.py
├── makeSLURMp.py
├── makeSLURMp_ceres.py
├── makeSLURMs.py
├── makeSLURMs_ceres.py
├── mayday.sh
├── md
├── mismatch-counter.sh
├── mpblast.pl
├── nanoVersions.sh
├── newModuleGen.sh
├── new_Assemblathon.pl
├── noTabCompletion.sh
├── numberPatMatch_byLine
├── parse.sh
├── pathadd
├── pb_errc.sh
├── pb_errc2.sh
├── prepare_genome_modules.old
├── qcMarkdownGenerator.sh
├── qiime_config.sh
├── qn
├── qourum_ec.sh
├── quedel.sh
├── readme.md
├── removeRedundantScaffolds.sh
├── removeSeqsFromReads.sh
├── renamed_results.sh
├── reorder_fasta.py
├── rowsums
├── rsem_analyize_dge.sh
├── rsem_dge.sh
├── rsem_estm_matrix.sh
├── runAugustus.sh
├── runBUSCO.sh
├── runBWAcrossspp.sh
├── runBayesScan.sh
├── runBlobtools.sh
├── runBraker.sh
├── runDigiNorm_pe.sh
├── runGenomeScope.sh
├── runGenomeScope_arun.sh
├── runGmap.sh
├── runGuidenceAA.sh
├── runGuidenceNT.sh
├── runHISAT2.sh
├── runIRF.sh
├── runLTRfinder.sh
├── runMaSuRCA.sh
├── runMegablast.sh
├── runMinimap2-cDNA.sh
├── runPlatanus.sh
├── runRAxML.sh
├── runRepeatExplorer.sh
├── runRepeatModeler.sh
├── runSNPHYLO.sh
├── runSOAP.sh
├── runSPAdes.sh
├── runSTAR.sh
├── runSpliceSites.sh
├── runStringtie.sh
├── runSynteny.sh
├── runTabix.sh
├── runTasselGWAS.sh
├── runTraning.sh
├── run_gmap.sh
├── rundipSPAdes.sh
├── sample_fastq.py
├── scaffold2contig.pl
├── scores2dokuwiki.sh
├── seprate_paired_reads.sh
├── soap_config_file.txt
├── split_submit.sh
├── sr_config_MaSuRCA.txt
├── sr_config_MaSuRSOAP.txt
├── star_build.sh
├── sub
├── summary.sh
├── taxadump_gen.sh
├── taxaids2division.sh
├── taxaids2lineage.sh
├── taxid_ranks.py
├── template.slurm
├── test_bioperl.pl
├── tophat_se.sh
├── transpose.awk
├── trim_pe.sh
├── trim_se.sh
├── trimmomatic_pe.sh
├── trimmomatic_se.sh
├── unix2dos
├── unix2mac
├── validate_features.pl
├── vcf-subset.py
└── walltime

--------------------------------------------------------------------------------
/00_build_index_hisat2.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#if [ "$#" -ne 2 ] ; then
#echo "please provide:"
#echo -e "\t\t(1) name for index, preferably the NAM line name"
#echo -e "\t\t(2) genome sequences, use only scaffolds"
#echo "";
#echo "./00_build_index_hisat2 <index name> <genome file>" ;
#echo "";
#exit 0;
#fi

NAM=$(basename $(dirname $(pwd)))
file=$1

module load hisat2
hisat2-build ${file} $NAM
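
# Usage sketch (hypothetical paths): the index prefix is taken from the parent
# of the working directory, so running from a directory such as
# /work/NAM/B73/hisat2_index:
#   ./00_build_index_hisat2.sh B73.scaffolds.fasta
# is equivalent to: hisat2-build B73.scaffolds.fasta B73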

--------------------------------------------------------------------------------
/01_lastz_prepare_kmers.sh:
--------------------------------------------------------------------------------
#!/bin/bash
############
# files needed:
# 1. Genome file to which the sequences need to be aligned
# 2. Output folder where all the temp and final results will be written
# 3. Draft assembly sequences
# Enter them in that order for this script.
# Note that for all the files you need to enter the full path.
############

## define where the scripts are located (for add_contig_ID_to_seg.pl & merge_kmer_ranges.pl)
BINDIR="/data003/GIF/arnstrm/20150506_Hufford_maker/alignment/raw_files/kmer_align"
CPU=48 # number of parallel jobs
KMER=23 # kmer size
TASSEL="/home/arnstrm/tassel/dist"
MEM=1500 # total system memory in GB
MPJ=$((${MEM} / ${CPU})) # per-job memory in GB (integer division)

# modules required
module load parallel
module load java
module load tassel/5.2.16
module load perl
module load python

# easy names
TARGETDIR="$2"
REF="$1"
ASSEMBLY="$3"
OUTNAME=$(basename "${ASSEMBLY%.*}") # output prefix; never set in this copy of the script, so it is derived from the assembly file name here

# set-up for analyses
mkdir -p ${TARGETDIR} && cd ${TARGETDIR}
rm -f tasks_java tasks_sort tasks_merge

# generate job lists
for f in $(echo {A,T,G,C}{A,T,G,C}{A,T,G,C}); do
  echo "java -Xmx${MPJ}g -cp ${TASSEL}/tassel5-active.jar net.maizegenetics.ed.t5.KmerAnalysisVB1 $KMER ${f} ${OUTNAME}_${f} ${REF} ${ASSEMBLY}" >> tasks_java
  echo "sort -S ${MPJ}G -k 1,1 -k 2,2n ${OUTNAME}_${f}.seg >& ${OUTNAME}_${f}.seg.srt12" >> tasks_sort
  echo "${BINDIR}/merge_kmer_ranges.pl ${OUTNAME}_${f}.seg.srt12 >& ${OUTNAME}_${f}.seg.srt12.mrg" >> tasks_merge
done

parallel --jobs ${CPU} --joblog java_tasks.log --workdir $PWD < tasks_java
parallel --jobs ${CPU} --joblog java_sort.log --workdir $PWD < tasks_sort
parallel --jobs ${CPU} --joblog java_merge.log --workdir $PWD < tasks_merge
cat ${OUTNAME}_???.seg.srt12.mrg | sort -S ${MEM}G --parallel=${CPU} -k 1,1 -k 2,2n >& ${OUTNAME}.mrg.srt12
${BINDIR}/merge_kmer_ranges.pl ${OUTNAME}.mrg.srt12 >& ${OUTNAME}.mrg.srt12.mrg
${BINDIR}/add_contig_ID_to_seg.pl ${OUTNAME}.mrg.srt12.mrg ${REF} ${ASSEMBLY} | sort -k 8,8n -k 7,7r -k 5,5n - | cut -d " " -f 8 --complement > ${OUTNAME}.mrg.qsrt
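
# Usage sketch (hypothetical full paths, in the order the script expects:
# reference genome, output folder, draft assembly):
#   ./01_lastz_prepare_kmers.sh /work/ref/genome.fasta /work/kmer_out /work/asm/draft.fasta
# The per-prefix .seg files, their sorted/merged intermediates, and the final
# kmer table (<prefix>.mrg.qsrt) are all written to /work/kmer_out.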

--------------------------------------------------------------------------------
/CopyNumberGen.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is a bash script that generates a table of the number of orthologs present in each species.
# It takes the output generated by "orthomclMclToGroups" and converts ids to numbers.
# By default it prints to stdout.

# Arun Seetharam

scriptName="${0##*/}"

function printUsage() {
cat <<EOF
[...] > ${file}.temp # separate gene ids from the species identifier
names=`head -n 500 ${file}.temp | tr -s " " "\n" | sed '/^".*/d' | sed '/^OG.*/d' | sort | uniq | tr -s "\n" " "; echo ""` # array of all species names
echo -en "OG_name\t" # print the header line
for name in ${names[@]}; do
  echo -en "$name\t";
done
echo "";
while read line; do # count frequency
  ogroup=$(echo $line | cut -d " " -f 1);
  echo -en "$ogroup\t";
  for name2 in ${names[@]}; do
    freq=`echo $line | awk -F "$name2" '{print NF-1}'`;
    echo -ne "$freq\t";
  done
  echo "";
done <${file}.temp;
rm ${file}.temp; # delete temp file

--------------------------------------------------------------------------------
/ExtractSCOs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# This is a bash script that extracts all ortholog groups having a single copy gene in different species.
# It takes the output generated by "CopyNumberGen.sh" and prints single copy orthologs.
# By default it prints to stdout.

# Arun Seetharam

scriptName="${0##*/}"
declare -i DEFAULT_COPY=1
declare -i copynum=DEFAULT_COPY

function printUsage() {
cat <<EOF
[...]

--------------------------------------------------------------------------------
/ExtractSeq.sh:
--------------------------------------------------------------------------------
#!/bin/bash
[...]

scriptName="${0##*/}"
outdir=$(pwd)
function printUsage() {
cat <<EOF
[...] >> ${outdir}/${myArray[0]}.fa;
done <${file}
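
# The three scripts above are meant to be chained; a sketch with hypothetical
# file names (exact options may differ, since the usage blocks are truncated
# in this copy):
#   ./CopyNumberGen.sh groups.txt > counts.txt    # per-species ortholog counts
#   ./ExtractSCOs.sh counts.txt > sco_groups.txt  # keep single-copy groups
#   ./ExtractSeq.sh sco_groups.txt                # write one fasta per orthogroup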

--------------------------------------------------------------------------------
/Fastq2CA.sh:
--------------------------------------------------------------------------------
#!/bin/bash
R1=$2
R2=$3
LB=$1
OUTFILE=$(basename ${R1} | sed 's/_1.fq//g');
fastqToCA -insertsize 500 100 -libraryname $LB -technology illumina -type sanger -innie -mates ${R1},${R2} > ${OUTFILE}.frg

--------------------------------------------------------------------------------
/GI2Taxonomy.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# download nodes.dmp, names.dmp and gi_taxid_nucl.dmp, and provide their location here
# run it as:
# parallel "sh GI2Taxonomy.sh {}" :::: gi_ids.file
#

NAMES="names.dmp"
NODES="nodes.dmp"
GI_TO_TAXID="gi_taxid_nucl.dmp"
TAXONOMY=""
GI="${1}"

# Obtain the name corresponding to a taxid, or the taxid of the parent taxa
get_name_or_taxid()
{
  grep --max-count=1 "^${1}"$'\t' "${2}" | cut --fields="${3}"
}

# Get the taxid corresponding to the GI number
TAXID=$(get_name_or_taxid "${GI}" "${GI_TO_TAXID}" "2")

# Loop until you reach the root of the taxonomy (i.e. taxid = 1)
while [[ "${TAXID}" -gt 1 ]] ; do
  # Obtain the scientific name corresponding to the taxid
  NAME=$(get_name_or_taxid "${TAXID}" "${NAMES}" "3")
  # Obtain the taxid of the parent taxa
  PARENT=$(get_name_or_taxid "${TAXID}" "${NODES}" "3")
  # Build the taxonomy path
  TAXONOMY="${NAME};${TAXONOMY}"
  TAXID="${PARENT}"
done

echo -e "${GI}\t${TAXONOMY}"

exit 0

--------------------------------------------------------------------------------
/GOseq_auto.sh:
--------------------------------------------------------------------------------
#!/bin/bash

module unload R
module load R/3.1.2

# define the GO annotation file
export goAnnot="go_annotation.sorted.uniq.Mapped"
# define the gene lengths file
export geneLen="gene.lengths.txt"

# create a list of your genes of interest, one per line:
# geneID1
# geneID2
# etc
# GOI = Genes of Interest
#export GOI="GOI.txt"
export GOI=$1

# create a factor label for the GOI
#export labelgoi="incOverTm"
export labelgoi=$1

# create the factor labeled file:
# GOI geneID1
# GOI geneID2
# GOI etc
#awk 'BEGIN{OFS="\t"} {print "'$labelgoi'",$1}' $GOI > GOI_factorFile.txt

# create a list of all your genes
export Allgenes="Allgenes.txt"

# create a factor label for all genes
export labelallg="Allgenes"

# create the factor labeled file:
# seriola geneID1
# seriola geneID2
# seriola etc
#awk 'BEGIN{OFS="\t"} {print "'$labelallg'",$1}' $GOI > Allgenes_factorFile.txt

#awk '{print $1}' GOI_factorFile.txt Allgenes_factorFile.txt
cat $GOI $Allgenes | sort | uniq -c | awk 'BEGIN{OFS="\t"} ($1==1) {print "'$labelallg'",$2} ($1==2) {print "'$labelgoi'",$2}' >factor_file_$GOI

run_GOseq.pl --factor_labeling factor_file_$GOI --GO_assignments $goAnnot --lengths $geneLen

mkdir factor_file
mv factor_file_* factor_file
mkdir GO_OUT
mv *enriched GO_OUT/
mv *depleted GO_OUT/
mkdir transcriptsIDs
mv *.txt transcriptsIDs/
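
# Usage sketch (hypothetical gene list): with go_annotation.sorted.uniq.Mapped,
# gene.lengths.txt and Allgenes.txt in the working directory,
#   ./GOseq_auto.sh upregulated_ids.txt
# writes factor_file_upregulated_ids.txt, runs run_GOseq.pl, and moves the
# *.enriched / *.depleted GOseq tables into GO_OUT/.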

--------------------------------------------------------------------------------
/GuessEncoding.sh:
--------------------------------------------------------------------------------
#!/bin/bash
INFILE="$1"
head -n 100000 ${INFILE} | awk '{if(NR%4==0) printf("%s",$0);}' | od -A n -t u1 | awk 'BEGIN{min=100;max=0;}{for(i=1;i<=NF;i++) {if($i>max) max=$i; if($i<min) min=$i;}}END{if(max<=74 && min<59) print "Phred+33"; else if(max>73 && min>=64) print "Phred+64"; else if(min>=59 && min<64 && max>73) print "Solexa+64"; else print "Unknown score encoding!";}'

--------------------------------------------------------------------------------
/JRC.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Script to create a submission script that can then easily be appended to.
# 05/23/2016
# Andrew Severin

function printUsage () {
cat <<EOF
[...]
cat <<JOBHEAD > $1.sub
#!/bin/bash
#PBS -l nodes=1:ppn=16
#PBS -l walltime=$2:00:00
#PBS -N $1
#PBS -o \${PBS_JOBNAME}.o\${PBS_JOBID} -e \${PBS_JOBNAME}.e\${PBS_JOBID}
#PBS -m ae -M $netid@iastate.edu
cd \$PBS_O_WORKDIR
ulimit -s unlimited
chmod g+rw \${PBS_JOBNAME}.[eo]\${PBS_JOBID}
module use /shared/modulefiles
module load LAS/parallel/20150922

JOBHEAD

--------------------------------------------------------------------------------
/JobBLAST.sh:
--------------------------------------------------------------------------------
#!/bin/bash
rm cmds
pre=$(pwd)
for file in trinity_all.part-???.fasta; do
  num=$(echo ${file%.*}| cut -d "-" -f 2);
  echo "blastx -query ${file} -db ${pre}/DATABASE/uniref90.fasta -num_threads 4 -max_target_seqs 1 -outfmt 6 > uniref90.blastx.outfmt6.${num}" >> cmds
done

split -d -l 8 cmds blast_cmds_

for file in blast_cmds_*; do
  num=$(echo ${file} | cut -d "_" -f 3);
  cat <<SUB > blast_${num}.sub # "SUB" delimiter assumed; lost in this copy
#!/bin/bash
#PBS -l vmem=256Gb,pmem=8Gb,mem=256Gb
#PBS -l nodes=1:ppn=32:ib
#PBS -l walltime=48:00:00
#PBS -N blast_${num}
#PBS -o \${PBS_JOBNAME}.o\${PBS_JOBID} -e \${PBS_JOBNAME}.e\${PBS_JOBID}
#PBS -m ae -M arnstrm@gmail.com
cd \$PBS_O_WORKDIR
ulimit -s unlimited
chmod g+rw \${PBS_JOBNAME}.[eo]\${PBS_JOBID}
module use /data004/software/GIF/modules
module load ncbi-blast
module load parallel
parallel <<EOF
SUB
  cat ${file} >> blast_${num}.sub
  echo "EOF" >> blast_${num}.sub
  echo "qstat -f \"\$PBS_JOBID\"" >> blast_${num}.sub
done

--------------------------------------------------------------------------------
/JobP128.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Script to generate PBS sub files reading a command file
# 01/26/2015
# Andrew Severin

function printUsage () {
cat <<EOF
[...]

Description

This is a bash script that will create a sub file to subdivide a single file and parallelize a bash script
across 128 cpus to run on the subdivisions of the file.
The submission file is formatted to run on the Condo with 1 hour of walltime on the default queue.
The output will be named with the bash SCRIPT and the jobID.

-h, --help
	Brings up this help page
