├── .gitmodules ├── _config.yml ├── source ├── VERSION ├── htslib │ ├── config.h │ ├── .travis.yml │ ├── .gitignore │ ├── htslib │ │ ├── vcf_sweep.h │ │ ├── kfunc.h │ │ ├── tbx.h │ │ ├── hts_defs.h │ │ ├── knetfile.h │ │ └── kstdint.h │ ├── README.md │ ├── htslib_vars.mk │ ├── cram │ │ ├── md5.h │ │ ├── vlen.h │ │ ├── zfio.h │ │ ├── pooled_alloc.h │ │ ├── cram_stats.h │ │ ├── cram.h │ │ └── string_alloc.h │ ├── sam.5 │ └── vcf.5 ├── genomeGenerate.h ├── sortSuffixesBucket.h ├── outputSJ.h ├── systemFunctions.h ├── GlobalVariables.cpp ├── Transcript_variationOutput.cpp ├── sysRemoveDir.h ├── binarySearch2.h ├── ThreadControl.cpp ├── bam_cat.h ├── AlignVsTranscript.h ├── GlobalVariables.h ├── Test.hpp ├── mapThreadsSpawn.h ├── TimeFunctions.h ├── stringSubstituteAll.h ├── blocksOverlap.h ├── sjdbLoadFromStream.h ├── ReadAlign_outputVariation.cpp ├── readBarcodeLoad.h ├── samHeaders.h ├── genomeParametersWrite.h ├── sjdbLoadFromFiles.h ├── funPrimaryAlignMark.h ├── extendAlign.h ├── sjAlignSplit.h ├── sjdbPrepare.h ├── genomeScanFastaFiles.h ├── bamRemoveDuplicates.h ├── sjdbInsertJunctions.h ├── bamSortByCoordinate.h ├── twoPassRunPass1.h ├── funCompareUintAndSuffixes.h ├── sjdbBuildIndex.h ├── SjdbClass.h ├── stitchAlignToTranscript.h ├── signalFromBAM.h ├── funCompareUintAndSuffixesMemcmp.h ├── ErrorWarning.h ├── BAMbinSortUnmapped.h ├── insertSeqSA.h ├── BAMbinSortByCoordinate.h ├── ChimericDetection.cpp ├── readLoad.h ├── Parameters_closeReadsFiles.cpp ├── stringSubstituteAll.cpp ├── soloInputFeatureUMI.h ├── SoloRead_record.cpp ├── genomeSAindex.h ├── ChimericTranscript.h ├── SoloRead.h ├── stitchWindowAligns.h ├── InOutStreams.h ├── sjAlignSplit.cpp ├── ChimericSegment.h ├── SpliceGraph_swTraceBack.cpp ├── SoloRead.cpp ├── TimeFunctions.cpp ├── Solo.h ├── SoloFeatureTypes.h ├── systemFunctions.cpp ├── Quantifications.h ├── ParametersClip.h ├── sysRemoveDir.cpp ├── ThreadControl.h ├── Chain.h ├── ClipMate_initialize.cpp ├── ChimericDetection.h ├── 
SoloFeature_addBAMtags.cpp ├── streamFuns.h ├── PackedArray.h ├── ClipMate.h ├── ClipCR4.h ├── ReadAlign_mappedFilter.cpp ├── SuffixArrayFuns.h ├── sjdbLoadFromStream.cpp ├── SoloReadBarcodeStats.h ├── GTF.h ├── binarySearch2.cpp ├── ParametersChimeric.h ├── SoloFilteredCells.h ├── opal │ └── LICENSE ├── InOutStreams.cpp ├── SpliceGraph.cpp ├── ReadAlign_chimericDetectionPEmerged.cpp ├── sjdbLoadFromFiles.cpp ├── Quantifications.cpp ├── SoloBarcode_extractBarcode.cpp ├── PackedArray.cpp ├── ReadAlign_mapOneReadSpliceGraph.cpp ├── funCompareUintAndSuffixesMemcmp.cpp ├── BAMoutput.h ├── SuperTranscriptome.h ├── Genome_insertSequences.cpp ├── SoloBarcode.h ├── ErrorWarning.cpp ├── ChimericAlign.h ├── blocksOverlap.cpp ├── SoloReadBarcode.cpp ├── funCompareUintAndSuffixes.cpp ├── soloInputFeatureUMI.cpp ├── ChimericAlign.cpp ├── SoloReadFeature.h ├── Variation.h ├── SequenceFuns.h ├── SoloReadBarcode.h ├── Stats.h ├── ReadAlignChunk.h ├── SoloReadFeatureStats.h ├── ParametersGenome.h ├── Transcript.cpp ├── OutSJ.h ├── mapThreadsSpawn.cpp ├── ReadAnnotations.h ├── ChimericSegment.cpp ├── ChimericAlign_chimericJunctionOutput.cpp ├── SpliceGraph.h ├── Parameters_readSAMheader.cpp ├── Transcriptome_geneFullAlignOverlap_ExonOverIntron.cpp ├── ReadAlign_chimericDetection.cpp ├── ClipMate_clipChunk.cpp ├── funPrimaryAlignMark.cpp ├── stitchGapIndel.cpp ├── SoloReadFeature.cpp ├── genomeParametersWrite.cpp ├── Transcript_alignScore.cpp ├── Genome_genomeOutLoad.cpp ├── SoloFeature.cpp ├── SoloCommon.h ├── ReadAlign_calcCIGAR.cpp ├── Transcriptome_geneFullAlignOverlap.cpp ├── Transcriptome.h ├── Transcript_variationAdjust.cpp └── ClipMate_clip.cpp ├── doc └── STARmanual.pdf ├── bin ├── Linux_x86_64 │ ├── STAR │ └── STARlong ├── MacOSX_x86_64 │ ├── STAR │ └── STARlong └── Linux_x86_64_static │ ├── STAR │ └── STARlong ├── .travis.yml ├── extras ├── scripts │ ├── extractSJfromGTF.sh │ ├── transfragsFromBedGraph.awk │ ├── soloUMIperCell.awk │ ├── sjCollapseSamples.awk │ ├── 
soloExtractFiltCells.awk │ ├── transcriptTypes.awk │ ├── mergeLogFinal.awk │ ├── filterCirc.awk │ ├── calcInsertCoverage.awk │ ├── tagXSstrandedData.awk │ ├── sjMotif.m │ ├── sjFromSAMcollapseUandM.awk │ ├── soloCompareMtx.awk │ ├── sjFromSAMcollapseUandM_inclOverlaps.awk │ ├── sjBED12.awk │ ├── mergeGeneCounts.awk │ ├── soloCountMatrixFromBAM.awk │ ├── mergeSuperContig.awk │ └── soloBasicCellFilter.awk ├── docker │ └── Dockerfile ├── parameters │ └── ENCODE.txt ├── tests │ └── scripts │ │ ├── checkCellReadsStats_vsMatrix.awk │ │ ├── checkCellReadsStats_vsBAM.awk │ │ └── checkCellReadsStats.awk └── doc-latex │ └── convertParDefToLatexTable.awk ├── .gitignore ├── docs └── STARconsensus.md └── LICENSE /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /source/VERSION: -------------------------------------------------------------------------------- 1 | #define STAR_VERSION "2.7.11b" 2 | -------------------------------------------------------------------------------- /doc/STARmanual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexdobin/STAR/HEAD/doc/STARmanual.pdf -------------------------------------------------------------------------------- /bin/Linux_x86_64/STAR: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexdobin/STAR/HEAD/bin/Linux_x86_64/STAR -------------------------------------------------------------------------------- /source/htslib/config.h: -------------------------------------------------------------------------------- 1 | #define _USE_KNETFILE 2 | #define BGZF_CACHE 3 | #define 
BGZF_MT 4 | -------------------------------------------------------------------------------- /bin/MacOSX_x86_64/STAR: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexdobin/STAR/HEAD/bin/MacOSX_x86_64/STAR -------------------------------------------------------------------------------- /bin/Linux_x86_64/STARlong: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexdobin/STAR/HEAD/bin/Linux_x86_64/STARlong -------------------------------------------------------------------------------- /bin/MacOSX_x86_64/STARlong: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexdobin/STAR/HEAD/bin/MacOSX_x86_64/STARlong -------------------------------------------------------------------------------- /source/genomeGenerate.h: -------------------------------------------------------------------------------- 1 | #include "Parameters.h" 2 | 3 | void genomeGenerate(Parameters &P); 4 | 5 | -------------------------------------------------------------------------------- /bin/Linux_x86_64_static/STAR: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexdobin/STAR/HEAD/bin/Linux_x86_64_static/STAR -------------------------------------------------------------------------------- /bin/Linux_x86_64_static/STARlong: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexdobin/STAR/HEAD/bin/Linux_x86_64_static/STARlong -------------------------------------------------------------------------------- /source/sortSuffixesBucket.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void sortSuffixesBucket(char *G, void *ind, int indN, int indSkip); 
-------------------------------------------------------------------------------- /source/outputSJ.h: -------------------------------------------------------------------------------- 1 | #ifndef OUTPUTSJ_DEF 2 | #define OUTPUTSJ_DEF 3 | void outputSJ(ReadAlignChunk** RAchunk, Parameters& P); 4 | #endif 5 | -------------------------------------------------------------------------------- /source/systemFunctions.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_systemFunctions 2 | #include 3 | 4 | std::string linuxProcMemory(); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /source/GlobalVariables.cpp: -------------------------------------------------------------------------------- 1 | #include "GlobalVariables.h" 2 | Stats g_statsAll;//global mapping statistics 3 | ThreadControl g_threadChunks; 4 | 5 | -------------------------------------------------------------------------------- /source/Transcript_variationOutput.cpp: -------------------------------------------------------------------------------- 1 | #include "Transcript.h" 2 | 3 | void Transcript::variationOutput(Variation &Var) 4 | { 5 | // 6 | }; 7 | -------------------------------------------------------------------------------- /source/sysRemoveDir.h: -------------------------------------------------------------------------------- 1 | #ifndef DEF_sysRemoveDir 2 | #define DEF_sysRemoveDir 3 | 4 | #include 5 | 6 | void sysRemoveDir(std::string dirName); 7 | 8 | #endif -------------------------------------------------------------------------------- /source/binarySearch2.h: -------------------------------------------------------------------------------- 1 | #ifndef HDEF_binarySearch2 2 | #define HDEF_binarySearch2 3 | 4 | #include "IncludeDefine.h" 5 | int binarySearch2(uint x, uint y, uint *X, uint *Y, int N); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- 
/source/htslib/.travis.yml: -------------------------------------------------------------------------------- 1 | # Control file for continuous integration testing at http://travis-ci.org/ 2 | 3 | language: c 4 | compiler: 5 | - clang 6 | - gcc 7 | 8 | script: make -e && make test 9 | -------------------------------------------------------------------------------- /source/ThreadControl.cpp: -------------------------------------------------------------------------------- 1 | #include "ThreadControl.h" 2 | 3 | ThreadControl::ThreadControl() { 4 | chunkInN=0; 5 | chunkOutN=0; 6 | // chunkOutBAMposition=new uint [MAX_chunkOutBAMposition]; 7 | }; -------------------------------------------------------------------------------- /source/bam_cat.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_bam_cat 2 | #define CODE_bam_cat 3 | 4 | #include "htslib/htslib/sam.h" 5 | 6 | int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /source/AlignVsTranscript.h: -------------------------------------------------------------------------------- 1 | #ifndef H_AlignVsTranscript 2 | #define H_AlignVsTranscript 3 | 4 | namespace AlignVsTranscript 5 | { 6 | enum {Intron=0, ExonIntron=1, ExonIntronSpan=2, Concordant=3, N=4}; 7 | }; 8 | 9 | #endif -------------------------------------------------------------------------------- /source/GlobalVariables.h: -------------------------------------------------------------------------------- 1 | #ifndef GLOBAL_VARIABLES_DEF 2 | #define GLOBAL_VARIABLES_DEF 3 | 4 | #include "ThreadControl.h" 5 | extern Stats g_statsAll; 6 | extern ThreadControl g_threadChunks; 7 | 8 | #endif 9 | 10 | -------------------------------------------------------------------------------- /source/Test.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // 
Test.hpp 3 | // 4 | // 5 | // Created by Fahimeh Mirhaj on 6/18/19. 6 | // 7 | 8 | #ifndef Test_hpp 9 | #define Test_hpp 10 | 11 | #include 12 | 13 | #endif /* Test_hpp */ 14 | -------------------------------------------------------------------------------- /source/mapThreadsSpawn.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_mapThreadsSpawn 2 | #define CODE_mapThreadsSpawn 3 | #include "Parameters.h" 4 | #include "ReadAlignChunk.h" 5 | void mapThreadsSpawn (Parameters &P, ReadAlignChunk** RAchunk); 6 | 7 | #endif -------------------------------------------------------------------------------- /source/TimeFunctions.h: -------------------------------------------------------------------------------- 1 | #ifndef TIME_FUNCTIONS_DEF 2 | #define TIME_FUNCTIONS_DEF 3 | #include 4 | #include 5 | 6 | string timeMonthDayTime(); 7 | string timeMonthDayTime(time_t &rawTime); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /source/stringSubstituteAll.h: -------------------------------------------------------------------------------- 1 | #ifndef DEF_stringReplaceAll 2 | #define DEF_stringReplaceAll 3 | 4 | #include 5 | 6 | void stringSubstituteAll(std::string& str, const std::string& from, const std::string& to); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /source/blocksOverlap.h: -------------------------------------------------------------------------------- 1 | #ifndef BLOCKS_OVERLAP_DEF 2 | #define BLOCKS_OVERLAP_DEF 3 | 4 | #include "IncludeDefine.h" 5 | #include "Transcript.h" 6 | 7 | uint blocksOverlap(Transcript &t1, Transcript &t2); 8 | 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /source/sjdbLoadFromStream.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_sjdbLoadFromStream 2 | #define 
CODE_sjdbLoadFromStream 3 | 4 | #include 5 | #include "SjdbClass.h" 6 | void sjdbLoadFromStream(ifstream &sjdbStreamIn, SjdbClass &sjdbLoci); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /source/ReadAlign_outputVariation.cpp: -------------------------------------------------------------------------------- 1 | #include "ReadAlign.h" 2 | 3 | void ReadAlign::outputVariation(Variation &Var, Transcript Tr, uint iTr, uint nTr) 4 | { 5 | if (!Var.yes) 6 | { 7 | return; 8 | }; 9 | 10 | 11 | 12 | 13 | }; 14 | -------------------------------------------------------------------------------- /source/readBarcodeLoad.h: -------------------------------------------------------------------------------- 1 | #ifndef H_readBarcodeLoad 2 | #define H_readBarcodeLoad 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | 7 | void loadBarcodeRead(Parameters &P, istream **readInStream, string &seq1, string &qual1); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /source/samHeaders.h: -------------------------------------------------------------------------------- 1 | #ifndef H_samHeaders 2 | #define H_samHeaders 3 | 4 | #include "Parameters.h" 5 | #include "Genome.h" 6 | #include "Transcriptome.h" 7 | 8 | void samHeaders(Parameters &P, Genome &genomeMain, Transcriptome &transcriptomeMain); 9 | 10 | #endif -------------------------------------------------------------------------------- /source/genomeParametersWrite.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_genomeParametersWrite 2 | #define CODE_genomeParametersWrite 3 | 4 | #include "Parameters.h" 5 | #include "Genome.h" 6 | 7 | void genomeParametersWrite(string fileName, Parameters& P, string errorOut, Genome &mapGen); 8 | 9 | #endif -------------------------------------------------------------------------------- /source/sjdbLoadFromFiles.h: 
-------------------------------------------------------------------------------- 1 | #ifndef CODE_sjdbLoadFromFiles 2 | #define CODE_sjdbLoadFromFiles 3 | 4 | #include 5 | #include "SjdbClass.h" 6 | #include "Parameters.h" 7 | 8 | void sjdbLoadFromFiles(Parameters &P, SjdbClass &sjdbLoci); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /source/funPrimaryAlignMark.h: -------------------------------------------------------------------------------- 1 | #include "ReadAlign.h" 2 | //#include "Transcript.h" 3 | //#include "Parameters.h" 4 | 5 | void funPrimaryAlignMark(Transcript **trMult, uint64 nTr, Parameters &P, int maxScore, std::uniform_real_distribution rngUniformReal0to1, std::mt19937 rngMultOrder); -------------------------------------------------------------------------------- /source/extendAlign.h: -------------------------------------------------------------------------------- 1 | #include "IncludeDefine.h" 2 | #include "Parameters.h" 3 | #include "Transcript.h" 4 | 5 | bool extendAlign( char* R, char* G, uint rStart, uint gStart, int dR, int dG, uint L, uint Lprev, uint nMMprev, uint nMMmax, double pMMmax, bool extendToEnd, Transcript* trA ); 6 | 7 | -------------------------------------------------------------------------------- /source/sjAlignSplit.h: -------------------------------------------------------------------------------- 1 | #ifndef H_sjAlignSplit 2 | #define H_sjAlignSplit 3 | 4 | #include "IncludeDefine.h" 5 | #include "Genome.h" 6 | 7 | bool sjAlignSplit(uint a1,uint aLength, const Genome &mapGen, uint &a1D, uint &aLengthD, uint &a1A, uint &aLengthA, uint &isj); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /source/sjdbPrepare.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_sjdbPrepare 2 | #define CODE_sjdbPrepare 3 | 4 | #include "SjdbClass.h" 5 | #include "Parameters.h" 6 | 
#include "Genome.h" 7 | 8 | void sjdbPrepare (SjdbClass &sjdbLoci, Parameters &P, uint nGenomeReal, string outDir, Genome &mapGen, char *Gsj); 9 | 10 | #endif -------------------------------------------------------------------------------- /source/genomeScanFastaFiles.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_genomeScanFastaFiles 2 | #define CODE_genomeScanFastaFiles 3 | 4 | #include "Parameters.h" 5 | #include "IncludeDefine.h" 6 | #include "Genome.h" 7 | 8 | uint genomeScanFastaFiles (Parameters &P, char* G, bool flagRun, Genome &mapGen); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /source/bamRemoveDuplicates.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_bamRemoveDuplicates 2 | #define CODE_bamRemoveDuplicates 3 | #include 4 | #include "Parameters.h" 5 | 6 | using namespace std; 7 | 8 | void bamRemoveDuplicates(const string bamFileName, const string bamFileNameOut, Parameters &P); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /source/sjdbInsertJunctions.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_sjdbInsertJunctions 2 | #define CODE_sjdbInsertJunctions 3 | 4 | #include "Parameters.h" 5 | #include "Genome.h" 6 | #include "SjdbClass.h" 7 | 8 | void sjdbInsertJunctions(Parameters & P, Genome & mapGen, Genome & mapGen1, SjdbClass & sjdbLoci); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /source/htslib/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.pico 3 | /version.h 4 | 5 | lib*.a 6 | lib*.dylib 7 | lib*.so 8 | lib*.so.* 9 | 10 | /bgzip 11 | /tabix 12 | /test/fieldarith 13 | /test/hfile 14 | /test/sam 15 | /test/test-vcf-api 16 | /test/test-vcf-sweep 17 | /test/test_view 
18 | /test/*.tmp 19 | /test/*.tmp.* 20 | 21 | /TAGS 22 | -------------------------------------------------------------------------------- /source/bamSortByCoordinate.h: -------------------------------------------------------------------------------- 1 | #ifndef H_bamSortByCoordinate 2 | #define H_bamSortByCoordinate 3 | 4 | #include "Parameters.h" 5 | #include "ReadAlignChunk.h" 6 | #include "Genome.h" 7 | #include "Solo.h" 8 | 9 | void bamSortByCoordinate(Parameters &P, ReadAlignChunk **RAchunk, Genome &genome, Solo &solo) ; 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /source/twoPassRunPass1.h: -------------------------------------------------------------------------------- 1 | #ifndef H_twoPassRunPass1 2 | #define H_twoPassRunPass1 3 | 4 | #include "Parameters.h" 5 | #include "Genome.h" 6 | #include "Transcriptome.h" 7 | #include "SjdbClass.h" 8 | 9 | void twoPassRunPass1(Parameters &P, Genome &genomeMain, Transcriptome *transcriptomeMain, SjdbClass &sjdbLoci); 10 | 11 | #endif -------------------------------------------------------------------------------- /source/funCompareUintAndSuffixes.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_funCompareUintAndSuffixes 2 | #define CODE_funCompareUintAndSuffixes 3 | 4 | #include 5 | 6 | extern char* g_funCompareUintAndSuffixes_G; 7 | extern uint64_t g_funCompareUintAndSuffixes_L; 8 | 9 | int funCompareUintAndSuffixes ( const void *a, const void *b); 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /source/sjdbBuildIndex.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_sjdbBuildIndex 2 | #define CODE_sjdbBuildIndex 3 | 4 | #include "Parameters.h" 5 | #include "PackedArray.h" 6 | #include "Genome.h" 7 | 8 | void sjdbBuildIndex (Parameters &P, char *Gsj, char *G, PackedArray &SA, PackedArray &SA2, PackedArray 
&SAi, Genome &mapGen, Genome &mapGen1); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /source/SjdbClass.h: -------------------------------------------------------------------------------- 1 | #ifndef DEF_SjdbClass 2 | #define DEF_SjdbClass 3 | 4 | #include "IncludeDefine.h" 5 | #include 6 | 7 | class SjdbClass { 8 | public: 9 | vector chr; 10 | vector start,end; 11 | vector str; 12 | vector priority; 13 | 14 | vector> gene; 15 | }; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /source/stitchAlignToTranscript.h: -------------------------------------------------------------------------------- 1 | #include "IncludeDefine.h" 2 | #include "Parameters.h" 3 | #include "Transcript.h" 4 | #include "Genome.h" 5 | 6 | intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBstart, uint L, uint iFragB, uint sjAB, Parameters& P, char* R, Genome &mapGen, Transcript *trA, uint outFilterMismatchNmaxTotal); 7 | 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | compiler: g++ 3 | 4 | before_install: 5 | - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test 6 | - sudo apt-get update -qq 7 | 8 | install: 9 | - sudo apt-get install -qq g++-4.9 10 | - export CXX="g++-4.9" 11 | 12 | git: 13 | submodules: false 14 | before_script: 15 | - cd source 16 | script: 17 | - make STAR 18 | -------------------------------------------------------------------------------- /source/signalFromBAM.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_signalFromBAM 2 | #define CODE_signalFromBAM 3 | #include "htslib/htslib/sam.h" 4 | #include 5 | #include 6 | #include "Stats.h" 7 | #include "Parameters.h" 8 | 9 | using namespace std; 10 | 11 | void signalFromBAM(const string 
bamFileName, const string sigFileName, Parameters P); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /source/funCompareUintAndSuffixesMemcmp.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_funCompareUintAndSuffixesMemcmp 2 | #define CODE_funCompareUintAndSuffixesMemcmp 3 | 4 | #include 5 | 6 | extern char* g_funCompareUintAndSuffixesMemcmp_G; 7 | extern uint64_t g_funCompareUintAndSuffixesMemcmp_L; 8 | int funCompareUintAndSuffixesMemcmp ( const void *a, const void *b); 9 | 10 | #endif -------------------------------------------------------------------------------- /extras/scripts/extractSJfromGTF.sh: -------------------------------------------------------------------------------- 1 | # usage: 2 | # extractSJfromGTF.sh in.gtf > out.sj 3 | # 4 | # assumes transcript_id in the 12th field of GTF 5 | # 6 | awk '$3=="exon" {print $12,$1,$4,$5,$7}' $1 |\ 7 | sort -k1,1V -k2,2V -k3,3n |\ 8 | awk 'BEGIN {OFS="\t"} {if (t==$1) {print $2,e1+1,$3-1,$5}; e1=$4;t=$1 }' |\ 9 | sort -k1,1V -k2,2n -k3,3n -k4,4 | uniq 10 | 11 | -------------------------------------------------------------------------------- /source/ErrorWarning.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_ErrorWarning 2 | #define CODE_ErrorWarning 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | 7 | void exitWithError(string messageOut, ostream &streamOut1, ostream &streamOut2, int errorInt, Parameters &P); 8 | void warningMessage(string messageOut, ostream &streamOut1, ostream &streamOut2, Parameters &P); 9 | #endif 10 | -------------------------------------------------------------------------------- /source/BAMbinSortUnmapped.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_BAMbinSortUnmapped 2 | #define CODE_BAMbinSortUnmapped 3 | #include "IncludeDefine.h" 4 | #include 
"Parameters.h" 5 | #include "Genome.h" 6 | #include "Solo.h" 7 | 8 | #include SAMTOOLS_BGZF_H 9 | 10 | void BAMbinSortUnmapped(uint32 iBin, uint nThreads, string dirBAMsort, Parameters &P, Genome &genome, Solo &solo); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /source/insertSeqSA.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_insertSeqSA 2 | #define CODE_insertSeqSA 3 | 4 | #include "IncludeDefine.h" 5 | #include "PackedArray.h" 6 | #include "Parameters.h" 7 | #include "Genome.h" 8 | 9 | uint insertSeqSA(PackedArray & SA, PackedArray & SA1, PackedArray & SAi, char * G, char * G1, uint64 nG, uint64 nG1, uint64 nG2, Parameters & P, Genome &mapGen); 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /source/BAMbinSortByCoordinate.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_BAMbinSortByCoordinate 2 | #define CODE_BAMbinSortByCoordinate 3 | #include "IncludeDefine.h" 4 | #include "Parameters.h" 5 | #include "Genome.h" 6 | #include "Solo.h" 7 | 8 | #include SAMTOOLS_BGZF_H 9 | 10 | void BAMbinSortByCoordinate(uint32 iBin, uint binN, uint binS, uint nThreads, string dirBAMsort, Parameters &P, Genome &mapGen, Solo &solo); 11 | 12 | #endif -------------------------------------------------------------------------------- /source/ChimericDetection.cpp: -------------------------------------------------------------------------------- 1 | #include "ChimericDetection.h" 2 | 3 | ChimericDetection::ChimericDetection(Parameters &Pin, Transcript ***trAll, uint *nWinTr, char** Read1in, Genome &mapGenIn, fstream *ostreamChimJunctionIn, ReadAlign *RAin) 4 | : P(Pin), RA(RAin), trAll(trAll), nWinTr(nWinTr), Read1(Read1in), outGen(mapGenIn), ostreamChimJunction(ostreamChimJunctionIn) 5 | { 6 | }; 7 | 
-------------------------------------------------------------------------------- /source/htslib/htslib/vcf_sweep.h: -------------------------------------------------------------------------------- 1 | #ifndef __VCF_SWEEP_H__ 2 | #define __VCF_SWEEP_H__ 3 | 4 | #include "hts.h" 5 | #include "vcf.h" 6 | 7 | typedef struct _bcf_sweep_t bcf_sweep_t; 8 | 9 | bcf_sweep_t *bcf_sweep_init(const char *fname); 10 | void bcf_sweep_destroy(bcf_sweep_t *sw); 11 | bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw); 12 | bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw); 13 | bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /source/readLoad.h: -------------------------------------------------------------------------------- 1 | #ifndef H_readLoad 2 | #define H_readLoad 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | #include "SequenceFuns.h" 7 | 8 | int readLoad(istream& readInStream, Parameters& P, uint& Lread, uint& LreadOriginal, \ 9 | char* readName, char* Seq, char* SeqNum, char* Qual, vector &clipOneMate, \ 10 | uint &iReadAll, uint32 &readFilesIndex, char &readFilter, string &readNameExtra); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /source/Parameters_closeReadsFiles.cpp: -------------------------------------------------------------------------------- 1 | #include "Parameters.h" 2 | #include "ErrorWarning.h" 3 | #include 4 | #include 5 | void Parameters::closeReadsFiles() { 6 | for (uint imate=0; imatereadIn[imate].is_open() ) 8 | inOut->readIn[imate].close(); 9 | if (readFilesCommandPID[imate]>0) 10 | kill(readFilesCommandPID[imate],SIGKILL); 11 | }; 12 | }; -------------------------------------------------------------------------------- /source/stringSubstituteAll.cpp: -------------------------------------------------------------------------------- 1 | #include "stringSubstituteAll.h" 2 | 3 | void 
stringSubstituteAll(std::string& str, const std::string& from, const std::string& to) { 4 | if(from.empty()) return; 5 | size_t start_pos = 0; 6 | while((start_pos = str.find(from, start_pos)) != std::string::npos) { 7 | str.replace(start_pos, from.length(), to); 8 | start_pos += to.length(); // In case 'to' contains 'from', like replacing 'x' with 'yx' 9 | }; 10 | }; 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | Depend.list 3 | 4 | .project 5 | .kdev4/ 6 | source.kdev4 7 | 8 | 9 | # Don't track intermediary files from building the manual 10 | extras/doc-latex/*.aux 11 | extras/doc-latex/*.fdb_latexmk 12 | extras/doc-latex/*.fls 13 | extras/doc-latex/*.log 14 | extras/doc-latex/*.out 15 | extras/doc-latex/*.gz 16 | extras/doc-latex/*.toc 17 | 18 | # Don't track the STAR binary once it has being built 19 | source/STAR 20 | .DS_Store 21 | *.xcworkspacedata 22 | *.pbxproj 23 | *.plist 24 | *.xcuserstate 25 | -------------------------------------------------------------------------------- /source/soloInputFeatureUMI.h: -------------------------------------------------------------------------------- 1 | #ifndef H_soloInputFeatureUMI 2 | #define H_soloInputFeatureUMI 3 | 4 | #include 5 | #include 6 | #include 7 | #include "IncludeDefine.h" 8 | #include "SoloCommon.h" 9 | 10 | bool soloInputFeatureUMI(fstream *strIn, int32 featureType, bool readInfoYes, array,2> &sjAll, uint64 &iread, 11 | int32 &cbmatch, uint32 &feature, uint64 &umi, vector &featVecU32, SoloReadFlagClass &readFlagCounts); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /source/SoloRead_record.cpp: -------------------------------------------------------------------------------- 1 | #include "SoloRead.h" 2 | 3 | void SoloRead::record(uint64 nTr, Transcript **alignOut, uint64 iRead, ReadAnnotations &readAnnot) 
4 | { 5 | if (pSolo.type==pSolo.SoloTypes::None) 6 | return; 7 | if (pSolo.type==pSolo.SoloTypes::CB_samTagOut) 8 | return; 9 | 10 | if (pSolo.readStats.yes) 11 | readFlagReset(); 12 | 13 | for (uint32 ii=0; iirecord(*readBar, nTr, alignOut, iRead, readAnnot); 15 | }; 16 | -------------------------------------------------------------------------------- /source/genomeSAindex.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_genomeSAindex 2 | #define CODE_genomeSAindex 3 | #include "PackedArray.h" 4 | #include "Parameters.h" 5 | #include "Genome.h" 6 | 7 | void genomeSAindex(char * G, PackedArray & SA, Parameters & P, PackedArray & SAip, Genome &mapGen); 8 | void genomeSAindexChunk(char * G, PackedArray & SA, Parameters & P, PackedArray & SAi, uint iSA1, uint iSA2, Genome &mapGen); 9 | void funSAiFindNextIndex(char *G, PackedArray &SA, uint isaStep, uint & isa, uint & indFull, int & iL4, Genome &mapGen); 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /extras/scripts/transfragsFromBedGraph.awk: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | OFS="\t"; 3 | getline; 4 | start1=$2; 5 | end1=$3; 6 | chr1=$1; 7 | s=$4; 8 | } 9 | 10 | { 11 | if ($2!=end1 && start1>0) { 12 | printf "%s\t%i\t%i\t%20.5f\n", chr1,start1,end1,s; 13 | chr1=$1; 14 | start1=$2; 15 | end1=$3; 16 | s=$4 17 | } else { 18 | s+=$4; 19 | end1=$3; 20 | #print end1,start1,$2 21 | }; 22 | } 23 | 24 | END { 25 | printf "%s\t%i\t%i\t%20.5f\n", chr1,start1,end1,s; 26 | } 27 | -------------------------------------------------------------------------------- /source/ChimericTranscript.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_ChimericTranscript 2 | #define CODE_ChimericTranscript 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | #include "Transcript.h" 7 | 8 | class ChimericTranscript 
9 | {// 10 | public: 11 | Transcript **chTrs; //all chimeric transcripts 12 | uint nCh; //number of recorded (best) chimeric transcripts 13 | uint nChSize; //size of the chTrs array, will be increased if nCh > nChSize 14 | 15 | ChimericTranscript(Parameters &Pin); //allocate 16 | private: 17 | }; 18 | 19 | #endif -------------------------------------------------------------------------------- /source/SoloRead.h: -------------------------------------------------------------------------------- 1 | #ifndef H_SoloRead 2 | #define H_SoloRead 3 | 4 | #include "SoloReadBarcode.h" 5 | #include "SoloReadFeature.h" 6 | #include "ReadAnnotations.h" 7 | 8 | class SoloRead { 9 | public: 10 | SoloReadBarcode *readBar; 11 | SoloReadFeature **readFeat; 12 | 13 | SoloRead(Parameters &Pin, int32 iChunkIn); 14 | void readFlagReset(); 15 | void record(uint64 nTr, Transcript **alignOut, uint64 iRead, ReadAnnotations &readAnnot); 16 | 17 | private: 18 | const int32 iChunk; 19 | Parameters &P; 20 | ParametersSolo &pSolo; 21 | }; 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /source/stitchWindowAligns.h: -------------------------------------------------------------------------------- 1 | #include "IncludeDefine.h" 2 | #include "Parameters.h" 3 | #include "Transcript.h" 4 | #include "extendAlign.h" 5 | #include "stitchAlignToTranscript.h" 6 | #include "ReadAlign.h" 7 | 8 | void stitchWindowAligns(uint iA, uint nA, int Score, bool WAincl[], uint tR2, uint tG2, Transcript trA, \ 9 | uint Lread, uiWA* WA, char* R, Genome &mapGen, \ 10 | Parameters& P, Transcript** wTr, uint* nWinTr, ReadAlign *RA); 11 | //recursively stitch aligns for one gene 12 | //*nWinTr - number of transcripts for the current window 13 | -------------------------------------------------------------------------------- /source/InOutStreams.h: -------------------------------------------------------------------------------- 1 | #ifndef INOUTSTREAMS_DEF 2 | #define 
INOUTSTREAMS_DEF 3 | 4 | #include "IncludeDefine.h" 5 | #include SAMTOOLS_BGZF_H 6 | 7 | class InOutStreams { 8 | public: 9 | ostream *logStdOut, *outSAM; 10 | ofstream logStdOutFile, outSAMfile; 11 | BGZF *outBAMfileUnsorted, *outBAMfileCoord, *outQuantBAMfile; 12 | 13 | ofstream outChimSAM, outChimJunction, logMain, logProgress, logFinal, outUnmappedReadsStream[MAX_N_MATES]; 14 | ifstream readIn[MAX_N_MATES]; 15 | 16 | //compilation-optional streams 17 | ofstream outLocalChains; 18 | 19 | InOutStreams(); 20 | ~InOutStreams(); 21 | }; 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /source/sjAlignSplit.cpp: -------------------------------------------------------------------------------- 1 | #include "sjAlignSplit.h" 2 | 3 | bool sjAlignSplit(uint a1,uint aLength, const Genome &mapGen, uint &a1D, uint &aLengthD, uint &a1A, uint &aLengthA, uint &isj) { 4 | uint sj1=(a1-mapGen.sjGstart)%mapGen.sjdbLength; 5 | if (sj1mapGen.sjdbOverhang) {//align crosses the junctions 6 | isj=(a1-mapGen.sjGstart)/mapGen.sjdbLength; 7 | aLengthD=mapGen.sjdbOverhang-sj1; 8 | aLengthA=aLength-aLengthD; 9 | a1D=mapGen.sjDstart[isj]+sj1; 10 | a1A=mapGen.sjAstart[isj]; 11 | return true; 12 | } else { 13 | return false; 14 | }; 15 | }; 16 | -------------------------------------------------------------------------------- /source/ChimericSegment.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_ChimericSegment 2 | #define CODE_ChimericSegment 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | #include "Transcript.h" 7 | #include "ParametersChimeric.h" 8 | 9 | class ChimericSegment 10 | {// 11 | public: 12 | Parameters &P; 13 | ParametersChimeric &pCh; 14 | 15 | Transcript &align; //alignment 16 | uint roS,roE,str; //start/end/strand in original read coordinates 17 | 18 | ChimericSegment(Parameters &Pin, Transcript &alignIn); //allocate 19 | bool segmentCheck();//check if 
chimeric segment is good 20 | private: 21 | }; 22 | 23 | #endif -------------------------------------------------------------------------------- /extras/scripts/soloUMIperCell.awk: -------------------------------------------------------------------------------- 1 | # usage: awk -f soloUMIperCell.awk raw/matrix.mtx raw/barcodes.tsv filtered/barcodes.tsv | sort -k1,1rn > UMIperCell.txt 2 | # output: column1 = total UMIs per cell 3 | # column2 = 1 for cell that passed filtering, 0 otherwise 4 | 5 | BEGIN { 6 | OFS="\t"; 7 | } 8 | 9 | { 10 | if (ARGIND==1) { 11 | if (FNR<4) 12 | next; #skip header 13 | 14 | umiCount[$2]+=$3; 15 | 16 | } else if (ARGIND==2) { 17 | rawCB[$1]=FNR; 18 | } else if (ARGIND==3) { 19 | filtCB[rawCB[$1]]=FNR; 20 | } 21 | 22 | } 23 | 24 | END { 25 | for (ii in umiCount) 26 | print umiCount[ii], (ii in filtCB); 27 | } 28 | -------------------------------------------------------------------------------- /source/SpliceGraph_swTraceBack.cpp: -------------------------------------------------------------------------------- 1 | //#include "SpliceGraph.h" 2 | 3 | //void SpliceGraph::swTraceBack(array &alignEnds, array &alignStarts) 4 | //{ 5 | 6 | // uint32 row = alignEnds[0]; 7 | // uint32 col = alignEnds[1]; 8 | // uint32 rowT = 0; 9 | // uint32 colT = 0; 10 | // while(col > 0 && row > 0 && scoringMatrix[col][row] != 0) { 11 | // rowT = directionMatrix[col][row].second; 12 | // colT = directionMatrix[col][row].first; 13 | // 14 | // row = rowT; 15 | // col = colT; 16 | // }; 17 | // alignStarts[0]=row; 18 | // alignStarts[1]=col; 19 | //}; 20 | -------------------------------------------------------------------------------- /source/SoloRead.cpp: -------------------------------------------------------------------------------- 1 | #include "SoloRead.h" 2 | 3 | SoloRead::SoloRead(Parameters &Pin, int32 iChunkIn) : iChunk(iChunkIn), P(Pin), pSolo(P.pSolo) 4 | { 5 | readBar = new SoloReadBarcode(P); 6 | 7 | if (pSolo.type==0) 8 | return; 9 | if
(pSolo.type==pSolo.SoloTypes::CB_samTagOut) 10 | return; 11 | 12 | readFeat = new SoloReadFeature*[pSolo.nFeatures]; 13 | 14 | for (uint32 ii=0; iireadFlag.flag = 0; 22 | }; -------------------------------------------------------------------------------- /source/TimeFunctions.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | std::string timeMonthDayTime() { 5 | time_t rawTime; 6 | char timeChar[100]; 7 | time(&rawTime); 8 | strftime(timeChar,80,"%b %d %H:%M:%SS",localtime(&rawTime)); 9 | std::string timeString=timeChar; 10 | timeString.erase(timeString.end()-1,timeString.end()); 11 | return timeString; 12 | }; 13 | 14 | std::string timeMonthDayTime(time_t &rawTime) { 15 | char timeChar[100]; 16 | strftime(timeChar,80,"%b %d %H:%M:%SS",localtime(&rawTime)); 17 | std::string timeString=timeChar; 18 | timeString.erase(timeString.end()-1,timeString.end()); 19 | return timeString; 20 | }; 21 | -------------------------------------------------------------------------------- /extras/scripts/sjCollapseSamples.awk: -------------------------------------------------------------------------------- 1 | # usage: 2 | # awk -f sjCollapseSamples.awk /path/to/all/*/SJ.out.tab | sort -k1,1V -k2,2n -k3,3n > SJ.all 3 | # output columns: 4 | # 1-6 - same as in SJ.out.tab 5 | # 7 - total number of unique mappers 6 | # 8 - total number of multi-mappers 7 | # 9 - max overhang 8 | # 10 - number of samples the junction was detected in 9 | 10 | BEGIN { 11 | OFS="\t"; 12 | } 13 | 14 | { 15 | sj=$1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6; 16 | nSamples[sj]++; 17 | nU[sj]+=$7; 18 | nM[sj]+=$8; 19 | if (nO[sj]<$9) nO[sj]=$9; 20 | }; 21 | 22 | END { 23 | for (sj in nSamples) print sj,nU[sj],nM[sj],nO[sj],nSamples[sj]; 24 | } 25 | 26 | -------------------------------------------------------------------------------- /source/Solo.h: -------------------------------------------------------------------------------- 1 | #ifndef H_Solo 
2 | #define H_Solo 3 | #include "IncludeDefine.h" 4 | #include "ReadAlignChunk.h" 5 | #include "Transcriptome.h" 6 | #include 7 | 8 | #include "SoloFeature.h" 9 | 10 | 11 | class Solo { 12 | private: 13 | ReadAlignChunk **RAchunk; 14 | Parameters &P; 15 | Transcriptome &Trans; 16 | 17 | public: 18 | ParametersSolo &pSolo; 19 | SoloFeature **soloFeat; 20 | 21 | SoloReadBarcode *readBarSum; 22 | 23 | Solo(ReadAlignChunk **RAchunk, Parameters &Pin, Transcriptome &inTrans); 24 | 25 | Solo(Parameters &Pin, Transcriptome &inTrans);//for soloCellFiltering 26 | 27 | void processAndOutput(); 28 | }; 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /source/SoloFeatureTypes.h: -------------------------------------------------------------------------------- 1 | #ifndef H_SoloFeatureTypes 2 | #define H_SoloFeatureTypes 3 | 4 | namespace SoloFeatureTypes 5 | { 6 | // enum {Gene=0, GeneFull=1, SJ=2, Transcript3p=3, VelocytoSpliced=4, VelocytoUnspliced=5, VelocytoAmbiguous=6, N=7}; 7 | // const static vector Names={"Gene","GeneFull","SJ","Transcript3p","VelocytoSpliced","VelocytoUnspliced","VelocytoAmbiguous",}; 8 | enum {SJ=0, Transcript3p=1, GeneFull=2, GeneFull_ExonOverIntron=3, GeneFull_Ex50pAS=4, Gene=5, VelocytoSimple=6, Velocyto=7, N=8}; 9 | const static vector Names={"SJ", "Transcript3p", "GeneFull", "GeneFull_ExonOverIntron", "GeneFull_Ex50pAS", "Gene", "VelocytoSimple", "Velocyto"}; 10 | }; 11 | 12 | #endif -------------------------------------------------------------------------------- /source/systemFunctions.cpp: -------------------------------------------------------------------------------- 1 | // system functions 2 | #include 3 | #include 4 | #include 5 | 6 | std::string linuxProcMemory() 7 | { 8 | std::ifstream t("/proc/self/status"); 9 | std::stringstream buffer; 10 | buffer << t.rdbuf(); 11 | 12 | 13 | std::string outString; 14 | while (buffer.good()) { 15 | std::string str1; 16 | std::getline(buffer,str1); 17 
| if ( (str1.rfind("VmPeak",0) == 0) || 18 | (str1.rfind("VmSize",0) == 0) || 19 | (str1.rfind("VmHWM",0) == 0) || 20 | (str1.rfind("VmRSS",0) == 0) ) { 21 | outString += str1+"; "; 22 | }; 23 | }; 24 | outString += '\n'; 25 | 26 | return outString; 27 | }; -------------------------------------------------------------------------------- /source/Quantifications.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_Quantifications 2 | #define CODE_Quantifications 3 | #include "IncludeDefine.h" 4 | 5 | #define uintQ unsigned long 6 | 7 | class Quantifications { 8 | public: 9 | struct {//counting reads per gene, similar to HTseq 10 | uint32 nGe; //number of genes 11 | int nType; //number of count types (columns) 12 | uintQ cMulti; //count multimappers 13 | uintQ *cAmbig, *cNone;//ambiguous, no-feature 14 | uintQ **gCount; // array of read counts per gene for two strands 15 | } geneCounts; 16 | 17 | Quantifications (uint32 nGeIn); 18 | 19 | void addQuants(const Quantifications & quantsIn); //adds quantsIn to the quants 20 | }; 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /source/ParametersClip.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_ParametersClip 2 | #define CODE_ParametersClip 3 | 4 | #include "IncludeDefine.h" 5 | #include "ClipMate.h" 6 | #include "ClipCR4.h" 7 | #include 8 | #include 9 | 10 | class Parameters; 11 | 12 | class ReadClipInput 13 | { 14 | public: 15 | vector N; 16 | vector NafterAd; 17 | vector adSeq; 18 | vector adMMp; 19 | }; 20 | 21 | class ParametersClip 22 | {// 23 | public: 24 | //bool yes; //trimming is performed 25 | 26 | vector adapterType; 27 | 28 | array in; 29 | 30 | void initialize(Parameters *pPin); 31 | void initializeClipMates(vector> &clipMates); 32 | 33 | private: 34 | Parameters *pP; 35 | }; 36 | 37 | #endif 38 |
-------------------------------------------------------------------------------- /extras/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stable-slim 2 | 3 | MAINTAINER dobin@cshl.edu 4 | 5 | ARG STAR_VERSION=2.7.11b 6 | 7 | ENV PACKAGES gcc g++ make wget zlib1g-dev unzip 8 | 9 | RUN set -ex 10 | 11 | RUN apt-get update && \ 12 | apt-get install -y --no-install-recommends ${PACKAGES} && \ 13 | apt-get clean && \ 14 | g++ --version && \ 15 | cd /home && \ 16 | wget --no-check-certificate https://github.com/alexdobin/STAR/archive/${STAR_VERSION}.zip && \ 17 | unzip ${STAR_VERSION}.zip && \ 18 | cd STAR-${STAR_VERSION}/source && \ 19 | make STARstatic && \ 20 | mkdir /home/bin && \ 21 | cp STAR /home/bin && \ 22 | cd /home && \ 23 | 'rm' -rf STAR-${STAR_VERSION} && \ 24 | apt-get --purge autoremove -y ${PACKAGES} 25 | 26 | ENV PATH /home/bin:${PATH} 27 | 28 | -------------------------------------------------------------------------------- /source/sysRemoveDir.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | //#define _XOPEN_SOURCE 500 4 | #include 5 | #include 6 | 7 | int removeFileOrDir(const char *fpath,const struct stat *sb, int typeflag, struct FTW *ftwbuf) { 8 | 9 | {//to avoid unused variable warning 10 | (void) sb; 11 | (void) ftwbuf; 12 | }; 13 | 14 | if (typeflag==FTW_F) {//file 15 | remove(fpath); 16 | } else if (typeflag==FTW_DP) {//dir 17 | rmdir(fpath); 18 | } else {//something went wrong, stop the removal 19 | return -1; 20 | }; 21 | return 0; 22 | }; 23 | 24 | 25 | void sysRemoveDir(std::string dirName) {//remove directory and all its contents 26 | int nftwFlag=FTW_DEPTH; 27 | nftw(dirName.c_str(), removeFileOrDir, 100, nftwFlag); 28 | }; 29 | -------------------------------------------------------------------------------- /source/ThreadControl.h: 
-------------------------------------------------------------------------------- 1 | #ifndef THREAD_CONTROL_DEF 2 | #define THREAD_CONTROL_DEF 3 | 4 | #include "ReadAlignChunk.h" 5 | #include 6 | 7 | #define MAX_chunkOutBAMposition 100000 8 | 9 | class ThreadControl { 10 | public: 11 | bool threadBool; 12 | 13 | pthread_t *threadArray; 14 | pthread_mutex_t mutexInRead, mutexOutSAM, mutexOutBAM1, mutexOutChimSAM, mutexOutChimJunction, mutexOutUnmappedFastx, mutexOutFilterBySJout; 15 | pthread_mutex_t mutexStats, mutexLogMain, mutexBAMsortBins, mutexError; 16 | 17 | uint chunkInN,chunkOutN; 18 | 19 | ThreadControl(); 20 | 21 | static void* threadRAprocessChunks(void *RAchunk) { 22 | ( (ReadAlignChunk*) RAchunk )->processChunks(); 23 | pthread_exit(0); 24 | return NULL; 25 | }; 26 | }; 27 | 28 | #endif 29 | 30 | -------------------------------------------------------------------------------- /extras/scripts/soloExtractFiltCells.awk: -------------------------------------------------------------------------------- 1 | # usage: awk -f soloExtractFiltCells.awk barcodes.tsv barcodes_filtered.tsv matrix.mtx > matrix_filtered.mtx 2 | BEGIN { 3 | nSkip=1; # number of lines to skip after comments 4 | } 5 | 6 | { 7 | if (ARGIND==1) {# read barcodes 8 | CBind[$1]=NR; 9 | } else if (ARGIND==2) { 10 | n++; 11 | CB[CBind[$1]]=n; # new indexes 12 | } else if ($1~/^%/){ 13 | print; 14 | next; 15 | } else { 16 | if (nSkip>0) { 17 | nFeat=$1; 18 | nSkip--; 19 | next; 20 | }; 21 | if ($2 in CB) { 22 | nLines++; 23 | outAll = outAll sprintf($1 " " CB[$2] " " $3 "\n"); 24 | }; 25 | }; 26 | } 27 | 28 | END { 29 | print nFeat, length(CB), nLines; 30 | print outAll; 31 | }; 32 | -------------------------------------------------------------------------------- /source/Chain.h: -------------------------------------------------------------------------------- 1 | #ifndef DEF_Chain 2 | #define DEF_Chain 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | #include "ErrorWarning.h" 7 | 8
| class OneChain 9 | { 10 | public: 11 | uint bN; 12 | string chr1,chr2;//1/2 (old/new) chr names 13 | vector bStart1, bStart2, bLen; //blocks starts in 1/2, lengths 14 | }; 15 | 16 | class Chain { 17 | public: 18 | // // uint bN;//number of blocks 19 | // // vector bStart1, bStart2, bLen; //blocks starts in 1/2, lengths 20 | 21 | Chain(Parameters &Pin, string chainFileNameIn); 22 | void liftOverGTF(string gtfFileName, string outFileName); 23 | private: 24 | Parameters &P; 25 | string chainFileName; 26 | void chainLoad(); 27 | std::map chrChains; 28 | }; 29 | 30 | #endif -------------------------------------------------------------------------------- /extras/scripts/transcriptTypes.awk: -------------------------------------------------------------------------------- 1 | # requires "trTypes.txt" - a file with transcript types 2 | # e.g. for Gencode GTF 3 | # awk '$3=="transcript" {a=$0; gsub(/.*transcript_id "/,"",a);gsub(/".*/,"",a);;b=$0; gsub(/.*gene_type "/,"",b);gsub(/".*/,"",b); print a,b}' Gencode.gtf > trTypes.txt 4 | 5 | BEGIN { 6 | while (getline < "trTypes.txt") { 7 | tT[$1]=$2; 8 | }; 9 | OFS="\t"; 10 | rt[1]=0; #declare array 11 | delete rt; 12 | } 13 | 14 | { 15 | if ($1!=r) { 16 | if (length(rt)==1) {#only if read overlaps one trType 17 | for (tt in rt) { 18 | if (tt=="") print r; 19 | nT[tt]++; 20 | }; 21 | }; 22 | delete rt; 23 | r=$1; 24 | }; 25 | 26 | if ($3 in tT) { 27 | rt[tT[$3]]=1; 28 | }; 29 | }; 30 | 31 | END { 32 | for (tt in nT) { 33 | print tt, nT[tt]; 34 | }; 35 | }; 36 | 37 | -------------------------------------------------------------------------------- /extras/scripts/mergeLogFinal.awk: -------------------------------------------------------------------------------- 1 | # 2 | # merges Log.final.out files from multiple runs into one table 3 | # usage: 4 | # awk -f mergeLogFinal.awk /path/to/1st/Log.final.out /path/to/2nd/Log.final.out ... 5 | # e.g. 
6 | # awk -f mergeLogFinal.awk */Log.final.out 7 | # 8 | 9 | BEGIN { 10 | FS="|"; 11 | for (jj=1;jj<ARGC;jj++) #input files are ARGV[1]..ARGV[ARGC-1] 12 | { 13 | a=ARGV[jj]; 14 | gsub("/Log.final.out","",a); 15 | printf ";%s",a #header: one column per input file, named by its path 16 | }; 17 | printf "\n"; 18 | } 19 | { 20 | gsub(/^[ \t]+|[ \t]+$/,"",$1); 21 | gsub(/^[ \t]+|[ \t]+$/,"",$2); 22 | L[FNR]=$1; 23 | V[FNR,ARGIND]=$2 24 | } 25 | END { 26 | for (ii=1;ii<=length(L);ii++) 27 | { 28 | printf "%s",L[ii]; 29 | if (V[ii,1]!="") 30 | for (jj=1;jj<ARGC;jj++) 31 | printf ";%s",V[ii,jj]; #print via %s so a "%" in the value is not parsed as a format directive 32 | printf "\n" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /source/htslib/README.md: -------------------------------------------------------------------------------- 1 | HTSlib is an implementation of a unified C library for accessing common file 2 | formats, such as [SAM, CRAM and VCF][1], used for high-throughput sequencing 3 | data, and is the core library used by [samtools][2] and [bcftools][3]. 4 | HTSlib only depends on [zlib][4]. 5 | It is known to be compatible with gcc, g++ and clang. 6 | 7 | HTSlib implements a generalized BAM index, with file extension `.csi` 8 | (coordinate-sorted index). The HTSlib file reader first looks for the new index 9 | and then for the old if the new index is absent. 10 | 11 | This project also includes the popular tabix indexer, which indexes both `.tbi` 12 | and `.csi` formats, and the bgzip compression utility.
13 | 14 | [1]: http://samtools.github.io/hts-specs/ 15 | [2]: http://github.com/samtools/samtools 16 | [3]: http://samtools.github.io/bcftools/ 17 | [4]: http://zlib.net/ 18 | -------------------------------------------------------------------------------- /source/ClipMate_initialize.cpp: -------------------------------------------------------------------------------- 1 | #include "ClipMate.h" 2 | #include "Parameters.h" 3 | #include "SequenceFuns.h" 4 | 5 | void ClipMate::initialize(uint32 Nin, const string &adSeqIn, uint32 NafterAdin, double adMMpIn) 6 | { 7 | N=Nin; 8 | 9 | adSeq=adSeqIn; 10 | if (adSeq=="-") { 11 | adSeq=""; 12 | } else { 13 | if ( adSeq=="polyA") { 14 | adSeqNum.clear(); //it should be empty, but just in case... 15 | adSeqNum.resize(DEF_readSeqLengthMax, 0); //fill with A=0 16 | } else { 17 | adSeqNum.resize(adSeq.size(),0); 18 | convertNucleotidesToNumbers(adSeq.data(), adSeqNum.data(), adSeqNum.size()); 19 | }; 20 | }; 21 | 22 | if (N==0 && adSeq=="") 23 | type=-1; 24 | 25 | if (type==10) 26 | cr4 = new ClipCR4; 27 | 28 | NafterAd=NafterAdin; 29 | adMMp=adMMpIn; 30 | }; 31 | 32 | 33 | -------------------------------------------------------------------------------- /extras/parameters/ENCODE.txt: -------------------------------------------------------------------------------- 1 | --outFilterType BySJout //reduces the number of "spurious" junctions 2 | --outFilterMultimapNmax 20 //max number of multiple alignments allowed for a read: if exceeded, the read is considered unmapped 3 | --alignSJoverhangMin 8 //min overhang for unannotated junctions 4 | --alignSJDBoverhangMin 1 //min overhang for annotated junctions 5 | --outFilterMismatchNmax 999 //max number of mismatches per pair (absolute) 6 | --outFilterMismatchNoverLmax 0.06 //max number of mismatches per pair relative to read length: for 2x100b, max number of mismatches is 0.06*200=12 for the paired read 7 | --alignIntronMin 20 //min intron 8 | --alignIntronMax 1000000 //max intron 9 |
--alignMatesGapMax 1000000 //max genomic distance between pairs 10 | -------------------------------------------------------------------------------- /source/ChimericDetection.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_ChimericDetection 2 | #define CODE_ChimericDetection 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | #include "Transcript.h" 7 | #include "ChimericAlign.h" 8 | #include "Genome.h" 9 | 10 | class ReadAlign; 11 | 12 | class ChimericDetection { 13 | private: 14 | Parameters &P; 15 | ReadAlign *RA; 16 | Transcript ***trAll; 17 | uint nW, *nWinTr; 18 | char** Read1; 19 | Genome &outGen; 20 | 21 | public: 22 | vector chimAligns; 23 | 24 | ChimericDetection(Parameters &Pin, Transcript ***trAll, uint *nWinTr, char** Read1in, Genome &genomeIn, fstream *ostreamChimJunctionIn, ReadAlign *RA); 25 | bool chimericDetectionMult(uint nWin, uint *readLength, int maxNonChimAlignScore, ReadAlign *PEunmergedRA); 26 | fstream *ostreamChimJunction; 27 | }; 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /source/SoloFeature_addBAMtags.cpp: -------------------------------------------------------------------------------- 1 | #include "SoloFeature.h" 2 | #include "BAMfunctions.h" 3 | #include "SequenceFuns.h" 4 | 5 | void SoloFeature::addBAMtags(char *&bam0, uint32 &size0, char *bam1) 6 | {//add extra tags to the BAM record 7 | 8 | uint64 iread = * ((uint64*) (bam0+size0)); 9 | iread = iread >> 32; //iRead was encoded in the upper 32 bits 10 | 11 | string cb="-"; 12 | string umi="-"; 13 | if (readInfo[iread].cb+1 != 0) 14 | cb = pSolo.cbWLstr[readInfo[iread].cb]; 15 | 16 | if (readInfo[iread].umi+1 != 0) 17 | umi = convertNuclInt64toString(readInfo[iread].umi, pSolo.umiL); 18 | 19 | memcpy(bam1, bam0, size0); 20 | 21 | size0 += bamAttrArrayWrite(cb, "CB", bam1+size0); 22 | size0 += bamAttrArrayWrite(umi, "UB", bam1+size0); 23 | uint32 *bam1i =
(uint32*) bam1; 24 | bam1i[0] = size0-sizeof(uint32); 25 | bam0=bam1; 26 | }; -------------------------------------------------------------------------------- /source/streamFuns.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_streamFuns 2 | #define CODE_streamFuns 3 | 4 | #include "Parameters.h" 5 | #include 6 | 7 | unsigned long long fstreamReadBig(std::ifstream &S, char* A, unsigned long long N); 8 | void fstreamWriteBig(std::ofstream &S, char* A, unsigned long long N, std::string fileName, std::string errorID, Parameters &P) ; 9 | 10 | fstream &fstrOpen (std::string fileName, std::string errorID, Parameters &P, bool flagDelete); 11 | ofstream &ofstrOpen (std::string fileName, std::string errorID, Parameters &P); 12 | ifstream &ifstrOpen (std::string fileName, std::string errorID, std::string solutionString, Parameters &P); 13 | ifstream &ifstrOpenGenomeFile (std::string fileName, std::string errorID, Parameters &P); 14 | void createDirectory(const string dirPathIn, const mode_t dirPerm, const string dirParameter, Parameters &P); 15 | 16 | void copyFile(string fileIn, string fileOut); 17 | #endif 18 | -------------------------------------------------------------------------------- /source/PackedArray.h: -------------------------------------------------------------------------------- 1 | #ifndef PACKEDARRAY_DEF 2 | #define PACKEDARRAY_DEF 3 | 4 | #include "IncludeDefine.h" 5 | 6 | class PackedArray { 7 | private: 8 | uint bitRecMask, wordCompLength; 9 | bool arrayAllocated; //true if charArray was allocated 10 | public: 11 | uint wordLength, length, lengthByte; 12 | uint operator [] (uint ii); 13 | char* charArray; 14 | 15 | PackedArray(); 16 | void defineBits (uint Nbits, uint lengthIn); 17 | void writePacked(uint jj, uint x); 18 | void allocateArray(); 19 | void deallocateArray(); 20 | void pointArray(char* pointerCharIn); 21 | // PackedArray(uint N); 22 | }; 23 | 24 | inline uint PackedArray::operator [] 
(uint ii) { 25 | uint b=ii*wordLength; 26 | uint B=b/8; 27 | uint S=b%8; 28 | 29 | uint a1 = *((uint*) (charArray+B)); 30 | a1 = ((a1>>S)<>wordCompLength; 31 | return a1; 32 | }; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /source/ClipMate.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_ClipMate 2 | #define CODE_ClipMate 3 | 4 | #include "IncludeDefine.h" 5 | #include "ClipCR4.h" 6 | 7 | class ClipMate 8 | { 9 | public: 10 | //clip parameters 11 | int type; //standard sequence clip: 0=5p, 1=3p, -1=no clip; 10/11 = 10X CR4 5/3p clip 12 | uint32 N; 13 | uint32 NafterAd; 14 | string adSeq; 15 | vector adSeqNum; 16 | double adMMp; 17 | 18 | //clipped info from clipChunk 19 | char clippedInfo; 20 | 21 | //clip results 22 | uint32 clippedAdN; //adapter bases clipped 23 | uint32 clippedAdMM; //adapter mismatches 24 | uint32 clippedN; //total number of bases clipped 25 | 26 | ClipCR4 *cr4; //CR4 clipping structure 27 | 28 | void initialize(uint32 Nin, const string &adSeqIn, uint32 afterAdNin, double adMMpIn); 29 | uint32 clip(uint &Lread, char *SeqNum); 30 | void clipChunk(char *chArr, uint64 chSize); 31 | 32 | }; 33 | 34 | #endif -------------------------------------------------------------------------------- /source/ClipCR4.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_ClipCR4 2 | #define CODE_ClipCR4 3 | 4 | #include "IncludeDefine.h" 5 | #include "opal/opal.h" 6 | 7 | class ClipCR4 8 | { 9 | public: 10 | int dbN;//number of sequence in the opal "database" 11 | 12 | vector storeClip; 13 | 14 | // Results for each sequence in database 15 | vector opalRes; 16 | 17 | OpalSearchResult** opalResP; 18 | 19 | //constructor 20 | ClipCR4(); 21 | void opalFillOneSeq(uint32 idb, char *seq, uint32 seqL); 22 | void opalAlign(uint8 *query, uint32 queryLen, int dbN1); 23 | uint32 polyTail3p(char *seq, uint32 seqLen); 24 
| 25 | private: 26 | 27 | uint32 readLen; //sequence length to align against 28 | 29 | int alphabetLength; 30 | int gapOpen; 31 | int gapExt; 32 | vector scoreMatrix; 33 | 34 | // Database 35 | uint8_t* dbSeqArr; 36 | uint8_t** dbSeqs; 37 | int* dbSeqsLen; 38 | 39 | }; 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /extras/scripts/filterCirc.awk: -------------------------------------------------------------------------------- 1 | function cigarGenomicDist(cig) 2 | { 3 | n=split(cig,L,/[A-Z]/)-1; 4 | split(cig,C,/[0-9]*/); 5 | g=0; 6 | for (ii=1;ii<=n;ii++) {//scan through CIGAR operations 7 | if (C[ii+1]!="S" && C[ii+1]!="I") { 8 | g+=L[ii]; 9 | }; 10 | }; 11 | return g; 12 | }; 13 | BEGIN { 14 | endTol=5; 15 | OFS="\t"; 16 | }; 17 | { 18 | if ( $7>=0 && $1==$4 && $3==$6 && (($3=="-" && $5>$2 && $5-$2<1000000) || ($3=="+" && $2>$5 && $2-$5<1000000)) ) 19 | { 20 | #print $1,$2,$5,$3,$7,$8,$9; 21 | #print $11,$11+cigarGenomicDist($12),$13,$13+cigarGenomicDist($14); 22 | if ( ($3=="+" && $11+endTol>$5 && $13+cigarGenomicDist($14)-endTol<=$2) \ 23 | || ($3=="-" && $13+endTol>$2 && $11+cigarGenomicDist($12)-endTol<=$5) ) { 24 | print $1,($3=="+"?$5:$2),($3=="+"?$2:$5),($3=="+"?"-":"+"),($7==0?0:3-$7),$8,$9,(NF>=15 ? 
$15:1); 25 | }; 26 | }; 27 | }; 28 | -------------------------------------------------------------------------------- /extras/scripts/calcInsertCoverage.awk: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | while (getline < TrLengthFile) 3 | { 4 | T[$1]=$2+0; 5 | } 6 | } 7 | 8 | { 9 | if (NR%2==1 && $3 in T) 10 | { 11 | s=$4; 12 | if (s==0) 13 | next; 14 | i=$9; 15 | l=T[$3]; 16 | n=substr($12,6); 17 | 18 | #print s,i,l,n > "alignTr.txt"; 19 | 20 | iN[i]+=1/n; 21 | 22 | t1=int((s-1)/l*100)+1;t2=int((s+i-2)/l*100)+1; 23 | for (ii=t1;ii<=t2;ii++) 24 | C[ii]+=1/n; 25 | 26 | C5[t1]+=1/n; 27 | C3[t2]+=1/n; 28 | 29 | if (n==1) 30 | { 31 | for (ii=t1;ii<=t2;ii++) 32 | Cu[ii]+=1; 33 | 34 | C5u[t1]++; 35 | C3u[t2]++; 36 | iNu[i]++; 37 | } 38 | } 39 | } 40 | END { 41 | for (ii in iN) 42 | print ii,iN[ii] > "insertHist.txt"; 43 | for (ii in iNu) 44 | print ii,iNu[ii] > "insertHistUnique.txt"; 45 | for (ii=1;ii<=101;ii++) 46 | print ii,C[ii]+0,Cu[ii]+0,C5[ii]+0,C3[ii]+0,C5u[ii]+0,C3u[ii]+0 > "coverage.txt" 47 | } 48 | -------------------------------------------------------------------------------- /source/ReadAlign_mappedFilter.cpp: -------------------------------------------------------------------------------- 1 | #include "ReadAlign.h" 2 | 3 | void ReadAlign::mappedFilter() {//filter mapped read, add to stats 4 | unmapType=-1;//mark as mapped 5 | if ( nW==0 ) {//no good windows 6 | statsRA.unmappedOther++; 7 | unmapType=0; 8 | } else if ( (trBest->maxScore < P.outFilterScoreMin) || (trBest->maxScore < (intScore) (P.outFilterScoreMinOverLread*(Lread-1))) \ 9 | || (trBest->nMatch < P.outFilterMatchNmin) || (trBest->nMatch < (uint) (P.outFilterMatchNminOverLread*(Lread-1))) ) {//too short 10 | statsRA.unmappedShort++; 11 | unmapType=1; 12 | } else if ( (trBest->nMM > outFilterMismatchNmaxTotal) || (double(trBest->nMM)/double(trBest->rLength)>P.outFilterMismatchNoverLmax) ) {//too many mismatches 13 | statsRA.unmappedMismatch++; 
14 | unmapType=2; 15 | } else if (nTr > P.outFilterMultimapNmax){//too multi 16 | statsRA.unmappedMulti++; 17 | unmapType=3; 18 | }; 19 | 20 | return; 21 | }; -------------------------------------------------------------------------------- /source/SuffixArrayFuns.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_SuffixArrayFuns 2 | #define CODE_SuffixArrayFuns 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | #include "PackedArray.h" 7 | #include "Genome.h" 8 | 9 | uint medianUint2(uint, uint); 10 | uint compareSeqToGenome(Genome &mapGen, char** s2, uint S, uint N, uint L, uint iSA, bool dirR, bool& comparRes); 11 | uint findMultRange(Genome &mapGen, uint i3, uint L3, uint i1, uint L1, uint i1a, uint L1a, uint i1b, uint L1b, char** s, bool dirR, uint S); 12 | uint maxMappableLength(Genome &mapGen, char** s, uint S, uint N, uint i1, uint i2, bool dirR, uint& L, uint* indStartEnd); 13 | void writePacked(Genome &mapGen, char* a, uint jj, uint x); 14 | uint readPacked(Genome &mapGen, char* a, uint jj); 15 | uint suffixArraySearch1(Genome &mapGen, char** s2, uint S, uint N, uint64 gInsert, bool dirR, uint i1, uint i2, uint L); 16 | int64 funCalcSAi(char *G, uint iL); 17 | uint funCalcSAiFromSA(char* gSeq, PackedArray& gSA, Genome &mapGen, uint iSA, int L, int & iL4); 18 | #endif 19 | -------------------------------------------------------------------------------- /source/sjdbLoadFromStream.cpp: -------------------------------------------------------------------------------- 1 | #include "sjdbLoadFromStream.h" 2 | void sjdbLoadFromStream(ifstream &sjdbStreamIn, SjdbClass &sjdbLoci) { 3 | while (sjdbStreamIn.good()) { 4 | string oneLine,chr1; 5 | uint u1,u2; 6 | char str1; 7 | getline(sjdbStreamIn,oneLine); 8 | istringstream oneLineStream (oneLine); 9 | oneLineStream >> chr1 >> u1 >> u2 >> str1; 10 | if (chr1!="") { 11 | sjdbLoci.chr.push_back(chr1); 12 | sjdbLoci.start.push_back(u1); 13 | 
sjdbLoci.end.push_back(u2); 14 | switch (str1) {//convert numbers to symbols 15 | case '1': 16 | case '+': 17 | str1='+'; 18 | break; 19 | case '2': 20 | case '-': 21 | str1='-'; 22 | break; 23 | default: 24 | str1='.'; 25 | }; 26 | sjdbLoci.str.push_back(str1); 27 | }; 28 | }; 29 | }; -------------------------------------------------------------------------------- /docs/STARconsensus.md: -------------------------------------------------------------------------------- 1 | STARconsensus: mapping RNA-seq reads to consensus genome. 2 | ========================================================= 3 | 4 | * Introduced in STAR 2.7.7a (2020/12/28) 5 | 6 | * Provide the VCF file with consensus SNVs and InDels at the genome generation stage with ```--genomeTransformVCF Variants.vcf --genomeTransformType Haploid```. 7 | The alternative alleles in this VCF will be inserted to the reference genome to create a "transformed" genome. 8 | Both the genome sequence and transcript/gene annotations are transformed. 9 | 10 | * At the mapping stage, the reads will be mapped to the tranformed (consensus) genome. 11 | The quantification in the transformed annotations can be performed with standard ```--quantMode TranscriptomeSAM and/or GeneCounts``` options. 12 | If desired, alignments (SAM/BAM) and spliced junctions (SJ.out.tab) can be transformed back to the original (reference) coordinates with ```--genomeTransformOutput SAM and/or SJ```. 13 | This is useful if downstream processing relies on reference coordinates. 
14 | -------------------------------------------------------------------------------- /source/SoloReadBarcodeStats.h: -------------------------------------------------------------------------------- 1 | #ifndef H_SoloReadBarcodeStats 2 | #define H_SoloReadBarcodeStats 3 | #include "IncludeDefine.h" 4 | 5 | class SoloReadBarcodeStats { 6 | public: 7 | vector names; 8 | enum { noNoAdapter, noNoUMI, noNoCB, noNinCB, noNinUMI, noUMIhomopolymer, noNoWLmatch, noTooManyMM, noTooManyWLmatches, yesWLmatchExact, yesOneWLmatchWithMM, yesMultWLmatchWithMM, nStats}; 9 | uint64 V[nStats]; 10 | SoloReadBarcodeStats() 11 | { 12 | names={"noNoAdapter", "noNoUMI", "noNoCB", "noNinCB", "noNinUMI", "noUMIhomopolymer","noNoWLmatch", "noTooManyMM", "noTooManyWLmatches", "yesWLmatchExact", "yesOneWLmatchWithMM", "yesMultWLmatchWithMM"}; 13 | for (uint32 ii=0; ii> exonLoci; 22 | vector transcriptStrand; 23 | vector transcriptID, geneID; 24 | vector> geneAttr; 25 | 26 | vector> transcriptSeq;//sequences of normal transcripts 27 | vector> transcriptStartEnd;//normal transcripts start/end in the normal genome 28 | 29 | SuperTranscriptome superTrome; 30 | 31 | GTF(Genome &genomeIn, Parameters &Pin, const string &dirOut, SjdbClass &sjdbLoci); 32 | uint64 transcriptGeneSJ(const string &dirOut); 33 | void superTranscript(); 34 | }; 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /source/htslib/htslib_vars.mk: -------------------------------------------------------------------------------- 1 | # Makefile variables useful for third-party code using htslib's public API. 2 | # 3 | # Copyright (C) 2013-2014 Genome Research Ltd. 4 | # 5 | # Author: John Marshall 6 | 7 | # These variables can be used to express dependencies on htslib headers. 8 | # See htslib.mk for details. 
9 | 10 | htslib_bgzf_h = $(HTSPREFIX)htslib/bgzf.h 11 | htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h 12 | htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h) 13 | htslib_hts_h = $(HTSPREFIX)htslib/hts.h 14 | htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h 15 | htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) 16 | htslib_synced_bcf_reader_h = $(HTSPREFIX)htslib/synced_bcf_reader.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_tbx_h) 17 | htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h) 18 | htslib_vcf_h = $(HTSPREFIX)htslib/vcf.h $(htslib_hts_h) $(HTSPREFIX)htslib/kstring.h 19 | htslib_vcf_sweep_h = $(HTSPREFIX)htslib/vcf_sweep.h $(htslib_hts_h) $(htslib_vcf_h) 20 | htslib_vcfutils_h = $(HTSPREFIX)htslib/vcfutils.h $(htslib_vcf_h) 21 | -------------------------------------------------------------------------------- /extras/scripts/tagXSstrandedData.awk: -------------------------------------------------------------------------------- 1 | # usage: 2 | # cat Aligned.out.sam | awk -v strType=2 -f tagXSstrandedData.awk 3 | # strType defines strandedness of the libraries: strType = mate whose strand is the same as RNA strand. 4 | # For instance, for Illumina Tru-seq, strType=2 - the 2nd mate's strand is the same as RNA. 
BEGIN {
    OFS="\t";
    # strand symbol indexed by the 0/1 strand value
    strSym[0]="+";
    strSym[1]="-";
}

# header line, or record with POS==0 (unmapped): pass through unchanged
substr($1,1,1)=="@" || $4==0 {
    print;
    next;
}

{
    flag=$2;

    # 0x10 set -> 1 (reverse strand), otherwise 0
    str=and(flag,0x10)/0x10;

    # single-end records (0x1 not set) count as mate 1;
    # otherwise 0x40 contributes 1 and 0x80 contributes 2
    mate=(and(flag,0x1)==0) ? 1 : and(flag,0x40)/0x40+2*and(flag,0x80)/0x80;

    if (mate!=1 && mate!=2)
    {# mate is not defined - just print
        print;
        next;
    }

    # flip the strand when this mate is not the one that matches the RNA strand
    if (mate!=strType)
        str=1-str;

    print $0 "\t" "XS:A:" strSym[str];
}
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Alexander Dobin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /source/SoloFilteredCells.h: -------------------------------------------------------------------------------- 1 | #ifndef H_SoloFilteredCells 2 | #define H_SoloFilteredCells 3 | 4 | class SoloFilteredCells { 5 | public: 6 | array C; 7 | 8 | uint64 &nCells =C[0]; 9 | uint64 &nReadInCells =C[1]; 10 | uint64 &medianReadPerCellUnique =C[2]; 11 | uint64 &meanReadPerCellUnique =C[3]; 12 | uint64 &nUMIinCells =C[4]; 13 | uint64 &medianUMIperCell =C[5]; 14 | uint64 &meanUMIperCell =C[6]; 15 | uint64 &nGeneInCells =C[7]; 16 | uint64 &medianGenePerCell =C[8]; 17 | uint64 &meanGenePerCell =C[9]; 18 | uint64 &nGeneDetected =C[10]; 19 | uint64 &nCellsSimple =C[11]; 20 | uint64 &nReadInCellsUnique =C[12]; 21 | 22 | vector nReadPerCellUnique, nGenePerCell; 23 | vector filtVecBool; 24 | 25 | void reset(uint64 nCells) { 26 | C={0}; 27 | nReadPerCellUnique.clear(); nReadPerCellUnique.reserve(16384); 28 | nGenePerCell.clear(); nGenePerCell.reserve(16384); 29 | filtVecBool.clear(); filtVecBool.resize(nCells, false); 30 | }; 31 | }; 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /source/opal/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Martin Sosic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
function [ ] = sjMotif( genomeDir, sjIn, sjOut )
%sjMotif add splice junction motif as the last column
%   genomeDir: STAR genome directory (chrName.txt, chrStart.txt and Genome are read from it)
%   sjIn: splice junction loci file name (chr intron_start intron_end), any further
%         columns on the line are carried through unchanged
%   sjOut: output file name; each input line is written back with a 4-letter motif appended
%          (first two and last two intron bases)

%% load genome
% chromosome names, one per line
fin1=fopen([genomeDir '/chrName.txt']);
chrName=textscan(fin1,'%s');
fclose(fin1);
chrName=chrName{1};

% chromosome start offsets inside the Genome file
% NOTE(review): presumably 0-based offsets, so that offset + 1-based position is a valid
% 1-based MATLAB index - confirm against the genome generation code
fin1=fopen([genomeDir '/chrStart.txt']);
chrStart=textscan(fin1,'%f');
fclose(fin1);
chrStart=chrStart{1};

% whole genome as raw bytes; each byte is used below as an index into 'ACGTN'
fin1=fopen([genomeDir '/Genome']);
G=fread(fin1,inf,'*uint8');
fclose(fin1);

%% read sj file
% sj{1}: chr names, sj{2}: intron start, sj{3}: intron end, sj{4}: rest of the line
fin1=fopen(sjIn);
sj=textscan(fin1,'%s %f %f %[^\n]');
fclose(fin1);

%% chr
% map every junction to the index of its chromosome in chrName;
% a junction whose chromosome is not listed keeps index 0, and chrStart(chrI) below then fails
chrI=zeros(length(sj{1}),1);
for ii=1:length(chrName)
    chrI(strcmp(chrName{ii},sj{1}))=ii;
end

% genome byte values 0..4 select A,C,G,T,N (the +1 converts to 1-based indexing)
nT='ACGTN';
a=chrStart(chrI)+sj{2};
b=chrStart(chrI)+sj{3};
% one row per junction: bases at intron start, start+1, end-1, end
m=nT([G(a) G(a+1) G(b-1) G(b)]+1);

fout1=fopen(sjOut,'w');
for ii=1:length(chrI)
    fprintf(fout1,'%s\t%i\t%i\t%s\t%s\n',sj{1}{ii},sj{2}(ii),sj{3}(ii),sj{4}{ii},m(ii,:));
end
fclose(fout1);

end
-------------------------------------------------------------------------------- 1 | #include "InOutStreams.h" 2 | #include "GlobalVariables.h" 3 | 4 | InOutStreams::InOutStreams() { 5 | logStdOut=NULL; 6 | outSAM=NULL; 7 | outBAMfileUnsorted=NULL; 8 | outQuantBAMfile=NULL; 9 | }; 10 | 11 | InOutStreams::~InOutStreams() { 12 | 13 | if (logStdOut!=NULL) logStdOut->flush(); 14 | if (outSAM!=NULL) outSAM->flush(); 15 | 16 | logStdOutFile.flush(); 17 | outSAMfile.flush(); 18 | 19 | outChimSAM.flush(); 20 | outChimJunction.flush(); 21 | logProgress.flush(); 22 | logMain.flush(); 23 | logFinal.flush(); 24 | outLocalChains.flush(); 25 | 26 | //logStdOutFile.close(); //do not want to close these log files, as some destructors (e.g. ~SharedMemory) might still write there 27 | //logMain.close(); 28 | 29 | outSAMfile.close(); 30 | outChimSAM.close(); 31 | outChimJunction.close(); 32 | logProgress.close(); 33 | logFinal.close(); 34 | outLocalChains.close(); 35 | 36 | 37 | for (int ii=0;ii<2;ii++) { 38 | if (outUnmappedReadsStream[ii].is_open()) { 39 | outUnmappedReadsStream[ii].flush(); 40 | outUnmappedReadsStream[ii].close(); 41 | } 42 | }; 43 | }; 44 | 45 | -------------------------------------------------------------------------------- /source/SpliceGraph.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Created by Fahimeh Mirhaj on 6/10/19. 
3 | */ 4 | using namespace std; 5 | 6 | #include "SpliceGraph.h" 7 | #include "GTF.h" 8 | SpliceGraph::SpliceGraph (SuperTranscriptome &superTrome, Parameters &P, ReadAlign *RA) : superTrome(superTrome), P(P), RA(RA) 9 | { 10 | //find candidate superTr 11 | superTrSeedCount = new typeSuperTrSeedCount[2*superTrome.N];//TODO: for stranded data, do not need 2nd strand 12 | 13 | //Smith-Waterman 14 | scoringMatrix = new typeAlignScore*[superTrome.sjDonorNmax+2]; 15 | scoreTwoColumns[0] = new typeAlignScore[maxSeqLength]; 16 | scoreTwoColumns[1] = new typeAlignScore[maxSeqLength]; 17 | for(uint32 ii = 0; ii < superTrome.sjDonorNmax+2; ii++) { 18 | scoringMatrix[ii] = new typeAlignScore[maxSeqLength];//TODO make it a user parameter 19 | }; 20 | sjDindex = new uint32[superTrome.sjDonorNmax]; 21 | 22 | //rowCol.reserve(100000); 23 | //rowSJ.reserve(100000); 24 | //blockCoord.reserve(100000); 25 | //blockSJ.reserve(10000); 26 | }; 27 | 28 | SpliceGraph::~SpliceGraph() { 29 | for(uint i = 0; i < maxSeqLength; i++) { 30 | delete[] scoringMatrix[i]; 31 | }; 32 | delete[] scoringMatrix; 33 | }; 34 | -------------------------------------------------------------------------------- /extras/scripts/sjFromSAMcollapseUandM.awk: -------------------------------------------------------------------------------- 1 | BEGIN { 2 | OFS="\t"; 3 | mapqU=255; 4 | } 5 | { 6 | if (substr($1,1,1)!="@") { 7 | 8 | # not used 9 | #m=and($2,0x80)/0x80+1; 10 | #m = int($2/0x80)%2 + 1; #in case and() is not available 11 | 12 | if ($1!=readNameOld) delete readSJs; 13 | readNameOld=$1; 14 | 15 | n=split($6,L,/[A-Z]/)-1; 16 | split($6,C,/[0-9]*/); 17 | t=1;g=$4; 18 | for (k=1;k<=n;k++) {#scan through CIGAR operations 19 | if (C[k+1]=="S" || C[k+1]=="I") { 20 | t+=L[k]; 21 | } else if (C[k+1]=="D") { 22 | g+=L[k]; 23 | } else if (C[k+1]=="N") { 24 | sj1=$3 "\t" g "\t" g+L[k]-1; 25 | readSJs[sj1]++; 26 | if (readSJs[sj1]==1) {#only count this junction if it has not been counted for the same read 27 | 
SJ[sj1]=1; 28 | if ($5>=mapqU) { 29 | SJu[sj1]++; 30 | } else { 31 | SJm[sj1]++; 32 | }; 33 | }; 34 | g+=L[k]; 35 | } else { # M operation 36 | g+=L[k]; 37 | t+=L[k]; 38 | }; 39 | }; 40 | }; 41 | }; 42 | END { 43 | 44 | for (ii in SJ) { 45 | print ii, SJu[ii]+0, SJm[ii]+0; 46 | }; 47 | 48 | }; 49 | -------------------------------------------------------------------------------- /source/ReadAlign_chimericDetectionPEmerged.cpp: -------------------------------------------------------------------------------- 1 | #include "ReadAlign.h" 2 | #include "BAMfunctions.h" 3 | 4 | 5 | void ReadAlign::chimericDetectionPEmerged(ReadAlign &seRA) { 6 | 7 | chimRecord=false; 8 | if (P.pCh.segmentMin==0) {//no chimeric detection requested 9 | return; 10 | }; 11 | 12 | if (P.pCh.multimapNmax==0) { 13 | 14 | // runs old chimeric detection routines. 15 | 16 | seRA.multMapSelect(); //this needs to be done for ChimericDetectionOld, may not need it for the new algorithm 17 | seRA.mappedFilter(); 18 | 19 | chimRecord=seRA.chimericDetectionOld(); 20 | if (!chimRecord) { 21 | return; 22 | }; 23 | 24 | peOverlapChimericSEtoPE(&seRA.trChim[0], &seRA.trChim[1], &trChim[0], &trChim[1]); 25 | chimericDetectionOldOutput(); 26 | 27 | } else if (trBest->maxScore <= (int) (readLength[0]+readLength[1]) - (int) P.pCh.nonchimScoreDropMin) {//require big enough drop in the best score 28 | 29 | // new chimeric detection routine 30 | 31 | chimRecord=seRA.chimDet->chimericDetectionMult(seRA.nW, seRA.readLength, seRA.trBest->maxScore, this); 32 | }; 33 | 34 | if ( chimRecord ) { 35 | statsRA.chimericAll++; 36 | }; 37 | 38 | return; 39 | }; 40 | -------------------------------------------------------------------------------- /source/sjdbLoadFromFiles.cpp: -------------------------------------------------------------------------------- 1 | #include "sjdbLoadFromFiles.h" 2 | #include "sjdbLoadFromStream.h" 3 | #include "ErrorWarning.h" 4 | #include "TimeFunctions.h" 5 | 6 | void sjdbLoadFromFiles(Parameters &P, 
SjdbClass &sjdbLoci) { 7 | 8 | if (P.pGe.sjdbFileChrStartEnd.at(0)!="-") { 9 | for (uint ifile=0;ifilelogMain, EXIT_CODE_INPUT_FILES, P); 15 | }; 16 | 17 | sjdbLoadFromStream(sjdbStreamIn, sjdbLoci); 18 | 19 | sjdbLoci.priority.resize(sjdbLoci.chr.size(),10); 20 | 21 | time_t rawtime; 22 | time ( &rawtime ); 23 | P.inOut->logMain << timeMonthDayTime(rawtime) << " Loaded database junctions from the pGe.sjdbFileChrStartEnd file(s), total number of junctions:" << sjdbLoci.chr.size()<<"\n\n"; 24 | }; 25 | }; //if (P.pGe.sjdbFileChrStartEnd!="-") 26 | 27 | }; -------------------------------------------------------------------------------- /source/Quantifications.cpp: -------------------------------------------------------------------------------- 1 | #include "Quantifications.h" 2 | 3 | Quantifications::Quantifications (uint32 nGeIn) { 4 | 5 | geneCounts.nType=3; 6 | geneCounts.cAmbig = new uintQ[geneCounts.nType]; 7 | geneCounts.cNone = new uintQ[geneCounts.nType]; 8 | 9 | geneCounts.nGe=nGeIn; 10 | geneCounts.gCount = new uintQ* [geneCounts.nType]; 11 | 12 | geneCounts.cMulti=0; 13 | for (int itype=0; itype 3 | 4 | bool SoloBarcode::extractBarcode(string &seqIn, string &qualIn, const uint32 adapterStart, string &bSeq, string &bQual) 5 | {//input: sequence seqIn, adapter start adapterStart 6 | //output: start position of the barcode 7 | array pos={0,0}; 8 | for (uint32 ii=0; ii<2; ii++) { 9 | switch (anchorType[ii]) {//this calculates the position of the anchor base 10 | case 0: //read start 11 | pos[ii]=0; 12 | break; 13 | case 1: //read end 14 | pos[ii]=seqIn.size()-1; 15 | break; 16 | case 2: //adapter start 17 | pos[ii]=(int32)adapterStart; 18 | break; 19 | case 3: //adapter end 20 | pos[ii]=(int32)adapterStart+adapterLength-1; 21 | break; 22 | }; 23 | pos[ii]+=anchorDist[ii]; 24 | }; 25 | 26 | bSeq=""; 27 | bQual=""; 28 | if (pos[0]<0 || pos[1]>(int32)seqIn.size() || pos[0]>pos[1]) //something went wrong 29 | return false; 30 | 31 | bSeq 
=seqIn.substr(pos[0],pos[1]-pos[0]+1); 32 | bQual=qualIn.substr(pos[0],pos[1]-pos[0]+1); 33 | return true; 34 | }; 35 | -------------------------------------------------------------------------------- /source/PackedArray.cpp: -------------------------------------------------------------------------------- 1 | # include "PackedArray.h" 2 | 3 | PackedArray::PackedArray() { 4 | charArray=NULL; 5 | arrayAllocated=false; 6 | }; 7 | 8 | void PackedArray::defineBits(uint Nbits, uint lengthIn){ 9 | wordLength=Nbits; 10 | wordCompLength=sizeof(uint)*8LLU-wordLength; 11 | bitRecMask=(~0LLU)>>wordCompLength; 12 | length=lengthIn; 13 | lengthByte=(length-1)*wordLength/8LLU+sizeof(uint); 14 | // lengthByte=((lengthByte+sizeof(uint)-1LLU)/sizeof(uint))*sizeof(uint); 15 | }; 16 | 17 | void PackedArray::writePacked( uint jj, uint x) { 18 | uint b=jj*wordLength; 19 | uint B=b/8LLU; 20 | uint S=b%8LLU; 21 | 22 | x = x << S; 23 | uint* a1 = (uint*) (charArray+B); 24 | *a1 = ( (*a1) & ~(bitRecMask<c1 ? c2:c1); 29 | if (maxc12==0) 30 | maxc12=1e-9; 31 | rd=(c2-c1)/maxc12; 32 | 33 | print c1, c2 > "counts.txt"; 34 | 35 | MARD += (rd>0 ? 
rd:-rd); 36 | if (rd==1) NrdP1++; 37 | if (rd==-1) NrdM1++; 38 | if (rd==0) Nrd0++; 39 | 40 | cc[1]=log(c1+1); 41 | cc[2]=log(c2+1); 42 | sum12 += cc[1]*cc[2]; 43 | for (ii=1; ii<=2; ii++) { 44 | sum1[ii] += cc[ii]; 45 | sum2[ii] += cc[ii]^2; 46 | }; 47 | 48 | }; 49 | 50 | print "MARD", MARD/n, NrdM1, Nrd0, NrdP1, n-NrdM1-Nrd0-NrdP1; 51 | print "R2", (n*sum12-sum1[1]*sum1[2])^2 / (n*sum2[1]-(sum1[1])^2) / (n*sum2[2]-(sum1[2])^2); 52 | 53 | }; 54 | -------------------------------------------------------------------------------- /source/ReadAlign_mapOneReadSpliceGraph.cpp: -------------------------------------------------------------------------------- 1 | #include "ReadAlign.h" 2 | #include "SequenceFuns.h" 3 | #include "Stats.h" 4 | #include "serviceFuns.cpp" 5 | 6 | void ReadAlign::mapOneReadSpliceGraph() 7 | { 8 | #ifdef OFF_BEFORE_SEEDING 9 | #warning OFF_BEFORE_SEEDING 10 | nW=0; 11 | return; 12 | #endif 13 | 14 | if (LreadrLength=0; //min good piece length 17 | nW=0; 18 | return; 19 | }; 20 | 21 | resetN(); //reset aligns counters to 0 22 | 23 | //reset/initialize a transcript 24 | trInit->reset(); 25 | trInit->Chr=0; trInit->Str=0; trInit->roStr=0; trInit->cStart=0; trInit->gLength=0; //to generate nice output of 0 for non-mapped reads 26 | trInit->iRead=iRead; 27 | trInit->Lread=Lread; 28 | trInit->nExons=0; 29 | trInit->readLengthOriginal=readLengthOriginal; 30 | trInit->readLengthPairOriginal=readLengthPairOriginal; 31 | trInit->readLength=readLength; 32 | trInit->readNmates=readNmates; //not readNends: this is alignment 33 | trInit->readName=readName; 34 | 35 | trBest=trInit; 36 | 37 | splGraph->findSuperTr(Read1[0], Read1[2], Lread, readName, mapGen); 38 | 39 | return; 40 | }; 41 | -------------------------------------------------------------------------------- /source/funCompareUintAndSuffixesMemcmp.cpp: -------------------------------------------------------------------------------- 1 | #include "funCompareUintAndSuffixesMemcmp.h" 2 | #include //for 
// Buffered BAM output. Two construction modes are declared:
//   - coordinate-sorted output, where alignments are distributed into genomic bins
//     (constructor taking a chunk index and a temporary directory, coord* methods);
//   - unsorted output, written through a BGZF handle (constructor taking BGZF*, unsorted* methods).
// NOTE(review): which members are used in which mode is defined in the .cpp, not visible here.
class BAMoutput {//
public:
    //sorted output
    BAMoutput (int iChunk, string tmpDir, Parameters &Pin);
    void coordOneAlign (char *bamIn, uint bamSize, uint iRead);
    void coordBins ();
    void coordFlush ();
    //unsorted output
    BAMoutput (BGZF *bgzfBAMin, Parameters &Pin);
    void unsortedOneAlign (char *bamIn, uint bamSize, uint bamSize2);
    void unsortedFlush ();
    void coordUnmappedPrepareBySJout();

    uint32 nBins; //number of bins to split genome into
    uint* binTotalN; //total number of aligns in each bin
    uint* binTotalBytes;//total size of aligns in each bin
private:
    uint64 bamArraySize; //this size will be allocated
    char* bamArray; //large array to store the bam alignments, pre-sorted
    uint64 binSize, binSize1;//storage size of each bin
    uint64 binGlen;//bin genomic length
    char **binStart; //pointers to starts of the bins
    uint64 *binBytes, binBytes1;//number of bytes currently written to each bin
    ofstream **binStream;//output streams for each bin
    BGZF *bgzfBAM; //BGZF handle; presumably the one passed to the unsorted-output constructor - confirm in the .cpp
    Parameters &P; //run parameters (reference, not owned)
    string bamDir; //NOTE(review): presumably the directory for the per-bin files - confirm in the .cpp
};
# Collects splice junctions (CIGAR N operations) from SAM records.
# Output columns: chr, intron start, intron end,
#   number of reads with MAPQ>=mapqU / MAPQ<mapqU crossing the junction (counted once per read name),
#   number of junction occurrences with MAPQ>=mapqU / MAPQ<mapqU (every occurrence counted).
# Records of the same read are expected to be adjacent (the per-read set is reset when the name changes).
BEGIN {
    OFS="\t";
    mapqU=255; # MAPQ at or above this value counts as unique
}

{
    if (substr($1,1,1)=="@")
        next; # header line

    if ($1!=readNameOld) delete readSJs;
    readNameOld=$1;

    # L[k] = length of the k-th CIGAR operation, C[k+1] = its operation letter
    n=split($6,L,/[A-Z]/)-1;
    split($6,C,/[0-9]*/);

    g=$4; # genomic coordinate of the next reference-consuming operation
    for (k=1;k<=n;k++) {#scan through CIGAR operations
        op=C[k+1];
        if (op=="S" || op=="I" || op=="H" || op=="P") {
            # these operations do not consume the reference. H and P used to fall into the
            # catch-all branch below and shifted every following junction coordinate.
            continue;
        } else if (op=="N") {
            sj1=$3 "\t" g "\t" g+L[k]-1;
            readSJs[sj1]++;

            if (readSJs[sj1]==1) {#only count this junction if it has not been counted for the same read
                SJ[sj1]=1;
                if ($5>=mapqU) {
                    SJu[sj1]++;
                } else {
                    SJm[sj1]++;
                }
            }

            # every occurrence, including repeats within the same read
            if ($5>=mapqU) {
                SJu1[sj1]++;
            } else {
                SJm1[sj1]++;
            }

            g+=L[k];
        } else { # D and M (and anything else): advance along the reference
            g+=L[k];
        }
    }
}

END {
    for (ii in SJ) {
        print ii, SJu[ii]+0, SJm[ii]+0, SJu1[ii]+0, SJm1[ii]+0;
    }
}
# Converts splice junctions given as BED-like lines (chrom, start, end; strand read from column 6)
# into BED12 and GTF files with two flanking blocks of blockLen bases each.
# usage: awk -v blockLen=10 -f sjBED12.awk sjIn.bed
# Output is written next to the input: <input>.bed12 and <input>.gtf

BEGIN {
    OFS="\t";
    # without a block length blockSizes became "," and the BED12 was invalid;
    # fall back to the value shown in the usage line
    if (blockLen=="")
        blockLen=10;
}

{
    if ($1=="")
        next;

    #################### input
    chrom=$1;
    chromStart1=$2;
    chromEnd1=$3;
    name="sj-" NR; # junction name = running record number (never empty)

    score=1000;

    strand = $6;
    if (strand!="+" && strand!="-")
        strand=".";

    itemRgb="180,0,0";

    #################### BED12
    chromStart = chromStart1-blockLen;
    chromEnd = chromEnd1+blockLen;
    thickStart = chromStart;
    thickEnd = chromEnd;
    blockCount = 2;
    blockSizes = blockLen "," blockLen;
    blockStarts = 0 "," (chromEnd1-chromStart);

    # the redirection target is parenthesized: an unparenthesized concatenation after ">"
    # is parsed differently by different awk implementations
    print chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts > (FILENAME ".bed12");

    ####### GTF
    source = "STAR_SJ";
    feature = "exon";
    frame = ".";
    group = "transcript_id \"" name "\"; gene_id \"" name "\";"

    # one exon record for each flanking block
    print chrom, source, feature, chromStart1-blockLen+1, chromStart1, score, strand, frame, group > (FILENAME ".gtf");
    print chrom, source, feature, chromEnd1+1, chromEnd1+blockLen, score, strand, frame, group > (FILENAME ".gtf");
}
-------------------------------------------------------------------------------- /source/Genome_insertSequences.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * inserts sequences into the SA and SAi indices 3 | */ 4 | #include "Genome.h" 5 | #include "genomeScanFastaFiles.h" 6 | #include "insertSeqSA.h" 7 | #include "TimeFunctions.h" 8 | 9 | void Genome::insertSequences() 10 | { 11 | if (pGe.gFastaFiles.at(0)!="-") 12 | { 13 | time_t rawtime; 14 | time ( &rawtime ); 15 | P.inOut->logMain << timeMonthDayTime(rawtime) << " ..... inserting extra sequences into genome indexes" <> wl;//whitelists, one for each length 21 | vector> wlEd;//edited whitelists (i.e. including mismatches and indels) 22 | vector> wlEdInd;//index for wlEd in the unedited wl 23 | 24 | uint64 wlFactor;//factor and modulo for converting each whitelist index into global index 25 | vector wlAdd;//additive for each length 26 | uint32 minLen;//minimum length for this barcode 27 | uint32 totalSize;//total size of all whitelists 28 | 29 | uint64 iLen, iCB;//indexes, used in ParametersSolo.cpp 30 | 31 | bool extractBarcode(string &seqIn, string &qualIn, const uint32 aStart, string &bSeq, string &bQual); 32 | void sortWhiteList(ParametersSolo *pSolo); 33 | void extractPositionsFromString(string &strIn); 34 | 35 | //SoloBarcode(ParametersSolo *pSolo) : pSolo(pSolo) {}; 36 | }; 37 | 38 | #endif -------------------------------------------------------------------------------- /source/ErrorWarning.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | functions that handle errors and warnings 3 | */ 4 | #include "ErrorWarning.h" 5 | #include "TimeFunctions.h" 6 | #include "GlobalVariables.h" 7 | 8 | void exitWithError(string messageOut, ostream &streamOut1, ostream &streamOut2, int errorInt, Parameters &P) 9 | { 10 | if (P.runThreadN>1) 11 | pthread_mutex_lock(&g_threadChunks.mutexError); 12 | time_t timeCurrent; 13 | 
// One candidate chimeric alignment: a pair of chimeric segments together with
// the junction coordinates, repeat lengths, motif, strand and score describing it.
class ChimericAlign
{//
    public:
        ChimericSegment seg1, seg2; //two chimeric segments (the constructor copies seg1in/seg2in into them)
// std::unique_ptr al1, al2; //two chimeric alignments - modified by chimeric switching
        // The constructor points al1/al2 at seg1.align/seg2.align and swaps the two
        // pointers if needed so that al1->roStart <= al2->roStart.
        Transcript *al1, *al2;
        // Exon indices chosen by the constructor:
        //   ex1 = 0 if al1->Str==1, otherwise the last exon of al1;
        //   ex2 = 0 if al2->Str==0, otherwise the last exon of al2.
        uint ex1, ex2;

        uint chimJ1, chimJ2, chimRepeat1, chimRepeat2;
        int chimMotif, chimStr, chimScore; //chimScore is initialised from chimScoreIn by the constructor

        ChimericAlign(ChimericSegment &seg1in, ChimericSegment &seg2in, int chimScoreIn, Genome &genomeIn, ReadAlign *RAin); //allocate
        void chimericJunctionOutput(fstream &outStream, uint chimN, int maxNonChimAlignScore, bool PEmerged_flag, int chimScoreBest, int maxPossibleAlignScore);
        static void chimericBAMoutput(Transcript *al1, Transcript *al2, ReadAlign *RA, const uint iTr, const uint chimN, const bool isBestChimAlign, const Parameters& P);
        void chimericStitching(char *genSeq, char **Read1);
        // Returns false if exon ex1 of al1 has a larger EX_iFrag than exon ex2 of al2,
        // or if both have the same EX_iFrag and either exon length (EX_L) is below
        // pCh.junctionOverhangMin; returns true otherwise.
        bool chimericCheck();

        bool stitchingDone; //set to false by the constructor

        ReadAlign *RA; //stored from the constructor argument RAin
    private:
        // NOTE: declaration order matters here - the constructor initialises P from
        // seg1in.P and then pCh from P.pCh, so P has to be declared before pCh.
        Parameters &P;
        ParametersChimeric &pCh;
        Genome &mapGen; //bound to the constructor argument genomeIn

};
-------------------------------------------------------------------------------- /source/soloInputFeatureUMI.cpp: -------------------------------------------------------------------------------- 1 | #include "soloInputFeatureUMI.h" 2 | #include "SoloReadFeature.h" 3 | #include "binarySearch2.h" 4 | 5 | bool soloInputFeatureUMI(fstream *strIn, int32 featureType, bool readInfoYes, array,2> &sjAll, uint64 &iread, 6 | int32 &cbmatch, uint32 &feature, uint64 &umi, vector &featVecU32, SoloReadFlagClass &readFlagCounts) 7 | { 8 | if (!(*strIn >> umi)) //end of file 9 | return false; 10 | 11 | if (readInfoYes) { 12 | *strIn >> iread >> readFlagCounts.flag; 13 | }; 14 | 15 | switch (featureType) { 16 | case SoloFeatureTypes::Gene : 17 | case SoloFeatureTypes::GeneFull : 18 | case SoloFeatureTypes::GeneFull_Ex50pAS : 19 | case SoloFeatureTypes::GeneFull_ExonOverIntron : 20 | *strIn >> feature; 21 | break; 22 | 23 | case SoloFeatureTypes::SJ : 24 | uint64 sj[2]; 25 | *strIn >> sj[0] >> sj[1]; 26 | feature=(uint32) binarySearch2(sj[0],sj[1],sjAll[0].data(),sjAll[1].data(),sjAll[0].size()); 27 | break; 28 | 29 | case SoloFeatureTypes::Transcript3p : 30 | feature=0; 31 | uint32 ntr, in1; 32 | *strIn >> ntr; 33 | featVecU32.resize(2*ntr); 34 | for (uint32 ii=0; ii<2*ntr; ii++) { 35 | *strIn >> in1; 36 | featVecU32[ii]=in1; 37 | }; 38 | break; 39 | }; 40 | 41 | *strIn >> cbmatch; 42 | 43 | return true; 44 | }; -------------------------------------------------------------------------------- /source/ChimericAlign.cpp: -------------------------------------------------------------------------------- 1 | #include "ChimericAlign.h" 2 | 3 | ChimericAlign::ChimericAlign(ChimericSegment &seg1in, ChimericSegment &seg2in, int chimScoreIn, Genome &genomeIn, ReadAlign *RAin) 4 | : seg1(seg1in), seg2(seg2in),chimScore(chimScoreIn), RA(RAin), P(seg1in.P), pCh(P.pCh), mapGen(genomeIn) { 5 | stitchingDone=false; 6 | 7 | al1=&seg1.align; 8 | al2=&seg2.align; 9 | 10 | if (al1->roStart > 
#
# merges ReadsPerGene.out.tab files from multiple runs into one table
# (semicolon-separated: one row per gene, one column per input file)
#
# gawk is required: the script relies on ARGIND.
#
# usage:
# awk -f mergeGeneCounts.awk -v Col=2 /path/to/1st/ReadsPerGene.out.tab /path/to/2nd/ReadsPerGene.out.tab ...
# e.g.
# awk -f mergeGeneCounts.awk -v Col=2 */ReadsPerGene.out.tab
#
# -v Col=<column number>: column taken from every input file (positive integer);
#                         which one to use depends on the strandedness of the library
# advanced parameters
# -v Skip=...: accepted, but not used by the code below
# -v Name=...: regular expression removed from the file names in the header line,
#              default "/ReadsPerGene.out.tab"
#
# Output:
#   header line:  ;<file 1>;<file 2>;...
#   gene lines:   <gene>;<value from file 1>;<value from file 2>;...
# A gene line contains only the gene name if the first file has no value for it.
# Warnings and errors are written to stderr.


BEGIN {
    FS="\t";
    stderr="/dev/stderr"; # has to be a string: an unquoted /dev/err is not a file name

    if (Name=="") Name="/ReadsPerGene.out.tab";
    if (Skip=="") Skip=0;
    if (Col !~ /^[1-9][0-9]*$/) {# missing or not a positive integer
        print "Specify the column with -v Col=..." > stderr;
        exit 1; # END still runs, but prints nothing since no gene was recorded
    };

    for (jj=1;jj<ARGC;jj++) # input files are ARGV[1]..ARGV[ARGC-1]
    {# print header line with file names
        a=ARGV[jj];
        gsub(Name,"",a);
        printf ";%s", a; # never use data as the printf format string
    };
    printf "\n";
}
{
    if (ARGIND==1) {
        L[FNR]=$1; # record gene names (1st column)
    } else {
        # (FNR in L) is tested first so that the lookup does not add empty elements to L
        if (!(FNR in L) || $1!=L[FNR]) {
            print "File #" ARGIND ": " FILENAME " is not sorted properly, sort all files by the first column" > stderr;
        };
    };

    V[FNR,ARGIND]=$Col; # user-selected column
}
END {
    for (ii=1;ii<=length(L);ii++)
    {
        printf "%s",L[ii];
        if (V[ii,1]!="")
            for (jj=1;jj<ARGC;jj++)
                printf ";%s", V[ii,jj];
        printf "\n"
    }
}
# Counts distinct UB values per (gene, cell barcode) pair from SAM records that carry
# GX, CB and UB tags.
#
# gawk >= 4.0 is required: the script uses arrays of arrays.
#
# usage:
# samtools view Aligned.sortedByCoord.out.bam | awk -v fileWL=Solo.out/Gene/raw/barcodes.tsv -v fileGenes=Solo.out/Gene/raw/features.tsv \
#                                                   -f soloCountMatrixFromBAM.awk | sort -k2,2n -k1,1n > matrix.mtx
#
# output, one line per (gene, barcode) pair:
#   <line number of the gene in fileGenes> <line number of the barcode in fileWL> <number of distinct UB values>


function getTag(tag)
{# value of the given SAM tag (e.g. "GX:Z:") in the current record; 0 if the tag is absent
    tagOut=$0;
    if (gsub(".*" tag,"",tagOut)==0)
        return 0;
    gsub("\t.*","",tagOut);
    return tagOut;
}

function loadIndex(fileName, ind,    n, status)
{# fills ind[<1st column>]=<1-based line number> from fileName, returns the number of lines read
    n=0;
    # getline returns 1 for a record, 0 at end of file and -1 on error: without the
    # explicit ">0" an unreadable file would make this loop run forever
    while ((status = (getline < fileName)) > 0) {
        n++;
        ind[$1]=n;
    };
    if (status < 0) {
        print "EXITING: could not read file: " fileName > "/dev/stderr";
        exit 1;
    };
    close(fileName);
    return n;
}

BEGIN {

    stderr="/dev/stderr";

    loadIndex(fileWL, WL);
    #print length(WL) > stderr;

    loadIndex(fileGenes, geneID);
    #print length(geneID) > stderr;
}

{
    #if ((NR%1000000)==0) printf NR/1000000 " "$3 " " nTot+0 " " > "/dev/stderr";

    # records without a usable gene, cell barcode or UMI are skipped
    GX=getTag("GX:Z:");
    if (GX=="0" || GX=="-")
        next;

    CB=getTag("CB:Z:");
    if (CB=="0" || CB=="-")
        next;

    UB=getTag("UB:Z:");
    if (UB=="0" || UB=="-")
        next;

    # this is needed for CR
    #if (substr(CB,length(CB)-1,1)=="-")
    #    CB=substr(CB,1, length(CB)-2);


    cb=WL[CB];
    ge=geneID[GX];


    nTot++;
    U[cb][ge][UB]++; # one element per distinct UB, its value is the number of records

}

END {
    for (cb in U) {
        for (ge in U[cb]) {
            print ge,cb,length(U[cb][ge]);
        }
    }
}
-------------------------------------------------------------------------------- 1 | #ifndef CODE_Variation 2 | #define CODE_Variation 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | #include 7 | 8 | // struct SNPnt 9 | // { 10 | // char ref; 11 | // char a1; 12 | // char a2; 13 | // }; 14 | 15 | class SNP 16 | { 17 | public: 18 | uint32 N; //number of snps 19 | uint* loci; //snp coordinates 20 | vector lociV; //snp coordinates vector 21 | // SNPnt* nt; //reference and alternative bases 22 | // char **nt; //reference and alternative bases 23 | // char *nt1; //1D array to store nt 24 | vector> nt;//reference and alternative bases 25 | 26 | //methods 27 | void snpOnBlocks(uint blockStart, uint blockL, int blockShift, vector>> &snpV); 28 | }; 29 | 30 | class Variation 31 | { 32 | public: 33 | //methods 34 | Variation (Parameters &Pin, vector &chrStart, map &chrNameIndex, bool yesVar); //create transcriptome structure, load and initialize parameters 35 | void loadVCF(string fileIn); //load VCF file 36 | vector>> sjdbSnp(uint sjStart, uint sjEnd, uint sjdbOverhang1); //calculates snp loci in sjdb sequences 37 | 38 | //variables 39 | bool yes; 40 | SNP snp; 41 | 42 | Parameters &P; //TODO: make this private 43 | 44 | private: 45 | string vcfFile; 46 | //string varOutFileName; 47 | //ofstream varOutStream;//output file for variations 48 | 49 | vector &chrStart; //this needs to be replaced with a structure that contains basic genome variables 50 | map &chrNameIndex; 51 | }; 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /source/htslib/htslib/kfunc.h: -------------------------------------------------------------------------------- 1 | #ifndef __KFUNC_H__ 2 | #define __KFUNC_H__ 3 | 4 | /* Log gamma function 5 | * \log{\Gamma(z)} 6 | * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 7 | */ 8 | double kf_lgamma(double z); 9 | 10 | /* complementary error function 11 | * \frac{2}{\sqrt{\pi}} 
\int_x^{\infty} e^{-t^2} dt 12 | * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66 13 | */ 14 | double kf_erfc(double x); 15 | 16 | /* The following computes regularized incomplete gamma functions. 17 | * Formulas are taken from Wiki, with additional input from Numerical 18 | * Recipes in C (for modified Lentz's algorithm) and AS245 19 | * (http://lib.stat.cmu.edu/apstat/245). 20 | * 21 | * A good online calculator is available at: 22 | * 23 | * http://www.danielsoper.com/statcalc/calc23.aspx 24 | * 25 | * It calculates upper incomplete gamma function, which equals 26 | * kf_gammaq(s,z)*tgamma(s). 27 | */ 28 | 29 | double kf_gammap(double s, double z); 30 | double kf_gammaq(double s, double z); 31 | 32 | /* Regularized incomplete beta function. The method is taken from 33 | * Numerical Recipe in C, 2nd edition, section 6.4. The following web 34 | * page calculates the incomplete beta function, which equals 35 | * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b): 36 | * 37 | * http://www.danielsoper.com/statcalc/calc36.aspx 38 | */ 39 | double kf_betai(double a, double b, double x); 40 | 41 | /* 42 | * n11 n12 | n1_ 43 | * n21 n22 | n2_ 44 | * -----------+---- 45 | * n_1 n_2 | n 46 | */ 47 | double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /source/SequenceFuns.h: -------------------------------------------------------------------------------- 1 | /** general basic functions to operate on sequences, no classes 2 | */ 3 | 4 | #ifndef SEQUENCEFUNS_DEF 5 | #define SEQUENCEFUNS_DEF 6 | 7 | #include "IncludeDefine.h" 8 | 9 | void complementSeqNumbers(char*, char*, uint); 10 | void convertNucleotidesToNumbers(const char* R0, char* R1, const uint Lread); 11 | uint convertNucleotidesToNumbersRemoveControls(const char* R0, char* R1, const uint Lread); 12 | void revComplementNucleotides(char* ReadsIn, char* 
ReadsOut, uint Lread); //complement the numeric sequences 13 | void revComplementNucleotides(string &seq); 14 | char nuclToNumBAM(char cc); 15 | void nuclPackBAM(char* ReadsIn, char* ReadsOut, uint Lread); 16 | char convertNt01234(const char R0);//transform sequence from ACGT into 0-1-2-3-4 code 17 | 18 | uint chrFind(uint, uint, uint*); // find chromosome from global locus 19 | uint localSearch(const char*, uint, const char*, uint, double); //local search to clip adapter 20 | uint localSearchNisMM(const char *x, uint nx, const char *y, uint ny, double pMM); 21 | uint32 localAlignHammingDist(const string &text, const string &query, uint32 &pos); 22 | uint32 localSearchGeneral(const char *text, const uint32 textL, const vector &query, const int32 textStart, const int32 textEnd, double pMM, vector vecMM, uint32 &nMM); 23 | 24 | uint qualitySplit(char*, uint, uint, uint, uint**); 25 | 26 | int32 convertNuclStrToInt32(const string S, uint32 &intOut); 27 | string convertNuclInt32toString(const uint32 nuclNum, const uint32 L); 28 | 29 | int64 convertNuclStrToInt64(const string S, uint64 &intOut); 30 | string convertNuclInt64toString(const uint64 nuclNum, const uint32 L); 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /source/SoloReadBarcode.h: -------------------------------------------------------------------------------- 1 | #ifndef H_SoloReadBarcode 2 | #define H_SoloReadBarcode 3 | #include 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | #include "SoloReadBarcodeStats.h" 7 | 8 | class SoloReadBarcode { 9 | private: 10 | Parameters &P; 11 | 12 | public: 13 | ParametersSolo &pSolo; 14 | 15 | 16 | uint32 homoPolymer[4];//homopolymer constants 17 | string cbSeq, umiSeq, cbQual, umiQual, bSeq, bQual; 18 | vector bStrings; //barcode strings from SAM tags 19 | string cbSeqCorrected; 20 | uint64 umiB; 21 | //int64 cbI; 22 | int32 cbMatch;//-1: no match, 0: exact, 1: 1 match with 1MM, >1: # of matches with 
1MM 23 | int32 umiCheck;//umi check status 24 | string cbMatchString;//CB matches and qualities 25 | vector cbMatchInd;//matches 26 | vector cbReadCountExact; 27 | //map cbReadCountMap;//count read per CB for no WL 28 | 29 | array qualHist; 30 | SoloReadBarcodeStats stats; 31 | 32 | SoloReadBarcode(Parameters &Pin); 33 | void getCBandUMI(char **readSeq, char **readQual, uint64 *readLen, const string &readNameExtraIn, const uint32 &readFilesIndex, const char *readName); 34 | void addCounts(const SoloReadBarcode &rfIn); 35 | void addStats(const SoloReadBarcode &rfIn); 36 | void statsOut(ofstream &streamOut); 37 | void matchCBtoWL(string &cbSeq1, string &cbQual1, vector &cbWL, int32 &cbMatch1, vector &cbMatchInd1, string &cbMatchString1); 38 | bool convertCheckUMI(); 39 | void addStats(const int32 cbMatch1); 40 | 41 | 42 | protected: 43 | void readNameExtra(); 44 | }; 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /source/Stats.h: -------------------------------------------------------------------------------- 1 | #ifndef STATS_DEF 2 | #define STATS_DEF 3 | 4 | #include "IncludeDefine.h" 5 | #include "Transcript.h" 6 | #include "Parameters.h" 7 | 8 | 9 | class Stats { 10 | public: 11 | uint readN;//number of reads from the file 12 | uint readBases;//number of input bases 13 | // uint mateLmax[2], mateLmin[2];//mates' max and min length 14 | 15 | uint mappedReadsU, mappedReadsM; 16 | uint mappedBases, mappedMismatchesN, mappedInsN, mappedDelN, mappedInsL, mappedDelL; 17 | double mappedPortion; //portion of the read length that has been mapped 18 | 19 | uint splicesN[SJ_MOTIF_SIZE];//non-can,3*can,annotated 20 | uint splicesNsjdb; 21 | 22 | uint unmappedOther, unmappedShort, unmappedMismatch, unmappedMulti, unmappedAll; 23 | 24 | uint chimericAll; 25 | 26 | time_t timeStart, timeStartMap, timeFinishMap, timeLastReport, timeFinish; 27 | 28 | Stats (); 29 | void resetN(); 30 | void printShort(ostream*); 31 | void 
transcriptStats(Transcript &T, uint Lread); 32 | void addStats(Stats &S); 33 | void progressReportHeader(ofstream &progressStream); 34 | void progressReport(ofstream &progressStream) ; 35 | void reportFinal(ofstream &streamOut); 36 | void writeLines(ofstream &streamOut, const vector outType, const string commStr, const string outStr);// write commented lines to text files with stats 37 | 38 | void qualHistCalc(const uint64 imate, const char* qual, const uint64 len); 39 | //void qualHistCalcSolo(const uint64 imate, const char* qual, const vector stlen); 40 | }; 41 | #endif 42 | -------------------------------------------------------------------------------- /source/htslib/htslib/tbx.h: -------------------------------------------------------------------------------- 1 | #ifndef TBX_H 2 | #define TBX_H 3 | 4 | #include "hts.h" 5 | 6 | #define TBX_MAX_SHIFT 31 7 | 8 | #define TBX_GENERIC 0 9 | #define TBX_SAM 1 10 | #define TBX_VCF 2 11 | #define TBX_UCSC 0x10000 12 | 13 | typedef struct { 14 | int32_t preset; 15 | int32_t sc, bc, ec; // seq col., beg col. and end col. 
16 | int32_t meta_char, line_skip; 17 | } tbx_conf_t; 18 | 19 | typedef struct { 20 | tbx_conf_t conf; 21 | hts_idx_t *idx; 22 | void *dict; 23 | } tbx_t; 24 | 25 | extern tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf; 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | #define tbx_itr_destroy(iter) hts_itr_destroy(iter) 32 | #define tbx_itr_queryi(tbx, tid, beg, end) hts_itr_query((tbx)->idx, (tid), (beg), (end), tbx_readrec) 33 | #define tbx_itr_querys(tbx, s) hts_itr_querys((tbx)->idx, (s), (hts_name2id_f)(tbx_name2id), (tbx), hts_itr_query, tbx_readrec) 34 | #define tbx_itr_next(htsfp, tbx, itr, r) hts_itr_next(hts_get_bgzfp(htsfp), (itr), (r), (tbx)) 35 | #define tbx_bgzf_itr_next(bgzfp, tbx, itr, r) hts_itr_next((bgzfp), (itr), (r), (tbx)) 36 | 37 | int tbx_name2id(tbx_t *tbx, const char *ss); 38 | 39 | /* Internal helper function used by tbx_itr_next() */ 40 | BGZF *hts_get_bgzfp(htsFile *fp); 41 | int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end); 42 | 43 | int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf); 44 | tbx_t *tbx_index_load(const char *fn); 45 | const char **tbx_seqnames(tbx_t *tbx, int *n); // free the array but not the values 46 | void tbx_destroy(tbx_t *tbx); 47 | 48 | #ifdef __cplusplus 49 | } 50 | #endif 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /source/htslib/cram/md5.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | /* 6 | * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. 7 | * MD5 Message-Digest Algorithm (RFC 1321). 
8 | * 9 | * Homepage: 10 | * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 11 | * 12 | * Author: 13 | * Alexander Peslyak, better known as Solar Designer 14 | * 15 | * This software was written by Alexander Peslyak in 2001. No copyright is 16 | * claimed, and the software is hereby placed in the public domain. 17 | * In case this attempt to disclaim copyright and place the software in the 18 | * public domain is deemed null and void, then the software is 19 | * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the 20 | * general public under the following terms: 21 | * 22 | * Redistribution and use in source and binary forms, with or without 23 | * modification, are permitted. 24 | * 25 | * There's ABSOLUTELY NO WARRANTY, express or implied. 26 | * 27 | * See md5.c for more information. 28 | */ 29 | 30 | #ifdef HAVE_OPENSSL 31 | #include 32 | #elif !defined(_MD5_H) 33 | #define _MD5_H 34 | 35 | /* Any 32-bit or wider unsigned integer data type will do */ 36 | typedef unsigned int MD5_u32plus; 37 | 38 | typedef struct { 39 | MD5_u32plus lo, hi; 40 | MD5_u32plus a, b, c, d; 41 | unsigned char buffer[64]; 42 | MD5_u32plus block[16]; 43 | } MD5_CTX; 44 | 45 | extern void MD5_Init(MD5_CTX *ctx); 46 | extern void MD5_Update(MD5_CTX *ctx, void *data, unsigned long size); 47 | extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); 48 | 49 | #endif 50 | 51 | #ifdef __cplusplus 52 | } 53 | #endif 54 | 55 | -------------------------------------------------------------------------------- /source/ReadAlignChunk.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_ReadAlignChunk 2 | #define CODE_ReadAlignChunk 3 | 4 | #include "IncludeDefine.h" 5 | #include "Parameters.h" 6 | #include "ReadAlign.h" 7 | #include "OutSJ.h" 8 | #include "Transcriptome.h" 9 | #include "BAMoutput.h" 10 | #include "Quantifications.h" 11 | 12 | class ReadAlignChunk {//chunk of reads and alignments 13 | 
public: 14 | Parameters& P; 15 | ReadAlign* RA; 16 | 17 | Transcriptome *chunkTr; 18 | 19 | char **chunkIn; //space for the chunk of input reads 20 | array chunkInSizeBytesTotal; 21 | 22 | char *chunkOutBAM, *chunkOutBAM1;//space for the chunk of output SAM 23 | OutSJ *chunkOutSJ, *chunkOutSJ1; 24 | 25 | BAMoutput *chunkOutBAMcoord, *chunkOutBAMunsorted, *chunkOutBAMquant; 26 | Quantifications *chunkQuants; 27 | 28 | istringstream** readInStream; 29 | ostringstream* chunkOutBAMstream; 30 | ofstream chunkOutBAMfile; 31 | string chunkOutBAMfileName; 32 | 33 | bool noReadsLeft; 34 | uint iChunkIn; //current chunk # as read from .fastq 35 | uint iChunkOutSAM; //current chunk # writtedn to Aligned.out.sam 36 | int iThread; //current thread 37 | uint chunkOutBAMtotal; //total number of bytes in the write buffer 38 | 39 | ReadAlignChunk(Parameters& Pin, Genome &genomeIn, Transcriptome *TrIn, int iChunk); 40 | void processChunks(); 41 | void mapChunk(); 42 | void chunkFstreamOpen(string filePrefix, int iChunk, fstream &fstreamOut); 43 | void chunkFstreamCat (fstream &chunkOut, ofstream &allOut, bool mutexFlag, pthread_mutex_t &mutexVal); 44 | void chunkFilesCat(ostream *allOut, string filePrefix, uint &iC); 45 | 46 | Genome &mapGen; 47 | private: 48 | }; 49 | #endif 50 | -------------------------------------------------------------------------------- /source/SoloReadFeatureStats.h: -------------------------------------------------------------------------------- 1 | #ifndef H_SoloReadFeatureStats 2 | #define H_SoloReadFeatureStats 3 | #include "IncludeDefine.h" 4 | 5 | class SoloReadFeatureStats { 6 | public: 7 | vector names; 8 | enum { noUnmapped, noNoFeature, MultiFeature, subMultiFeatureMultiGenomic, noTooManyWLmatches, noMMtoWLwithoutExact, yesWLmatch, yessubWLmatchExact, yessubWLmatch_UniqueFeature, yesCellBarcodes, yesUMIs, nStats}; 9 | uint64 V[nStats]; 10 | SoloReadFeatureStats() 11 | { 12 | 
names={"noUnmapped","noNoFeature","MultiFeature","subMultiFeatureMultiGenomic","noTooManyWLmatches","noMMtoWLwithoutExact","yesWLmatch","yessubWLmatchExact","yessubWLmatch_UniqueFeature","yesCellBarcodes","yesUMIs"}; 13 | for (uint32 ii=0; ii 6 | 7 | class Parameters; 8 | 9 | class ParametersGenome {//"constant" genome parameters - user input 10 | public: 11 | string gDir; 12 | string gLoad; 13 | 14 | uint32 gType;//type code 15 | string gTypeString; 16 | 17 | vector gFastaFiles; 18 | vector gChainFiles; 19 | //string gConsensusFile; DEPRECATED 20 | 21 | struct { 22 | int32 type; 23 | string typeString; 24 | string vcfFile; 25 | vector output; //which output to transform 26 | bool outYes, outSAM, outSJ, outQuant; 27 | } transform; 28 | 29 | uint gSAindexNbases;//length of the SA pre-index strings 30 | uint gChrBinNbits; 31 | uint gSAsparseD;//SA sparsity 32 | uint gSuffixLengthMax;//maximum length of the suffixes, has to be longer than read length 33 | vector gFileSizes;//size of the genome files 34 | 35 | vector sjdbFileChrStartEnd; 36 | string sjdbGTFfile; 37 | string sjdbGTFchrPrefix; 38 | 39 | string sjdbGTFfeatureExon; 40 | string sjdbGTFtagExonParentTranscript; 41 | string sjdbGTFtagExonParentGene; 42 | vector sjdbGTFtagExonParentGeneName; 43 | vector sjdbGTFtagExonParentGeneType; 44 | 45 | string sjdbInsertSave; 46 | uint sjdbOverhang; 47 | int sjdbOverhang_par; 48 | int sjdbScore; 49 | 50 | struct { 51 | vector mitoStrings; 52 | unordered_set mito; 53 | } chrSet; 54 | 55 | 56 | void initialize(Parameters *Pin); 57 | 58 | private: 59 | Parameters *pP; 60 | 61 | }; 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /source/htslib/sam.5: -------------------------------------------------------------------------------- 1 | '\" t 2 | .TH sam 5 "August 2013" "htslib" "Bioinformatics formats" 3 | .SH NAME 4 | sam \- Sequence Alignment/Map file format 5 | .SH DESCRIPTION 6 | Sequence Alignment/Map (SAM) format is 
TAB-delimited. Apart from the header lines, which are started 7 | with the `@' symbol, each alignment line consists of: 8 | .TS 9 | nlbl. 10 | 1 QNAME Query template/pair NAME 11 | 2 FLAG bitwise FLAG 12 | 3 RNAME Reference sequence NAME 13 | 4 POS 1-based leftmost POSition/coordinate of clipped sequence 14 | 5 MAPQ MAPping Quality (Phred-scaled) 15 | 6 CIGAR extended CIGAR string 16 | 7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) 17 | 8 MPOS 1-based Mate POSition 18 | 9 TLEN inferred Template LENgth (insert size) 19 | 10 SEQ query SEQuence on the same strand as the reference 20 | 11 QUAL query QUALity (ASCII-33 gives the Phred base quality) 21 | 12+ OPT variable OPTional fields in the format TAG:VTYPE:VALUE 22 | .TE 23 | .PP 24 | Each bit in the FLAG field is defined as: 25 | .TS 26 | lcbl. 27 | 0x0001 p the read is paired in sequencing 28 | 0x0002 P the read is mapped in a proper pair 29 | 0x0004 u the query sequence itself is unmapped 30 | 0x0008 U the mate is unmapped 31 | 0x0010 r strand of the query (1 for reverse) 32 | 0x0020 R strand of the mate 33 | 0x0040 1 the read is the first read in a pair 34 | 0x0080 2 the read is the second read in a pair 35 | 0x0100 s the alignment is not primary 36 | 0x0200 f the read fails platform/vendor quality checks 37 | 0x0400 d the read is either a PCR or an optical duplicate 38 | 0x0800 S the alignment is supplementary 39 | .TE 40 | .P 41 | where the second column gives the string representation of the FLAG field.
42 | .SH SEE ALSO 43 | .TP 44 | https://github.com/samtools/hts-specs 45 | The full SAM/BAM file format specification 46 | -------------------------------------------------------------------------------- /source/Transcript.cpp: -------------------------------------------------------------------------------- 1 | #include "Transcript.h" 2 | 3 | Transcript::Transcript() 4 | { 5 | reset(); 6 | }; 7 | 8 | void Transcript::reset() { 9 | extendL=0; 10 | 11 | // for (uint ii=0;ii<4;ii++) { 12 | // polyXlength[ii]=0; 13 | // polyXnMM[ii]=0; 14 | // }; 15 | primaryFlag=false; 16 | 17 | rStart=0; roStart=0; rLength=0; gStart=0; gLength=0; //read and genomic coordinates 18 | 19 | maxScore=0; 20 | nMatch=0; 21 | nMM=0; 22 | 23 | nGap=0; lGap=0; lDel=0; lIns=0; nDel=0; nIns=0; 24 | 25 | nUnique=nAnchor=0; 26 | }; 27 | 28 | void Transcript::add(Transcript *trIn) { 29 | maxScore+=trIn->maxScore; 30 | nMatch+=trIn->nMatch; 31 | nMM+=trIn->nMM; 32 | nGap+=trIn->nGap; lGap+=trIn->lGap; 33 | lDel+=trIn->lDel; nDel+=trIn->nDel; 34 | lIns+=trIn->lIns; nIns+=trIn->nIns; 35 | nUnique+=trIn->nUnique; 36 | }; 37 | 38 | void Transcript::extractSpliceJunctions(vector> &sjOut, bool &annotYes) 39 | { 40 | annotYes=true; 41 | for (uint64 iex=0; iex=0) {//only record junctions, not indels or mate gap 43 | array sj; 44 | sj[0]=exons[iex][EX_G]+exons[iex][EX_L];//start 45 | sj[1]=exons[iex+1][EX_G] - sj[0]; //gap 46 | sjOut.push_back(sj); 47 | if (sjAnnot[iex]==0) 48 | annotYes=false;//if one of the SJs is unannoated, annotYes=false 49 | }; 50 | }; 51 | }; 52 | 53 | uint64 Transcript::chrStartLengthExtended() 54 | { 55 | uint64 start1 = cStart - exons[0][EX_R]; 56 | uint64 length1 = exons[nExons-1][EX_G] + Lread - exons[nExons-1][EX_R] - exons[0][EX_G] + exons[0][EX_R]; 57 | 58 | return (start1 << 32) | length1; 59 | }; 60 | -------------------------------------------------------------------------------- /extras/tests/scripts/checkCellReadsStats_vsMatrix.awk: 
# Cross-checks the per-cell numbers in CellReads.stats against count matrices.
#
# usage: awk -f checkCellReadsStats_vsMatrix.awk CellReads.stats barcodes.tsv matrix.mtx [2nd matrix.mtx]
#
# gawk is required (ARGIND).
#
# 3rd file: compared with the nGenesUnique / nUMIunique columns
# 4th file (optional): compared with nGenesUnique+nGenesMulti / nUMIunique+nUMImulti
#
# Output (tab-separated):
#   number of cells recorded from CellReads.stats, number of cells found in the matrix
#   "G" cell stats matrix : number of genes differs
#   "U" cell stats matrix : number of UMIs differs
# and the same again for the totals if a 4th file was given.

function funAbs(x) {
    if (x>0)
        return x;
    return -x;
}

BEGIN {
    OFS="\t";
}

ARGIND==1 && NR==1 {# header of CellReads.stats: column number <-> column name
    for (col=1; col<=NF; col++) {
        fn[col]=$col;
        fi[$col]=col;
    };
    next;
}

ARGIND==1 {# one cell per line, the barcode is in the 1st column
    umiUniq=$(fi["nUMIunique"]);
    genesUniq=$(fi["nGenesUnique"]);
    umiMult=$(fi["nUMImulti"]);
    genesMult=$(fi["nGenesMulti"]);

    if (umiUniq>0) {# only cells with unique UMIs are recorded
        nGenes[$1]=genesUniq;
        nUMI[$1]=umiUniq;
    };
    if (umiUniq+umiMult>0) {# only cells with any UMIs are recorded
        nGenesTot[$1]=genesUniq+genesMult;
        nUMItot[$1]=umiUniq+umiMult;
    };
}

ARGIND==2 {#barcodes.tsv: line number -> barcode
    CB[FNR]=$1;
}

ARGIND==3 && FNR>3 {#matrix mtx, the first 3 lines are skipped
    matGenes[CB[$2]]++;
    matUMI[CB[$2]]+=$3;
}

ARGIND==4 && FNR>3 {#2nd matrix mtx, the first 3 lines are skipped
    matGenesTot[CB[$2]]++;
    matUMItot[CB[$2]]+=$3;
}

END {
    print length(nGenes), length(matGenes);

    for (cb in matGenes) {
        if (nGenes[cb]!=matGenes[cb])
            print "G",cb,nGenes[cb]+0,matGenes[cb]+0;
        if (nUMI[cb]!=matUMI[cb])
            print "U",cb,nUMI[cb]+0,matUMI[cb]+0;
    };

    if (ARGC<=4) # no 4th file: nothing else to compare
        exit;

    print length(nGenesTot), length(matGenesTot);

    for (cb in matGenesTot) {
        if (nGenesTot[cb]!=matGenesTot[cb])
            print "G",cb,nGenesTot[cb]+0,matGenesTot[cb]+0;
        if (funAbs(nUMItot[cb]-matUMItot[cb])>0.0001) # tolerance-based comparison for the UMI totals
            print "U",cb,nUMItot[cb]+0,matUMItot[cb]+0;
    };
}
uint startP=0; 10 | const static uint gapP=startP+sizeof(uint); 11 | const static uint strandP=gapP+sizeof(uint32); 12 | const static uint motifP=strandP+sizeof(char); 13 | const static uint annotP=motifP+sizeof(char); 14 | const static uint countUniqueP=annotP+sizeof(char); 15 | const static uint countMultipleP=countUniqueP+sizeof(uint32); 16 | const static uint overhangLeftP=countMultipleP+sizeof(uint32); 17 | const static uint overhangRightP=overhangLeftP+sizeof(uint16); 18 | 19 | uint *start; 20 | uint32 *gap; 21 | char *strand, *motif, *annot; 22 | uint32 *countUnique, *countMultiple; 23 | uint16 *overhangLeft, *overhangRight; 24 | 25 | const static uint dataSize=overhangRightP+sizeof(uint16); 26 | 27 | Junction(Genome &genomeIn); 28 | void junctionPointer(char* sjPoint, uint isj); 29 | void outputStream(ostream &outStream); 30 | void collapseOneSJ(char* isj1P, char* isjP, Parameters& P); 31 | 32 | private: 33 | Genome &genOut; 34 | }; 35 | 36 | class OutSJ { 37 | 38 | public: 39 | //all junctions 40 | char* data; //sj array[Njunctions][dataSize] 41 | vector dataVec; 42 | uint64 N, Nstore; //N=number of junctions stored; Nstore=storage size 43 | Junction oneSJ; 44 | 45 | OutSJ(uint64 nSJmax, Parameters &Pin, Genome &genomeIn); 46 | void collapseSJ();//collapse the junctions in data 47 | // int compareSJ(void* i1, void* i2); 48 | 49 | void dataSizeIncrease(); 50 | 51 | private: 52 | Parameters &P; 53 | Genome &genOut; 54 | }; 55 | 56 | int compareSJ(const void* i1, const void* i2); //external functions 57 | 58 | #endif 59 | 60 | -------------------------------------------------------------------------------- /source/mapThreadsSpawn.cpp: -------------------------------------------------------------------------------- 1 | #include "mapThreadsSpawn.h" 2 | #include "ThreadControl.h" 3 | #include "GlobalVariables.h" 4 | #include "ErrorWarning.h" 5 | 6 | void mapThreadsSpawn (Parameters &P, ReadAlignChunk** RAchunk) { 7 | for (int ithread=1;ithread0) {//something 
went wrong with one of threads 10 | ostringstream errOut; 11 | errOut << "EXITING because of FATAL ERROR: phtread error while creating thread # " << ithread <<", error code: "<logMain, 1, P); 13 | }; 14 | pthread_mutex_lock(&g_threadChunks.mutexLogMain); 15 | P.inOut->logMain << "Created thread # " <processChunks(); //start main thread 20 | 21 | for (int ithread=1;ithread0) {//something went wrong with one of threads 24 | ostringstream errOut; 25 | errOut << "EXITING because of FATAL ERROR: phtread error while joining thread # " << ithread <<", error code: "<logMain, 1, P); 27 | }; 28 | pthread_mutex_lock(&g_threadChunks.mutexLogMain); 29 | P.inOut->logMain << "Joined thread # " < fSet; //set of genes for this read 11 | vector> fAlign; //gene for each alignment of this read 12 | uint32 ovType; 13 | enum overlapTypes {none, exonic, exonicAS, exonic50p, exonic50pAS, intronic, intronicAS, intergenic, N}; 14 | 15 | void reset() { 16 | fSet={}; fAlign={}; ovType = 0; 17 | }; 18 | }; 19 | 20 | class ReadAnnotations { 21 | public: 22 | //set geneFull, geneFull_Ex50pAS, geneFull_ExonOverIntron, geneConcordant; 23 | //uint32 geneFullTr, geneFull_Ex50pAS_Tr, geneFull_ExonOverIntron_Tr, geneConcordantTr; //index of the annotated align - for multimappers that aligned to one gene only 24 | 25 | //vector geneFull_Al, geneFull_Ex50pAS_Al, geneFull_ExonOverIntron_Al, geneConcordant_Al; //gene for each align 26 | 27 | array annotFeatures; 28 | 29 | vector> transcriptConcordant; 30 | vector geneExonOverlap; 31 | array geneVelocytoSimple;//first element is gene, then counts of transcript types 32 | vector trVelocytoType;//first element is gene, then counts of transcript types 33 | 34 | //vector> sj; 35 | //bool sjAnnot; 36 | 37 | void reset () { 38 | transcriptConcordant = {}; 39 | trVelocytoType = {}; 40 | for (auto &raf : annotFeatures) 41 | raf.reset(); 42 | }; 43 | }; 44 | 45 | #endif 46 | 47 | 48 | -------------------------------------------------------------------------------- 
/source/ChimericSegment.cpp: -------------------------------------------------------------------------------- 1 | #include "ChimericSegment.h" 2 | 3 | ChimericSegment::ChimericSegment(Parameters &Pin, Transcript &alignIn) : P(Pin), pCh(Pin.pCh), align(alignIn) 4 | { 5 | if ( (align.intronMotifs[1]==0 && align.intronMotifs[2]==0) || (align.intronMotifs[1]>0 && align.intronMotifs[2]>0)) {//strand is undefined 6 | str=0; 7 | } else if ( (align.Str==0) == (align.intronMotifs[1]>0)) {//strand the same as RNA. 8 | //This assumes that the aligns have consistent strands, i.e. only intronMotifs[1]>0 OR intronMotifs[2]>0 9 | str=1; 10 | } else {//strand opposite to RNA 11 | str=2; 12 | }; 13 | roS=align.Str==0 ? align.exons[0][EX_R] : align.Lread - align.exons[align.nExons-1][EX_R] - align.exons[align.nExons-1][EX_L]; 14 | roE=align.Str==0 ? align.exons[align.nExons-1][EX_R] + align.exons[align.nExons-1][EX_L] - 1 : align.Lread - align.exons[0][EX_R] - 1; 15 | if (roS>align.readLength[0]) roS--; 16 | if (roE>align.readLength[0]) roE--; 17 | }; 18 | 19 | bool ChimericSegment::segmentCheck() 20 | { 21 | bool segGood = true; 22 | segGood = segGood && align.rLength >= pCh.segmentMin; //mapped length >= chim segmentMin 23 | segGood = segGood && align.intronMotifs[0]==0; //no non-canonical unannotated juncions. 24 | return segGood; 25 | 26 | //this is already tested for each align with default --outFilterIntronStrands RemoveInconsistentStrands 27 | //segGood = segGood && (align.intronMotifs[1]==0 || align.intronMotifs[2]==0); //consistent intron motifs. 
28 | //this is not requiered since seg2 is tested for length 29 | // segGood = segGood && (align.exons[align.nExons-1][EX_R] + align.exons[align.nExons-1][EX_L] + P.pCh.segmentMin <= Lread 30 | // || align.exons[0][EX_R] >= P.pCh.segmentMin); //uncovered by seg1 read length is <= segmentMin 31 | 32 | }; -------------------------------------------------------------------------------- /source/ChimericAlign_chimericJunctionOutput.cpp: -------------------------------------------------------------------------------- 1 | #include "ChimericAlign.h" 2 | #include "ReadAlign.h" 3 | 4 | void ChimericAlign::chimericJunctionOutput(fstream &outStream, uint chimN, int maxNonChimAlignScore, bool PEmerged_flag, int chimScoreBest, int maxPossibleAlignScore) 5 | { 6 | outStream << mapGen.chrName[al1->Chr] <<"\t"<< chimJ1 - mapGen.chrStart[al1->Chr]+1 <<"\t"<< (al1->Str==0 ? "+":"-") \ 7 | <<"\t"<< mapGen.chrName[al2->Chr] <<"\t"<< chimJ2 - mapGen.chrStart[al2->Chr]+1 <<"\t"<< (al2->Str==0 ? "+":"-") \ 8 | <<"\t"<< chimMotif <<"\t"<< chimRepeat1 <<"\t"<< chimRepeat2 <<"\t"<< al1->readName+1 \ 9 | <<"\t"<< al1->exons[0][EX_G] - mapGen.chrStart[al1->Chr]+1 <<"\t"<< al1->generateCigarP() \ 10 | <<"\t"<< al2->exons[0][EX_G] - mapGen.chrStart[al2->Chr]+1 <<"\t"<< al2->generateCigarP() 11 | <<"\t"<< chimN // number of multimapping chimeric alignments for this read. 12 | << "\t" << maxPossibleAlignScore // the maximum possible alignment score (currently the sum of the (paired) read lengths) 13 | << "\t" << maxNonChimAlignScore // trBest - the best alignment score from a non-chimeric alignment of this read to the ref genome. 14 | << "\t" << chimScore // current chimeric alignment score 15 | << "\t" << chimScoreBest // best chimeric score among multimapping chimeric alignments. 16 | << "\t" << PEmerged_flag; // boolean indicating paired reads were merged into a single read before alignment & chimer detection. 
17 | 18 | if (P.outSAMattrPresent.RG) 19 | outStream <<"\t"<< P.outSAMattrRG.at(RA->readFilesIndex); 20 | if (P.pSolo.type>0) 21 | outStream <<"\t"<< RA->soloRead->readBar->cbSeq <<"\t"<< RA->soloRead->readBar->umiSeq; 22 | outStream <<"\n"; //<<"\t"<< trChim[0].exons[0][EX_iFrag]+1 --- no need for that, since trChim[0] is always on the first mate 23 | }; 24 | -------------------------------------------------------------------------------- /source/SpliceGraph.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Created by Fahimeh Mirhaj on 6/14/19. 3 | */ 4 | 5 | #ifndef H_SpliceGraph 6 | #define H_SpliceGraph 7 | #include "GTF.h" 8 | #include "Transcript.h" 9 | 10 | class ReadAlign; 11 | 12 | class SpliceGraph { 13 | public: 14 | typedef int32 typeAlignScore; 15 | typedef uint32 typeSeqLen; 16 | 17 | SuperTranscriptome &superTrome; 18 | 19 | //vector> readAndSuperTranscript; 20 | const static typeSeqLen maxSeqLength = 100000;//make user parameter? 
21 | typeAlignScore **scoringMatrix, *scoreTwoColumns[2]; 22 | const static typeSeqLen maxMatrixSize = 1000000000; //1B elements in the matrix for now 23 | uint8 directionMatrix[maxMatrixSize]; 24 | uint32 *sjDindex; 25 | 26 | int8_t gapPenalty = -1; 27 | int8_t matchScore = 1; 28 | int8_t misMatchPenalty = -1; 29 | 30 | //seed-and-rank 31 | typedef uint16 typeSuperTrSeedCount; 32 | typeSuperTrSeedCount *superTrSeedCount; 33 | 34 | //output 35 | struct { 36 | uint32 nMap, nMM, nI, nD, nSJ; 37 | array aStart, aEnd; 38 | } alignInfo; 39 | //vector> blockCoord; 40 | //vector blockSJ; 41 | //vector rowCol; 42 | //vector> rowSJ; 43 | 44 | 45 | 46 | SpliceGraph(SuperTranscriptome &superTrome, Parameters &P, ReadAlign *RA); 47 | ~SpliceGraph(); 48 | 49 | typeAlignScore swScoreSpliced(const char *readSeq, const uint32 readLen, const SuperTranscript &superTr, vector> &cigar); 50 | //void swTraceBack(array &alignEnds, array &alignStarts); 51 | void findSuperTr(const char *readSeq, const char *readSeqRevCompl, const uint32 readLen, const string &readName, Genome &mapGen); 52 | 53 | private: 54 | Parameters &P; 55 | ReadAlign *RA; 56 | }; 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /source/htslib/htslib/hts_defs.h: -------------------------------------------------------------------------------- 1 | /* hts_defs.h -- Miscellaneous definitions. 2 | 3 | Copyright (C) 2013-2014 Genome Research Ltd. 
4 | 5 | Author: John Marshall 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notices and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 
*/ 24 | 25 | #ifndef HTSLIB_HTS_DEFS_H 26 | #define HTSLIB_HTS_DEFS_H 27 | /* Clang reports its version in __clang_major__; a macro that is not defined evaluates to 0 inside #if. */ 28 | #if __clang_major__ >= 2 || __GNUC__ >= 3 29 | #define HTS_NORETURN __attribute__ ((__noreturn__)) 30 | #else 31 | #define HTS_NORETURN 32 | #endif 33 | 34 | #if (defined __clang__ && __clang_major__ >= 3) || \ 35 | (defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__==4 && __GNUC_MINOR__ >= 5))) 36 | #define HTS_RESULT_USED __attribute__ ((__warn_unused_result__)) 37 | #else 38 | #define HTS_RESULT_USED 39 | #endif 40 | 41 | #if defined __clang__ || defined __GNUC__ 42 | #define HTS_UNUSED __attribute__ ((__unused__)) 43 | #else 44 | #define HTS_UNUSED 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /source/htslib/htslib/knetfile.h: -------------------------------------------------------------------------------- 1 | #ifndef KNETFILE_H 2 | #define KNETFILE_H 3 | 4 | #include 5 | #include 6 | 7 | #ifndef _WIN32 8 | #define netread(fd, ptr, len) read(fd, ptr, len) 9 | #define netwrite(fd, ptr, len) write(fd, ptr, len) 10 | #define netclose(fd) close(fd) 11 | #else 12 | #include 13 | #define netread(fd, ptr, len) recv(fd, ptr, len, 0) 14 | #define netwrite(fd, ptr, len) send(fd, ptr, len, 0) 15 | #define netclose(fd) closesocket(fd) 16 | #endif 17 | 18 | // FIXME: currently I/O is unbuffered 19 | 20 | #define KNF_TYPE_LOCAL 1 21 | #define KNF_TYPE_FTP 2 22 | #define KNF_TYPE_HTTP 3 23 | 24 | typedef struct knetFile_s { 25 | int type, fd; 26 | int64_t offset; 27 | char *host, *port; 28 | 29 | // the following are for FTP only 30 | int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; 31 | char *response, *retr, *size_cmd; 32 | int64_t seek_offset; // for lazy seek 33 | int64_t file_size; 34 | 35 | // the following are for HTTP only 36 | char *path, *http_host; 37 | } knetFile; 38 | 39 | #define knet_tell(fp) ((fp)->offset) 40 | #define knet_fileno(fp) ((fp)->fd) 41 | 42 | #ifdef __cplusplus 43 | extern "C" {
44 | #endif 45 | 46 | #ifdef _WIN32 47 | int knet_win32_init(); 48 | void knet_win32_destroy(); 49 | #endif 50 | 51 | knetFile *knet_open(const char *fn, const char *mode); 52 | 53 | /* 54 | This only works with local files. 55 | */ 56 | knetFile *knet_dopen(int fd, const char *mode); 57 | 58 | /* 59 | If ->is_ready==0, this routine updates ->fd; otherwise, it simply 60 | reads from ->fd. 61 | */ 62 | ssize_t knet_read(knetFile *fp, void *buf, size_t len); 63 | 64 | /* 65 | This routine only sets ->offset and ->is_ready=0. It does not 66 | communicate with the FTP server. 67 | */ 68 | off_t knet_seek(knetFile *fp, off_t off, int whence); 69 | int knet_close(knetFile *fp); 70 | 71 | #ifdef __cplusplus 72 | } 73 | #endif 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /source/Parameters_readSAMheader.cpp: -------------------------------------------------------------------------------- 1 | #include "Parameters.h" 2 | #include "ErrorWarning.h" 3 | #include 4 | #include 5 | 6 | void Parameters::readSAMheader(const string readFilesCommandString, const vector readFilesNames) { 7 | 8 | if (readFilesCommandString=="") {//simply read from file 9 | while (inOut->readIn[0].peek()=='@') { 10 | string str1; 11 | getline(inOut->readIn[0],str1); 12 | if (str1.substr(1,2)!="HD" && str1.substr(1,2)!="SQ") { 13 | samHeaderExtra += str1 + '\n'; 14 | }; 15 | }; 16 | return; 17 | }; 18 | 19 | string tmpFifo=outFileTmp+"tmp.fifo.header"; 20 | remove(tmpFifo.c_str()); 21 | if (mkfifo(tmpFifo.c_str(), S_IRUSR | S_IWUSR ) != 0) { 22 | exitWithError("Exiting because of *FATAL ERROR*: could not create FIFO file " + tmpFifo + "\n" 23 | + "SOLUTION: check the if run directory supports FIFO files.\n" 24 | + "If run partition does not support FIFO (e.g. 
Windows partitions FAT, NTFS), " 25 | + "re-run on a Linux partition, or point --outTmpDir to a Linux partition.\n" 26 | , std::cerr, inOut->logMain, EXIT_CODE_FIFO, *this); 27 | }; 28 | 29 | ifstream tmpFifoIn; 30 | for (uint32 ii=0; ii " + tmpFifo + "&"; 32 | system(com1.c_str()); 33 | tmpFifoIn.open(tmpFifo); 34 | while (tmpFifoIn.peek()=='@') { 35 | string str1; 36 | getline(tmpFifoIn,str1); 37 | if (str1.substr(1,2)!="HD" && str1.substr(1,2)!="SQ" && (!twoPass.pass2) ) { 38 | //SQ and HD header lines cannot be imported from uSAM; do not record the header again in the 2nd pass 39 | samHeaderExtra += str1 + '\n'; 40 | }; 41 | }; 42 | tmpFifoIn.close(); 43 | }; 44 | }; -------------------------------------------------------------------------------- /source/htslib/cram/vlen.h: -------------------------------------------------------------------------------- 1 | /* 2 | Author: James Bonfield (jkb@sanger.ac.uk) 3 | 4 | Copyright (c) 1995-1996 MEDICAL RESEARCH COUNCIL 5 | All rights reserved 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1 Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | 13 | 2 Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | 3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF 18 | MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or 19 | promote products derived from this software without specific prior written 20 | permission. 
21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 23 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 24 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 25 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 26 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 27 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 29 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | 34 | #ifndef _VLEN_H_ 35 | #define _VLEN_H_ 36 | 37 | #ifdef __cplusplus 38 | extern "C" { 39 | #endif 40 | 41 | extern int vflen(char *fmt, va_list ap); 42 | extern int flen(char *fmt, ...); 43 | 44 | #ifdef __cplusplus 45 | } 46 | #endif 47 | 48 | #endif /* _VLEN_H_ */ 49 | -------------------------------------------------------------------------------- /extras/scripts/mergeSuperContig.awk: -------------------------------------------------------------------------------- 1 | ### 2 | # usage 3 | # awk -f mergeSuperContig.awk All.fasta All.gtf 4 | ### 5 | 6 | BEGIN { 7 | ################################ 8 | # these parameters can be changed 9 | 10 | shortL=64000 11 | 12 | fLongOut="Long.out.fasta"; # output file name for FASTA 13 | fShortOut="Short.out.fasta"; # output file name for FASTA 14 | fGtfOut="Annot.out.gtf"; # output file name for GTF 15 | fChrStartOut="ChrStart.tab"; # output file name for chr starts 16 | pN=60; # N-padding length 17 | scName="SC"; # supercontig name 18 | ################################ 19 | 20 | OFS="\t"; 21 | s=0; # sequence start in super-contig 22 | c=""; # current chr name 23 | Seq=""; # chr sequence 24 | 25 | for (ii=1; 
ii<=pN; ii++) P=P "N"; # N-padding string, pN characters long so it matches the s+=l+pN coordinate shift 26 | 27 | print ">" scName > fShortOut; 28 | 29 | while ((getline < ARGV[1]) > 0) # getline returns 0 at EOF and -1 on a read error; stop on both 30 | {# FASTA 31 | if (substr($1,1,1)==">") 32 | {# new chr 33 | chrFinish(); 34 | # start new chr 35 | c=substr($1,2); # chr name 36 | l=0; # chr length 37 | Seq=""; 38 | print c,s > fChrStartOut; 39 | } else 40 | {# collect sequence 41 | Seq=Seq $1; 42 | l+=length($1); 43 | }; 44 | }; 45 | 46 | chrFinish(); 47 | 48 | while ((getline < ARGV[2]) > 0) # same EOF/error handling as for the FASTA file 49 | {# GTF 50 | if ($1 in S) 51 | {# transform coordinates 52 | $4+=S[$1]; 53 | $5+=S[$1]; 54 | $1=scName; 55 | }; 56 | print > fGtfOut; 57 | }; 58 | }; 59 | 60 | function chrFinish() 61 | { 62 | if (l>shortL) 63 | {# long chr 64 | print ">" c "\n" Seq > fLongOut; 65 | } else if (l>0) 66 | {#short chr 67 | printf "%s%s", Seq, P > fShortOut; # explicit format: the sequence is data, not a printf format string 68 | S[c]=s; # chr start in contig 69 | s+=l+pN; 70 | }; 71 | }; 72 | -------------------------------------------------------------------------------- /source/Transcriptome_geneFullAlignOverlap_ExonOverIntron.cpp: -------------------------------------------------------------------------------- 1 | #include "Transcriptome.h" 2 | #include "serviceFuns.cpp" 3 | #include "ReadAnnotations.h" 4 | 5 | void Transcriptome::geneFullAlignOverlap_ExonOverIntron(uint nA, Transcript **aAll, int32 strandType, ReadAnnotFeature &annFeat, ReadAnnotFeature &annFeatGeneConcordant) 6 | { 7 | // annFeat.ovType = 0; 8 | if (annFeatGeneConcordant.fSet.size()>0) {//if concordant genes were found for this read, prioritize them over intronic overlap 9 | annFeat = annFeatGeneConcordant; 10 | annFeat.ovType = ReadAnnotFeature::overlapTypes::exonic; 11 | return; 12 | }; 13 | 14 | //calculate overlap with introns 15 | // annFeat.fSet={}; 16 | // annFeat.fAlign = {}; 17 | 18 | annFeat.fAlign.resize(nA); 19 | for (uint32 iA=0; iA(aS, geneFull.s, (int32) nGe); //search align-start against gene-starts.
Find last gene which still starts to the left of align-start 27 | 28 | while (gi1>=0 && geneFull.eMax[gi1]>=aE) {//these genes may overlap this block 29 | if (geneFull.e[gi1]>=aE) {//this gene contains the block: gene-end is to the right of block start 30 | int32 str1 = geneFull.str[gi1]==1 ? a.Str : 1-a.Str; 31 | if (strandType==-1 || strandType==str1) { 32 | annFeat.fSet.insert(geneFull.g[gi1]); 33 | annFeat.fAlign[iA].insert(geneFull.g[gi1]); 34 | }; 35 | }; 36 | --gi1;// go to the previous gene 37 | }; 38 | }; 39 | if (annFeat.fSet.size()>0) 40 | annFeat.ovType = ReadAnnotFeature::overlapTypes::intronic; 41 | }; 42 | 43 | 44 | -------------------------------------------------------------------------------- /extras/doc-latex/convertParDefToLatexTable.awk: -------------------------------------------------------------------------------- 1 | function substLatexSymbols() { 2 | gsub("\\^","\\^{}",$0); 3 | gsub(">","{\\textgreater}"); 4 | gsub("<","{\\textless}"); 5 | gsub("_","{\\textunderscore}"); 6 | gsub("&","{\\\\&}"); 7 | gsub("%","{\\%}"); 8 | }; 9 | 10 | BEGIN { 11 | nSection=0; 12 | optOptTableEnd="\\end{optOptTable}"; 13 | optOptTableBegin="\\begin{optOptTable}"; 14 | optTableEnd="\\end{optTable}"; 15 | optTableBegin="\\begin{optTable}"; 16 | }; 17 | 18 | { 19 | 20 | if ($0 ~ /^#####UnderDevelopment_begin/) { 21 | while ($0 !~ /^#####UnderDevelopment_end/) { 22 | if ((getline) <= 0) exit; # closing marker missing: stop at end of input instead of looping forever 23 | }; 24 | }; 25 | 26 | substLatexSymbols(); 27 | 28 | if ($1=="###") {# new group/subsection of parameters 29 | if ($2!="versions") {# skip versions 30 | if (nSection>0) print optTableEnd; 31 | sectionName=substr($0,index($0,$2)); 32 | printf "%s", "\\optSection{" sectionName "}"; # explicit format: the section name is data and may contain % 33 | gsub(" ","_",sectionName); 34 | print "\\label{" sectionName "}"; 35 | print optTableBegin; 36 | ++nSection; 37 | }; 38 | } else if ($0!="" && substr($0,1,1)!=" " && substr($1,1,1)!="#" && substr($1,1,7)!="version") {# option name has a letter as the first character 39 | optV=$2; 40 | for (ii=3;ii<=NF;ii++)
optV=optV " " $ii; 41 | print "\\optName{" $1 "}"; 42 | print " \\optValue{" optV "}"; 43 | 44 | getline;substLatexSymbols(); 45 | nOpt=0; 46 | while ($1!="") { 47 | $0=substr($0,match($0,/[^[:space:]]/)); 48 | no=split($0,oo,/[[:space:]]*\.\.\.[[:space:]]*/); 49 | if (no!=2) {# not option line 50 | if (nOpt>0) print optOptTableEnd; 51 | print " \\optLine{" $0 "}" " "; 52 | nOpt=0; 53 | } else { 54 | if (nOpt==0) print optOptTableBegin; 55 | gsub(/^[ \t]+|[ \t]+$/, "",oo[1]); #remove leading trailing spaces 56 | gsub(/^[ \t]+|[ \t]+$/, "",oo[2]); 57 | print " \\optOpt{" oo[1] "} \\optOptLine{" oo[2] "}" ; 58 | ++nOpt; 59 | }; 60 | getline; substLatexSymbols(); 61 | }; 62 | if (nOpt>0) print optOptTableEnd; 63 | }; 64 | 65 | }; 66 | 67 | END { 68 | print optTableEnd; 69 | }; 70 | -------------------------------------------------------------------------------- /source/ReadAlign_chimericDetection.cpp: -------------------------------------------------------------------------------- 1 | #include "IncludeDefine.h" 2 | #include "Parameters.h" 3 | #include "Transcript.h" 4 | #include "ReadAlign.h" 5 | #include "BAMfunctions.h" 6 | #include "blocksOverlap.h" 7 | 8 | //#include "SequenceFuns.h" 9 | //#include "stitchWindowAligns.h" 10 | //#include "sjSplitAlign.cpp" 11 | //#include "PackedArray.h" 12 | //#include "alignSpliceGraph.h" 13 | //#include "GlobalVariables.h" 14 | //#include 15 | 16 | void ReadAlign::chimericDetection() { 17 | 18 | chimRecord=false; 19 | 20 | if (P.pCh.segmentMin==0) {//no chimeric detection requested 21 | return; 22 | }; 23 | if (P.outFilterBySJoutStage>1) {//no chimeric output for stage=2. 
REVISIT: NOT SURE why 24 | return; 25 | }; 26 | 27 | //output chains for out-of-STAR chimeric detection 28 | #ifdef OUTPUT_localChains 29 | { 30 | P.inOut->outLocalChains << readName <<"\t"<< Read0[0] <<"\t"<< Read0[1] << "\n"; 31 | for (uint iw=0; iwoutLocalChains << trAll[iw][itr]->maxScore<<"\t"<< trAll[iw][itr]->Chr<<"\t"<Str<<"\t"<nExons; 34 | for (uint ib=0;ibnExons;ib++) { 35 | P.inOut->outLocalChains <<"\t"<< trAll[iw][itr]->exons[ib][EX_G]-mapGen.chrStart[trAll[iw][itr]->Chr] \ 36 | <<"\t"<< trAll[iw][itr]->exons[ib][EX_R] <<"\t"<< trAll[iw][itr]->exons[ib][EX_L]; 37 | }; 38 | P.inOut->outLocalChains <<"\n"; 39 | }; 40 | }; 41 | }; 42 | #endif 43 | 44 | 45 | if (P.pCh.multimapNmax==0) { 46 | chimRecord=chimericDetectionOld(); 47 | chimericDetectionOldOutput(); 48 | } else if (trBest->maxScore <= (int) (readLength[0]+readLength[1]) - (int) P.pCh.nonchimScoreDropMin) {//require big enough drop in the best score 49 | chimRecord=chimDet->chimericDetectionMult(nW, readLength, trBest->maxScore, NULL); 50 | }; 51 | 52 | if ( chimRecord ) { 53 | statsRA.chimericAll++; 54 | }; 55 | 56 | return; 57 | };//END 58 | -------------------------------------------------------------------------------- /source/ClipMate_clipChunk.cpp: -------------------------------------------------------------------------------- 1 | #include "ParametersClip.h" 2 | #include "Parameters.h" 3 | #include "SequenceFuns.h" 4 | 5 | inline char* findChar(char *arr, char c); 6 | 7 | void ClipMate::clipChunk(char *chArr, uint64 chSize) 8 | {//clipping adapters from a chunk of reads 9 | if (type != 10) //type=10 for CellRanger4 5' clipping. 
10 | return; 11 | 12 | char *chA1 = chArr; 13 | bool chGood = true; //becomes false once the end of the chunk is reached 14 | while (chGood) {//cycle over all 15 | 16 | int dbN1 = cr4->dbN; //maybe changed to a smaller value in the loop 17 | int idb = 0; 18 | for ( ; idbdbN; idb++) { 19 | chA1 = findChar(chA1, '\n')+1; //skip read name 20 | 21 | char *chA2 = findChar(chA1, '\n'); 22 | uint32 rL = (uint32) (chA2-chA1); 23 | //debug-only copy of the read dropped: it was unused and copied a fixed 91 bytes regardless of the read length 24 | 25 | 26 | 27 | 28 | cr4->opalFillOneSeq(idb, chA1, rL); 29 | 30 | cr4->storeClip[idb] = (uint8*) (chA2+1);//store the position of "+" character - we will record the clipped length there 31 | 32 | //before the next one 33 | chA1 = chA2 + 3 + rL + 1; //start of the next read: skip \n+\n, quality=read length, \n 34 | if (chA1 > chArr+chSize) { 35 | chGood = false; 36 | dbN1 = idb+1; 37 | break; 38 | }; 39 | }; 40 | 41 | cr4->opalAlign((uint8_t*) adSeqNum.data(), adSeqNum.size(), dbN1); 42 | 43 | for (int idb=0; idbopalRes[idb].endLocationTarget+1; 45 | int S = cr4->opalRes[idb].score; 46 | 47 | bool L0 = S<20 || (S==20 && L>26) || (S==21 && L>30); 48 | 49 | *cr4->storeClip[idb] = (uint8) (L0 ? 0 : L); 50 | }; 51 | }; 52 | 53 | }; 54 | 55 | inline char* findChar(char *arr, char c) 56 | {//find character in a character array. No check for out of boundary.
57 | char* cArr=arr; 58 | while (*cArr != c) 59 | ++cArr; 60 | 61 | return cArr; 62 | }; -------------------------------------------------------------------------------- /source/funPrimaryAlignMark.cpp: -------------------------------------------------------------------------------- 1 | #include "funPrimaryAlignMark.h" 2 | 3 | void funPrimaryAlignMark(Transcript **trMult, uint64 nTr, 4 | Parameters &P, int maxScore, std::uniform_real_distribution rngUniformReal0to1, std::mt19937 rngMultOrder) 5 | { 6 | if (nTr==1){//unique mappers 7 | trMult[0]->primaryFlag=true; 8 | } else {//multimappers 9 | int nbest=0; 10 | if (P.outMultimapperOrder.random || P.outSAMmultNmax != (uint) -1 ) {//bring the best alignment to the top of the list. TODO sort alignments by the score? 11 | for (uint itr=0; itrmaxScore == maxScore ) { 13 | swap(trMult[itr],trMult[nbest]); 14 | ++nbest; 15 | }; 16 | }; 17 | }; 18 | 19 | if (P.outMultimapperOrder.random) {//shuffle separately the best aligns, and the rest 20 | for (int itr=nbest-1; itr>=1; itr--) {//Fisher-Yates-Durstenfeld-Knuth shuffle 21 | int rand1=int (rngUniformReal0to1(rngMultOrder)*itr+0.5); 22 | swap(trMult[itr],trMult[rand1]); 23 | }; 24 | for (int itr=nTr-nbest-1; itr>=1; itr--) {//Fisher-Yates-Durstenfeld-Knuth shuffle 25 | int rand1=int (rngUniformReal0to1(rngMultOrder)*itr+0.5); 26 | swap(trMult[nbest+itr],trMult[nbest+rand1]); 27 | }; 28 | }; 29 | 30 | if ( P.outSAMprimaryFlag=="AllBestScore" ) { 31 | for (uint itr=0; itrmaxScore == maxScore ) trMult[itr]->primaryFlag=true; 34 | }; 35 | } else if (P.outMultimapperOrder.random || P.outSAMmultNmax != (uint) -1) { 36 | trMult[0]->primaryFlag=true;//mark as primary the first one in the random ordered list: best scoring aligns are already in front of the list 37 | } else {//old way 38 | //trBest->primaryFlag=true; //cannot do it, trBest may not be defined 39 | trMult[0]->primaryFlag=true; 40 | }; 41 | }; 42 | }; 43 | 44 | 
-------------------------------------------------------------------------------- /source/stitchGapIndel.cpp: -------------------------------------------------------------------------------- 1 | #include "IncludeDefine.h" 2 | #include "Parameters.h" 3 | 4 | int stitchGapIndel (uint rAend, uint gAend, uint rBstart, uint gBstart, uint L, uint gapStart, uint gapEnd, char* R, char* G, Parameters& P,\ 5 | uint &iRbest, uint &nMM){//returns stitch score 6 | 7 | uint gapLength = gapEnd-gapStart+1; 8 | sint inDel= (sint) (gBstart-gAend-1) - (sint) gapLength - (sint) (rBstart-rAend-1); //>0: deletion; <0: insertion 9 | 10 | if (inDel==0) {//this should not happen, it should have been caught in the first stitching 11 | return -1; 12 | }; 13 | int score2best; 14 | int score2; 15 | 16 | if (inDel>0) {// 17 | score2=0; 18 | score2best=-1; 19 | iRbest=0; 20 | for (uint iR=1; iR=gapStart) iG1 += gapLength;//exclude gap 24 | if (iG2>=gapStart) iG2 += gapLength; 25 | 26 | if ( R[rAend+iR]==G[iG1] && R[rAend+iR]!=G[iG2] ) { 27 | score2++; 28 | } else if ( R[rAend+iR]!=G[iG1] && R[rAend+iR]==G[iG2] ) { 29 | score2--; 30 | }; 31 | 32 | if (score2>score2best) { 33 | score2best=score2; 34 | iRbest=iR; 35 | }; 36 | }; 37 | 38 | //score the alignment with inDel at iRbest 39 | nMM=0; 40 | score2= L - inDel*P.scoreDelBase - P.scoreDelOpen; //score B and deletion 41 | for (uint iR=1; iRiRbest) iG += (uint) inDel; 44 | if (iG>=gapStart) iG += gapLength;//exclude gap 45 | 46 | if ( R[rAend+iR]==G[iG] ) { 47 | score2++; 48 | } else if (R[rAend+iR]!=G[iG] && R[rAend+iR]<4 && G[iG]<4) {//only penalize mismatches for non-N bases 49 | score2--; 50 | nMM++; 51 | }; 52 | }; 53 | 54 | } else { 55 | return -1; 56 | }; 57 | 58 | return score2; 59 | }; 60 | -------------------------------------------------------------------------------- /source/htslib/cram/zfio.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2009-2013 Genome Research Ltd. 
3 | Author: James Bonfield 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 16 | Institute nor the names of its contributors may be used to endorse or promote 17 | products derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | #ifndef _ZFIO_H_ 32 | #define _ZFIO_H_ 33 | 34 | #include 35 | #include 36 | 37 | /* 38 | * Either a gzFile or a FILE. 
39 | */ 40 | typedef struct { 41 | FILE *fp; 42 | gzFile gz; 43 | } zfp; 44 | 45 | off_t zftello(zfp *zf); 46 | int zfseeko(zfp *zf, off_t offset, int whence); 47 | char *zfgets(char *line, int size, zfp *zf); 48 | int zfputs(char *line, zfp *zf); 49 | zfp *zfopen(const char *path, const char *mode); 50 | int zfclose(zfp *zf); 51 | int zfpeek(zfp *zf); 52 | int zfeof(zfp *zf); 53 | 54 | #endif /* _ZFIO_H_ */ 55 | -------------------------------------------------------------------------------- /source/SoloReadFeature.cpp: -------------------------------------------------------------------------------- 1 | #include "SoloReadFeature.h" 2 | #include "streamFuns.h" 3 | #include "SoloFeatureTypes.h" 4 | 5 | SoloReadFeature::SoloReadFeature(int32 feTy, Parameters &Pin, int iChunk) 6 | : featureType(feTy), P(Pin), pSolo(P.pSolo) 7 | { 8 | if (pSolo.type==0) 9 | return; 10 | // if (pSolo.type==pSolo.SoloTypes::CB_samTagOut) 11 | // return; 12 | 13 | readInfoYes = pSolo.readInfoYes[featureType]; 14 | readIndexYes = pSolo.readIndexYes[featureType]; 15 | 16 | if (pSolo.cbWLyes) { 17 | cbReadCount.resize(pSolo.cbWLsize,0); 18 | }; 19 | 20 | if (iChunk>=0) { 21 | //open with flagDelete=false, i.e. 
try to keep file if it exists 22 | streamReads = &fstrOpen(P.outFileTmp+"/solo"+SoloFeatureTypes::Names[featureType]+'_'+std::to_string(iChunk), ERROR_OUT, P, false); 23 | }; 24 | 25 | if (featureType==SoloFeatureTypes::Transcript3p) 26 | transcriptDistCount.resize(10000,0); 27 | }; 28 | 29 | void SoloReadFeature::addCounts(const SoloReadFeature &rfIn) 30 | { 31 | if (pSolo.cbWLyes) {//WL 32 | for (uint32 ii=0; iifirst] += ii->second; 38 | }; 39 | }; 40 | 41 | if (transcriptDistCount.size()>0) { 42 | for (uint32 ii=0; ii0) { 53 | c[CB]["featureU"]++; 54 | } else if (R[r]["gx"]>0) { 55 | c[CB]["featureM"]++; 56 | }; 57 | 58 | if (R[r]["mapped"]==1) 59 | c[CB]["genomeU"]++; 60 | if (R[r]["mapped"]>1) 61 | c[CB]["genomeM"]++; 62 | if (R[r]["chrM"]>0) 63 | c[CB]["mito"]++; 64 | 65 | }; 66 | 67 | split("genomeU genomeM featureU featureM mito", fnames); 68 | for (CB in c) { 69 | #print CB,c[CB]["cbMatch"],crs[CB]["cbMatch"]; 70 | for (f=1; f<=length(fnames); f++) { 71 | ff=fnames[f]; 72 | if (crs[CB][ff]!=c[CB][ff]) 73 | print CB, ff, crs[CB][ff]+0, c[CB][ff]+0; 74 | }; 75 | }; 76 | 77 | 78 | # write out the file - not needed 79 | exit; 80 | split("cbMatch genomeU genomeM featureU featureM mito", fnames); 81 | for (CB in c) { 82 | printf CB > "CellReads.txt"; 83 | for (f=1; f<=length(fnames); f++) { 84 | ff=fnames[f]; 85 | printf "\t" c[CB][ff]+0 > "CellReads.txt"; 86 | }; 87 | printf "\n" > "CellReads.txt"; 88 | }; 89 | } 90 | -------------------------------------------------------------------------------- /source/htslib/cram/pooled_alloc.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2009 Genome Research Ltd. 3 | Author: Rob Davies 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. 
Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 16 | Institute nor the names of its contributors may be used to endorse or promote 17 | products derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | #ifndef _POOLED_ALLOC_H_ 32 | #define _POOLED_ALLOC_H_ 33 | 34 | /* 35 | * Implements a pooled block allocator where all items are the same size, 36 | * but we need many of them. 
37 | */ 38 | typedef struct { 39 | void *pool; 40 | size_t used; 41 | } pool_t; 42 | 43 | typedef struct { 44 | size_t dsize; 45 | size_t npools; 46 | pool_t *pools; 47 | void *free; 48 | } pool_alloc_t; 49 | 50 | pool_alloc_t *pool_create(size_t dsize); 51 | void pool_destroy(pool_alloc_t *p); 52 | void *pool_alloc(pool_alloc_t *p); 53 | void pool_free(pool_alloc_t *p, void *ptr); 54 | 55 | 56 | #endif /*_POOLED_ALLOC_H_*/ 57 | -------------------------------------------------------------------------------- /source/Transcript_alignScore.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Transcript.h" 3 | 4 | intScore Transcript::alignScore(char **Read1, char *G, Parameters &P) {//re-calculates score and number of mismatches 5 | maxScore=0; 6 | nMM=0; 7 | nMatch=0; 8 | 9 | if (nExons==0) 10 | return maxScore; 11 | 12 | char* R=Read1[roStr==0 ? 0:2]; 13 | for (uint iex=0;iex3 || g1>3) {//nothing to do 18 | } else if (r1==g1) {//match 19 | ++maxScore; 20 | ++nMatch; 21 | } else {//mismatch 22 | ++nMM; 23 | --maxScore; 24 | }; 25 | }; 26 | }; 27 | for (uint iex=0;iexlogMain << "Reading output genome generation parameters:\n"; 15 | P1.inOut = P.inOut; 16 | P1.scanAllLines(parFile,3,-1); 17 | parFile.close(); 18 | } else { 19 | ostringstream errOut; 20 | errOut << "EXITING because of FATAL ERROR: could not open genome file "<< pGe.gDir+("/genomeParameters.txt") << endl; 21 | errOut << "SOLUTION: check that the path to genome files, specified in --genomeDir is correct and the files are present, and have user read permsissions\n" <logMain, EXIT_CODE_GENOME_FILES, P); 23 | }; 24 | 25 | //find chr starts from files 26 | chrInfoLoad(); 27 | 28 | 29 | ifstream GenomeIn; 30 | nGenome = OpenStream("Genome", GenomeIn, 0); 31 | G=new char[nGenome]; 32 | //uint64 genomeReadBytesN = 33 | fstreamReadBig(GenomeIn,G,nGenome); 34 | GenomeIn.close(); 35 | 36 | Genome::loadSJDB(pGe.gDir); 37 | 38 | //record required genome 
parameters in P 39 | pGe.gSAindexNbases=P1.pGe.gSAindexNbases; 40 | pGe.gChrBinNbits=P1.pGe.gChrBinNbits; 41 | genomeChrBinNbases=1LLU<> nconv >> genomeOut.nMinusStrandOffset; 51 | genomeOut.convBlocks.resize(nconv+1); 52 | for (uint32 ii=0; ii> genomeOut.convBlocks[ii][0] >> genomeOut.convBlocks[ii][1] >> genomeOut.convBlocks[ii][2]; 54 | 55 | //genomeOut.convBlocks[nconv][0]=genomeOut.convBlocks[nconv-1][0]+genomeOut.convBlocks[nconv-1][1]; 56 | genomeOut.convBlocks[nconv-1][1] +=1;//increase the length of the last block so that we never reach the last base 57 | genomeOut.convBlocks[nconv][0]=(uint64)-1;//start of the block after last is infinity 58 | }; 59 | -------------------------------------------------------------------------------- /source/SoloFeature.cpp: -------------------------------------------------------------------------------- 1 | #include "SoloFeature.h" 2 | #include "streamFuns.h" 3 | 4 | SoloFeature::SoloFeature(Parameters &Pin, ReadAlignChunk **RAchunk, Transcriptome &inTrans, int32 feTy, SoloReadBarcode *readBarSumIn, SoloFeature **soloFeatAll) 5 | : P(Pin), RAchunk(RAchunk), Trans(inTrans), featureType(feTy), soloFeatAll(soloFeatAll), pSolo(P.pSolo), readBarSum(readBarSumIn) 6 | { 7 | if (featureType>=0) {//otherwise we do not need these arrays - e.g. 
with --runMode soloCellFiltering 8 | readFeatSum = new SoloReadFeature(featureType,P,-1); 9 | readFeatAll = new SoloReadFeature*[P.runThreadN]; 10 | }; 11 | 12 | //number of features 13 | switch (featureType) { 14 | case SoloFeatureTypes::Gene : 15 | case SoloFeatureTypes::GeneFull : 16 | case SoloFeatureTypes::GeneFull_Ex50pAS : 17 | case SoloFeatureTypes::GeneFull_ExonOverIntron : 18 | case SoloFeatureTypes::Velocyto : 19 | featuresNumber=Trans.nGe; 20 | break; 21 | case SoloFeatureTypes::SJ : 22 | featuresNumber=P.sjAll[0].size(); 23 | break; 24 | default: 25 | featuresNumber = -1; //undefined 26 | }; 27 | }; 28 | 29 | void SoloFeature::clearLarge() 30 | { 31 | cbFeatureUMImap.clear(); 32 | cbFeatureUMImap.shrink_to_fit(); 33 | countCellGeneUMI.clear(); 34 | countCellGeneUMI.shrink_to_fit(); 35 | countCellGeneUMIindex.clear(); 36 | countCellGeneUMIindex.shrink_to_fit(); 37 | countMatMult.i.clear(); 38 | countMatMult.i.shrink_to_fit(); 39 | countMatMult.m.clear(); 40 | countMatMult.m.shrink_to_fit(); 41 | //indCB.clear(); //needed for Velocyto 42 | //indCB.shrink_to_fit(); 43 | indCBwl.clear(); 44 | indCBwl.shrink_to_fit(); 45 | nGenePerCB.clear(); 46 | nGenePerCB.shrink_to_fit(); 47 | nGenePerCBmulti.clear(); 48 | nGenePerCBmulti.shrink_to_fit(); 49 | nReadPerCB.clear(); 50 | nReadPerCB.shrink_to_fit(); 51 | nReadPerCBtotal.clear(); 52 | nReadPerCBtotal.shrink_to_fit(); 53 | nReadPerCBunique.clear(); 54 | nReadPerCBunique.shrink_to_fit(); 55 | nUMIperCB.clear(); 56 | nUMIperCB.shrink_to_fit(); 57 | nUMIperCBmulti.clear(); 58 | nUMIperCBmulti.shrink_to_fit(); 59 | nUMIperCBsorted.clear(); 60 | nUMIperCBsorted.shrink_to_fit(); 61 | sjAll[0].clear(); 62 | sjAll[0].shrink_to_fit(); 63 | sjAll[1].clear(); 64 | sjAll[1].shrink_to_fit(); 65 | }; 66 | -------------------------------------------------------------------------------- /source/SoloCommon.h: -------------------------------------------------------------------------------- 1 | #ifndef H_SoloCommon 2 | #define 
H_SoloCommon 3 | 4 | #include 5 | #include 6 | 7 | typedef struct{ 8 | uint64 cb; 9 | uint32 umi; 10 | } readInfoStruct; 11 | 12 | typedef struct{ 13 | uint32 tr; 14 | uint8 type; 15 | } trTypeStruct; 16 | 17 | typedef uint32 uintUMI; 18 | typedef uint64 uintCB; 19 | typedef uint32 uintRead; 20 | 21 | #define uintUMIbits 32 22 | #define velocytoTypeGeneBits 4 23 | #define velocytoTypeGeneBitShift 28 24 | #define geneMultMark (((uint32)1)<<31) 25 | 26 | class SoloReadFlagClass 27 | { 28 | public: 29 | bool yes=false; 30 | typedef uint32 typeFlag; 31 | typeFlag flag=0; 32 | enum: uint32 {cbMatch, cbPerfect, cbMMunique, cbMMmultiple, genomeU, genomeM, featureU, featureM, exonic, intronic, exonicAS, intronicAS, mito, countedU, countedM, nBits}; 33 | const vector statNames={"cbMatch", "cbPerfect", "cbMMunique", "cbMMmultiple", "genomeU", "genomeM", "featureU", "featureM", "exonic", "intronic", "exonicAS", "intronicAS", "mito", "countedU", "countedM"}; 34 | 35 | /* lookup table, probably not efficient 36 | typeFlag bitMask[nBits], bitInt[nBits]; 37 | 38 | void SoloReadFlagClass() { 39 | for (uint32 ibit=0; ibit< nBits; ibit++) { 40 | bitInt[ii] = ((typeFlag)1) << ibit; 41 | bitMask[ii] = ~ bitInt[ii]; 42 | }; 43 | 44 | }; 45 | */ 46 | 47 | unordered_map < uintCB, array > flagCounts; 48 | array flagCountsNoCB={}; 49 | 50 | void setBit(uint32 ibit) { 51 | flag |= ((typeFlag)1) << ibit; 52 | }; 53 | 54 | typeFlag checkBit(uint32 ibit) { 55 | return (flag>>ibit) & ((typeFlag)1); 56 | }; 57 | 58 | void countsAdd(uintCB cb) 59 | {//adds flag bits to the count for a given cb 60 | auto cbInserted = flagCounts.insert({cb, {} }); 61 | for (uint32 ibit=0; ibit &arrIn) 73 | { 74 | for (uint32 ibit=0; ibit0) 41 | nMin=length(cellTot)=nMin) { 46 | print CB[ii] > fOutCB; 47 | nCell++; 48 | cellInew[ii]=nCell; 49 | print nCell,cellTot[ii] > fOutCB ".counts"; 50 | }; 51 | }; 52 | 53 | if (exactCells==0) { 54 | print "maxUMIperCell=" cellTotSort[length(cellTotSort)]+0,"Robust 
maxUMIperCel=" nMax,"minUMIperCell=" nMin, "Filtered N cells=", nCell; 55 | } else { 56 | print "total N cells=" length(cellTot), "exactCells=" exactCells, "minUMIperCell=" nMin; 57 | }; 58 | 59 | nMat=0; 60 | for (ii=nHeaderLines+1; ii<=nLines; ii++) { 61 | if (cellTot[cellI[ii]]>=nMin) { 62 | nMat++; 63 | }; 64 | }; 65 | 66 | for (ii=1;ii fOutMat; 68 | }; 69 | 70 | print nGenes,nCell+0,nMat+0 > fOutMat; 71 | 72 | for (ii=nHeaderLines+1; ii<=nLines; ii++) { 73 | if (cellTot[cellI[ii]]>=nMin) { 74 | print cellG[ii],cellInew[cellI[ii]],cellN[ii] > fOutMat; 75 | }; 76 | }; 77 | 78 | }; 79 | -------------------------------------------------------------------------------- /source/htslib/cram/cram_stats.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2012-2013 Genome Research Ltd. 3 | Author: James Bonfield 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 16 | Institute nor the names of its contributors may be used to endorse or promote 17 | products derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | #ifndef _CRAM_STATS_H_ 32 | #define _CRAM_STATS_H_ 33 | 34 | #ifdef __cplusplus 35 | extern "C" { 36 | #endif 37 | 38 | cram_stats *cram_stats_create(void); 39 | void cram_stats_add(cram_stats *st, int32_t val); 40 | void cram_stats_del(cram_stats *st, int32_t val); 41 | void cram_stats_dump(cram_stats *st); 42 | void cram_stats_free(cram_stats *st); 43 | 44 | /* 45 | * Computes entropy from integer frequencies for various encoding methods and 46 | * picks the best encoding. 47 | * 48 | * FIXME: we could reuse some of the code here for the actual encoding 49 | * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman. 50 | * 51 | * Returns the best codec to use. 
52 | */ 53 | enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st); 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /source/ReadAlign_calcCIGAR.cpp: -------------------------------------------------------------------------------- 1 | #include "ReadAlign.h" 2 | 3 | void ReadAlign::calcCIGAR(Transcript const &trOut, uint nMates, uint iExMate, uint leftMate) { 4 | matesCIGAR.clear(); 5 | for (uint imate=0;imate0) { 27 | samStreamCIGAR << trimL1 << "S"; //initial trimming 28 | }; 29 | 30 | for (uint ii=iEx1;ii<=iEx2;ii++) { 31 | if (ii>iEx1) {//record gaps 32 | uint gapG=trOut.exons[ii][EX_G]-(trOut.exons[ii-1][EX_G]+trOut.exons[ii-1][EX_L]); 33 | uint gapR=trOut.exons[ii][EX_R]-trOut.exons[ii-1][EX_R]-trOut.exons[ii-1][EX_L]; 34 | //it's possible to have a D or N and I at the same time 35 | if (gapR>0){ 36 | samStreamCIGAR << gapR; 37 | samStreamCIGAR << "I"; 38 | }; 39 | if (trOut.canonSJ[ii-1]>=0 || trOut.sjAnnot[ii-1]==1) {//junction: N 40 | samStreamCIGAR << gapG; 41 | samStreamCIGAR << "N"; 42 | } else if (gapG>0) {//deletion: D 43 | samStreamCIGAR << gapG; 44 | samStreamCIGAR << "D"; 45 | }; 46 | }; 47 | samStreamCIGAR << trOut.exons[ii][EX_L] << "M"; 48 | }; 49 | 50 | uint trimR1=(trOut.exons[iEx1][EX_R] 0 ) { 54 | samStreamCIGAR << trimR1 << "S"; //final trimming 55 | }; 56 | matesCIGAR.push_back(samStreamCIGAR.str()); 57 | }; 58 | }; -------------------------------------------------------------------------------- /source/Transcriptome_geneFullAlignOverlap.cpp: -------------------------------------------------------------------------------- 1 | #include "Transcriptome.h" 2 | #include "serviceFuns.cpp" 3 | #include "ReadAnnotations.h" 4 | 5 | void Transcriptome::geneFullAlignOverlap(uint nA, Transcript **aAll, int32 strandType, ReadAnnotFeature &annFeat) 6 | { 7 | // annFeat.fSet={}; 8 | // annFeat.fAlign = {}; 9 | // annFeat.ovType = 0; 
//exonic/intronic determination is not done 10 | 11 | annFeat.fAlign.resize(nA); 12 | for (uint32 iA=0; iA=0; ib--) {//scan through all blocks of the alignments 18 | 19 | uint64 be1=a.exons[ib][EX_G]+a.exons[ib][EX_L]-1;//end of the block 20 | gi1=binarySearch1a(be1, geneFull.s, (int32) nGe); 21 | 22 | while (gi1>=0 && geneFull.eMax[gi1]>=a.exons[ib][EX_G]) {//these exons may overlap this block 23 | if (geneFull.e[gi1]>=a.exons[ib][EX_G]) {//this gene overlaps the block 24 | int32 str1 = geneFull.str[gi1]==1 ? a.Str : 1-a.Str; 25 | if (strandType==-1 || strandType==str1) { 26 | annFeat.fSet.insert(geneFull.g[gi1]); 27 | annFeat.fAlign[iA].insert(geneFull.g[gi1]); 28 | }; 29 | }; 30 | --gi1;// go to the previous gene 31 | }; 32 | }; 33 | }; 34 | 35 | /* 36 | for (int64 ib=a.nExons-1; ib>=0; ib--) {//scan through all blocks of the alignments 37 | 38 | uint64 be1=a.exons[ib][EX_G]+a.exons[ib][EX_L]-1;//end of the block 39 | int64 gi1=binarySearch1a(be1, geneFull.s, (int32) nGe); //search block-end against gene-starts. Find last gene which still starts to the left of block-end 40 | 41 | while (gi1>=0 && geneFull.eMax[gi1]>=a.exons[ib][EX_G]) {//these genes may overlap this block 42 | if (geneFull.e[gi1]>=a.exons[ib][EX_G]) {//this gene overlaps the block: gene-end is to the right of block start 43 | int32 str1 = geneFull.str[gi1]==1 ? a.Str : 1-a.Str; 44 | if (strandType==-1 || strandType==str1) { 45 | annFeat.fSet.insert(geneFull.g[gi1]); 46 | annFeat.fSetTr=iA; 47 | }; 48 | }; 49 | --gi1;// go to the previous gene 50 | }; 51 | }; 52 | */ 53 | 54 | }; 55 | 56 | 57 | -------------------------------------------------------------------------------- /source/htslib/cram/cram.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2012-2013 Genome Research Ltd. 
3 | Author: James Bonfield 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 16 | Institute nor the names of its contributors may be used to endorse or promote 17 | products derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | /*! \file 32 | * CRAM interface. 33 | * 34 | * Consider using the higher level scram_*() API for programs that wish to 35 | * be file format agnostic. 36 | * 37 | * This API should be used for CRAM specific code. 
The specifics of the 38 | * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h 39 | * although these should not be included directly (use this file instead). 40 | */ 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | #ifdef SAMTOOLS 47 | # include "cram/cram_samtools.h" 48 | #endif 49 | 50 | #ifndef _CRAM_H_ 51 | #define _CRAM_H_ 52 | 53 | #include "cram/sam_header.h" 54 | #include "cram_structs.h" 55 | #include "cram_io.h" 56 | #include "cram_encode.h" 57 | #include "cram_decode.h" 58 | #include "cram_stats.h" 59 | #include "cram_codecs.h" 60 | #include "cram_index.h" 61 | 62 | #endif 63 | 64 | #ifdef __cplusplus 65 | } 66 | #endif 67 | -------------------------------------------------------------------------------- /source/htslib/htslib/kstdint.h: -------------------------------------------------------------------------------- 1 | #ifndef KSTDINT_H 2 | #define KSTDINT_H 3 | 4 | #include 5 | 6 | /* Basic assumptions: 1) "char" is 8-bit; 2) there is an 8-bit, 16-bit, 32-bit 7 | * and 64-bit integer type, respectively; 3) "short" is no less than "char", 8 | * "int" is no less than "short", "long" is no less than "int" and "long long" 9 | * is no less than "long"; 4) "int" is at least 16-bit, "long" at least 32-bit 10 | * and "long long" at least 64-bit. The last two assumptions are enforced by 11 | * the C99 spec. 12 | * 13 | * Following assumptions 1) and 2), we know that "signed char"=="int8_t" and 14 | * "short"=="int16_t" for sure. Further from the assumptions, a 32-bit integer 15 | * type must be either "int" or "long". We can test (UINT16_MAX==UINT_MAX) to 16 | * see which is the case. Similarly, a 64-bit integer must be either "long" or 17 | * "long long". We can test (UINT32_MAX==ULONG_MAX) to get the definite answer. 
18 | */ 19 | 20 | /* 8-bit integers */ 21 | typedef signed char int8_t; 22 | typedef unsigned char uint8_t; 23 | #define INT8_MIN (-SCHAR_MAX-1) 24 | #define INT8_MAX SCHAR_MAX 25 | #define UINT8_MAX UCHAR_MAX 26 | 27 | /* 16-bit integers */ 28 | typedef signed short int16_t; 29 | typedef unsigned short uint16_t; 30 | #define INT16_MIN (-SHRT_MAX-1) 31 | #define INT16_MAX SHRT_MAX 32 | #define UINT16_MAX USHRT_MAX 33 | 34 | /* 32-bit integers */ 35 | #if UINT16_MAX != UINT_MAX 36 | typedef signed int int32_t; 37 | typedef unsigned int uint32_t; 38 | #define INT32_MIN (-INT_MAX-1) 39 | #define INT32_MAX INT_MAX 40 | #define UINT32_MAX UINT_MAX 41 | #else /* then int is 16-bit and long is 32-bit, which may happen to compilers for embedded CPUs */ 42 | typedef signed long int32_t; 43 | typedef unsigned long uint32_t; 44 | #define INT32_MIN (-LONG_MAX-1) 45 | #define INT32_MAX LONG_MAX 46 | #define UINT32_MAX ULONG_MAX 47 | #endif /* ~UINT16_MAX!=UINT_MAX */ 48 | 49 | /* 64-bit integers */ 50 | #if UINT32_MAX != ULONG_MAX 51 | typedef signed long int64_t; 52 | typedef unsigned long uint64_t; 53 | #define INT64_MIN (-LONG_MAX-1) 54 | #define INT64_MAX LONG_MAX 55 | #define UINT64_MAX ULONG_MAX 56 | #else 57 | typedef signed long long int64_t; 58 | typedef unsigned long long uint64_t; 59 | #define INT64_MIN (-LLONG_MAX-1) 60 | #define INT64_MAX LLONG_MAX 61 | #define UINT64_MAX ULLONG_MAX 62 | #endif /* ~UINT32_MAX!=ULONG_MAX */ 63 | 64 | #endif /* ~defined(KSTDINT_H) */ 65 | -------------------------------------------------------------------------------- /extras/tests/scripts/checkCellReadsStats.awk: -------------------------------------------------------------------------------- 1 | # usage awk -f checkCellReadsStats.awk CellReads.stats Features.stats -v flags=mult 2 | 3 | BEGIN { 4 | OFS="\t"; 5 | if (flags~"mult") { 6 | flagMult=1; 7 | print "Yes multi-gene"; 8 | } else { 9 | print "No multi-gene"; 10 | }; 11 | } 12 | 13 | (ARGIND==1) { 14 | if (NR==1) { 15 | for 
(ff=1; ff<=NF; ff++) { 16 | fn[ff]=$ff; 17 | fi[$ff]=ff; 18 | }; 19 | next; 20 | }; 21 | 22 | if ($1~/^CB/) { 23 | for (ff=2; ff<=NF; ff++) { 24 | nocbStats[ff] += $ff; 25 | }; 26 | next; 27 | }; 28 | # check fields 29 | if ($fi["cbMatch"] != $fi["cbPerfect"]+$fi["cbMMunique"]+$fi["cbMMmultiple"]) { 30 | print; 31 | print "cbMatch != cbPerfect + cbMMunique + cbMMmultiple"; 32 | exit; 33 | }; 34 | if ( flagMult && $fi["featureM"]!=$fi["countedM"] ) { 35 | print; 36 | print "featureM!=countedM"; 37 | exit; 38 | }; 39 | if ( $fi["featureU"] != $fi["countedU"] ) { 40 | print; 41 | print "featureU!=countedU"; 42 | exit; 43 | }; 44 | 45 | for (ff=2; ff<=NF; ff++) { 46 | sumStats[ff] += $ff; 47 | }; 48 | 49 | } 50 | 51 | (ARGIND==2) { 52 | featStats[$1]=$2; 53 | } 54 | 55 | END { 56 | #print "featureM", sumStats[fi["featureM"]], featStats["MultiFeature"]; 57 | # featureM counts reads with defined CB, while MultiFeature also counts reads that may be CB-rejected in inputRecords 58 | print "countedU+M", sumStats[fi["countedU"]]+sumStats[fi["countedM"]], featStats["yesWLmatch"]; 59 | print "countedU", sumStats[fi["countedU"]], featStats["yessubWLmatch_UniqueFeature"]; 60 | print "nUMIunique", sumStats[fi["nUMIunique"]], featStats["yesUMIs"]; 61 | 62 | # this does not work: cbMatch (all cells with valid CBs) cannot be extracted from Features.stats, 63 | # because some of the cells with possible valid barcodes will be classified as noFeature,unmapped,multifeature 64 | #split("noUnmapped noNoFeature noMMtoWLwithoutExact noTooManyWLmatches MultiFeature yesWLmatch", tags1); 65 | #split("noUnmapped noNoFeature noMMtoWLwithoutExact noTooManyWLmatches yesWLmatch", tags1); 66 | #split("noUnmapped noNoFeature noMMtoWLwithoutExact noTooManyWLmatches MultiFeature yessubWLmatch_UniqueFeature", tags1); 67 | #for (tt in tags1) { 68 | # sum1+=featStats[tags1[tt]]; 69 | #}; 70 | #print "cbMatch", sumStats[fi["cbMatch"]], sum1; 71 | 72 | print "---------------------------------- All Sums"; 73 
| print "allReads", nocbStats[fi["cbMatch"]]+sumStats[fi["cbMatch"]]; 74 | for (f=2;f<=length(fn);f++) 75 | print fn[f], sumStats[f]+0; 76 | } 77 | -------------------------------------------------------------------------------- /source/Transcriptome.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_Transcriptome 2 | #define CODE_Transcriptome 3 | 4 | #include 5 | 6 | #include "IncludeDefine.h" 7 | #include "Parameters.h" 8 | #include "Transcript.h" 9 | #include "Quantifications.h" 10 | #include "AlignVsTranscript.h" 11 | #include "ReadAnnotations.h" 12 | 13 | class Transcriptome { 14 | public: 15 | string trInfoDir; 16 | 17 | vector trID, geID, geName, geBiotype; //transcript/gene IDs 18 | uint32 nTr, nGe; //number of transcript/genes 19 | 20 | uint *trS, *trE, *trEmax; //transcripts start,end,end-max 21 | 22 | uint32 nEx; //number of exons 23 | uint16 *trExN; //number of exons per transcript 24 | uint32 *trExI; //index of the first exon for each transcript in exSE 25 | uint8 *trStr; //transcript strand 26 | uint32 *exSE; //exons start/end 27 | uint32 *exLenCum; //cumulative length of previous exons 28 | uint32 *trGene; //transcript to gene correspondence 29 | uint32 *trLen; //transcript lengths 30 | 31 | struct {//exon-gene structure for GeneCounts 32 | uint64 nEx;//number of exons/genes 33 | uint64 *s,*e, *eMax; //exon start/end 34 | uint8 *str; //strand 35 | uint32 *g, *t; //gene/transcript IDs 36 | } exG; 37 | 38 | struct {//geneFull structure 39 | uint64 *s, *e, *eMax; 40 | uint8 *str; 41 | uint32 *g; 42 | } geneFull; 43 | 44 | Quantifications *quants; 45 | 46 | //methods: 47 | Transcriptome (Parameters &Pin); //create transcriptome structure, load and initialize parameters 48 | uint32 quantAlign (Transcript &aG, Transcript *aTall);//transform coordinates for all aligns from genomic in RA to transcriptomic in RAtr 49 | void geneCountsAddAlign(uint nA, Transcript **aAll, vector &gene1); //add one alignment to 
gene counts 50 | void quantsAllocate(); //allocate quants structure 51 | void quantsOutput(); //output quantification files 52 | void geneFullAlignOverlap(uint nA, Transcript **aAll, int32 strandType, ReadAnnotFeature &annFeat); 53 | void geneFullAlignOverlap_ExonOverIntron(uint nA, Transcript **aAll, int32 strandType, ReadAnnotFeature &annFeat, ReadAnnotFeature &annFeatGeneConcordant); 54 | //void geneFullAlignOverlap_CR(uint nA, Transcript **aAll, int32 strandType, ReadAnnotations &readAnnot); 55 | void classifyAlign(Transcript **alignG, uint64 nAlignG, ReadAnnotations &readAnnot); 56 | void alignExonOverlap(uint nA, Transcript **aAll, int32 strandType, ReadAnnotFeature &annFeat); 57 | 58 | private: 59 | Parameters &P; //normal "genomic" parameters 60 | 61 | }; 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /source/Transcript_variationAdjust.cpp: -------------------------------------------------------------------------------- 1 | #include "Transcript.h" 2 | #include "serviceFuns.cpp" 3 | 4 | int Transcript::variationAdjust(const Genome &mapGen, char *R) 5 | { 6 | Variation &Var=*mapGen.Var; 7 | 8 | if (!Var.yes) 9 | {//no variation 10 | return 0; 11 | }; 12 | 13 | int dScore=0;//change in the score 14 | uint nMM1=0; 15 | 16 | //for each block, check whether it overlaps one or more SNPs 17 | for (uint ie=0; ie (exons[ie][EX_G], Var.snp.loci, Var.snp.N); 21 | if (isnp>=0) 22 | { 23 | while ((uint)isnpVar.snp.loci[isnp]) 24 | {//these SNPs overlap the block 25 | varInd.push_back(isnp); //record snp index 26 | varGenCoord.push_back(Var.snp.loci[isnp]-mapGen.chrStart[Chr]); 27 | 28 | varReadCoord.push_back(exons[ie][EX_R]+Var.snp.loci[isnp]-exons[ie][EX_G]); 29 | char ntR=R[varReadCoord.back()];//nt of the read in the SNP position, already transformed to + genome strand 30 | 31 | uint8 igt; 32 | if (ntR>3) { 33 | igt=4; 34 | } else { 35 | for (igt=1; igt<3; igt++) {//1st or 2nd allele, =3 if none 36 | if 
(Var.snp.nt[isnp][igt]==ntR) { 37 | break; 38 | }; 39 | }; 40 | }; 41 | 42 | //if (ntR == Var.snp.nt[isnp][0]) 43 | //{//mark snp that agrees with the reference 44 | // igt*=10; 45 | //}; 46 | 47 | varAllele.push_back(igt); 48 | 49 | if (igt<3 && ntR != Var.snp.nt[isnp][0]) 50 | {//non-reference allele, correct nMM and score 51 | ++nMM1; 52 | }; 53 | 54 | ++isnp; 55 | }; 56 | }; 57 | }; 58 | 59 | #define VAR_noScoreCorrection 60 | #ifndef VAR_noScoreCorrection 61 | if (nMM1>0) 62 | {//one or more mismatches need to be corrected 63 | uint nMMold=nMM; 64 | alignScore(Read1, G, P); 65 | nMM-=nMM1; 66 | nMatch+=nMM1; 67 | dScore=2*(nMMold-nMM);//score only changes if the number of mismatches is reduced after SNP adjustment 68 | }; 69 | #else 70 | //#warning VAR_noScoreCorrection set: no variation score correction 71 | #endif 72 | 73 | return dScore; 74 | }; 75 | -------------------------------------------------------------------------------- /source/htslib/cram/string_alloc.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2010 Genome Research Ltd. 3 | Author: Andrew Whitwham 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 16 | Institute nor the names of its contributors may be used to endorse or promote 17 | products derived from this software without specific prior written permission. 
18 | 19 | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | #ifndef _STRING_ALLOC_H_ 32 | #define _STRING_ALLOC_H_ 33 | 34 | #include <stdlib.h> /* size_t; included before extern "C" so the system header is not wrapped in it */ 35 | 36 | #ifdef __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | /* 41 | * A pooled string allocator intended to cut down on the 42 | * memory overhead of many small string allocations. 43 | * 44 | * Andrew Whitwham, September 2010.
45 | */ 46 | 47 | typedef struct { 48 | char *str; 49 | size_t used; 50 | } string_t; 51 | 52 | typedef struct { 53 | size_t max_length; 54 | size_t nstrings; 55 | string_t *strings; 56 | } string_alloc_t; 57 | 58 | string_alloc_t *string_pool_create(size_t max_length); 59 | void string_pool_destroy(string_alloc_t *a_str); 60 | char *string_alloc(string_alloc_t *a_str, size_t length); 61 | char *string_dup(string_alloc_t *a_str, char *instr); 62 | char *string_ndup(string_alloc_t *a_str, char *instr, size_t len); 63 | 64 | #ifdef __cplusplus 65 | } /* closes the extern "C" block; kept inside the include guard so a repeated #include cannot emit an unmatched brace */ 66 | #endif 67 | 68 | #endif /* _STRING_ALLOC_H_ */ 69 | 70 | -------------------------------------------------------------------------------- /source/ClipMate_clip.cpp: -------------------------------------------------------------------------------- 1 | #include "ParametersClip.h" 2 | #include "Parameters.h" 3 | #include "SequenceFuns.h" 4 | 5 | uint32 ClipMate::clip(uint &Lread, char *seqNum) 6 | { 7 | clippedN=0; 8 | 9 | if (type<0) 10 | return 0; //no clip for this mate 11 | 12 | uint LreadOld=Lread; 13 | 14 | if (N>0) {//clip N bases 15 | if (Lread>N) { 16 | Lread -= N; // for 3p this is all 17 | clippedN += N; 18 | if (type==0) {//5p 19 | memmove(seqNum, seqNum+N, Lread); 20 | }; 21 | } else { 22 | Lread=0; 23 | clippedN=LreadOld; 24 | } 25 | }; 26 | 27 | if (adSeq.length()>0) {//clip adapter 28 | switch (type) { 29 | /* not implemented yet 30 | case 0: {//5p - not tested yet 31 | vector vecMM({20, 22, 24, 30, 40, 50, 60, 70, 80, 90}); 32 | clippedAdN = localSearchGeneral(seqNum, Lread, adSeqNum, -(int32)adSeqNum.size()+1, (int32)Lread-(int32)adSeqNum.size(), adMMp, vecMM, clippedAdMM); 33 | memmove(seqNum, seqNum+clippedAdN, Lread-clippedAdN); 34 | break; 35 | }; 36 | */ 37 | case 1: {//3p 38 | clippedAdN = Lread-localSearch(seqNum, Lread, adSeqNum.data(), adSeqNum.size(), adMMp); 39 | break; 40 | /* new way, not tested properly 41 | uint64 clippedAdN1 = Lread-localSearch(seqNum, Lread, adSeqNum.data(), adSeqNum.size(), adMMp); 42 | 43
| //clippedAdN = localSearchGeneral(seqNum, Lread, adSeqNum, 0, Lread, adMMp, clippedAdMM); 44 | vector vecMM({20, 23, 26, 30, 40, 50, 60, 70, 80, 90}); 45 | clippedAdN = localSearchGeneral(seqNum, Lread, adSeqNum, Lread-1, -1, adMMp, vecMM, clippedAdMM); 46 | 47 | Lread=Lread; 48 | */ 49 | }; 50 | case 10: {//5p: CR4 51 | clippedAdN = min( (uint32)clippedInfo, (uint32)Lread ); 52 | memmove(seqNum, seqNum+clippedAdN, Lread-clippedAdN); 53 | break; 54 | }; 55 | case 11: {//3p: CR4, polyA 56 | clippedAdN = cr4->polyTail3p(seqNum, Lread); 57 | break; 58 | }; 59 | default: {//type has no adapter-clipping branch: zero clippedAdN rather than subtracting whatever value it held on entry 60 | clippedAdN = 0; 61 | break; 62 | }; 63 | }; 64 | 65 | Lread -= clippedAdN; 66 | clippedN += clippedAdN; 67 | }; 68 | 69 | if (NafterAd>0) { 70 | if (Lread > NafterAd) { 71 | Lread -= NafterAd; 72 | clippedN += NafterAd; 73 | if (type==0) {//5p. For 3p, no need to move sequence 74 | memmove(seqNum, seqNum+NafterAd, Lread); 75 | }; 76 | } else {//0-length after clipping 77 | Lread=0; 78 | clippedN=LreadOld; 79 | }; 80 | }; 81 | 82 | return clippedN; 83 | }; 84 | -------------------------------------------------------------------------------- /source/htslib/vcf.5: -------------------------------------------------------------------------------- 1 | '\" t 2 | .TH vcf 5 "August 2013" "htslib" "Bioinformatics formats" 3 | .SH NAME 4 | vcf \- Variant Call Format 5 | .SH DESCRIPTION 6 | The Variant Call Format (VCF) is a TAB-delimited format with each data line 7 | consisting of the following fields: 8 | .TS 9 | nlbl.
10 | 1 CHROM CHROMosome name 11 | 2 POS the left-most POSition of the variant 12 | 3 ID unique variant IDentifier 13 | 4 REF the REFerence allele 14 | 5 ALT the ALTernate allele(s) (comma-separated) 15 | 6 QUAL variant/reference QUALity 16 | 7 FILTER FILTERs applied 17 | 8 INFO INFOrmation related to the variant (semicolon-separated) 18 | 9 FORMAT FORMAT of the genotype fields (optional; colon-separated) 19 | 10+ SAMPLE SAMPLE genotypes and per-sample information (optional) 20 | .TE 21 | .P 22 | The following table gives the \fBINFO\fP tags used by samtools and bcftools. 23 | .TP 24 | .B AF1 25 | Max-likelihood estimate of the site allele frequency (AF) of the first ALT allele 26 | (double) 27 | .TP 28 | .B DP 29 | Raw read depth (without quality filtering) 30 | (int) 31 | .TP 32 | .B DP4 33 | Number of high-quality reference forward, reference reverse, alternate forward and alternate reverse bases 34 | (int[4]) 35 | .TP 36 | .B FQ 37 | Consensus quality. Positive: sample genotypes different; negative: otherwise 38 | (int) 39 | .TP 40 | .B MQ 41 | Root-Mean-Square mapping quality of covering reads 42 | (int) 43 | .TP 44 | .B PC2 45 | Phred probability of AF in group1 samples being larger (respectively, smaller) than in group2 46 | (int[2]) 47 | .TP 48 | .B PCHI2 49 | Posterior weighted chi^2 P-value between group1 and group2 samples 50 | (double) 51 | .TP 52 | .B PV4 53 | P-values for strand bias, baseQ bias, mapQ bias and tail distance bias 54 | (double[4]) 55 | .TP 56 | .B QCHI2 57 | Phred-scaled PCHI2 58 | (int) 59 | .TP 60 | .B RP 61 | Number of permutations yielding a smaller PCHI2 62 | (int) 63 | .TP 64 | .B CLR 65 | Phred log ratio of genotype likelihoods with and without the trio/pair constraint 66 | (int) 67 | .TP 68 | .B UGT 69 | Most probable genotype configuration without the trio constraint 70 | (string) 71 | .TP 72 | .B CGT 73 | Most probable configuration with the trio constraint 74 | (string) 75 | .TP 76 | .B VDB 77 | Tests variant positions within reads.
Intended for filtering RNA-seq artifacts around splice sites 78 | (float) 79 | .TP 80 | .B RPB 81 | Mann-Whitney rank-sum test for tail distance bias 82 | (float) 83 | .TP 84 | .B HWE 85 | Hardy-Weinberg equilibrium test (Wigginton et al) 86 | (float) 87 | .P 88 | .SH SEE ALSO 89 | .TP 90 | https://github.com/samtools/hts-specs 91 | The full VCF/BCF file format specification 92 | .TP 93 | .I A note on exact tests of Hardy-Weinberg equilibrium 94 | Wigginton JE et al 95 | PMID:15789306 96 | .\" (http://www.ncbi.nlm.nih.gov/pubmed/15789306) 97 | --------------------------------------------------------------------------------