├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── example ├── README.md ├── example_match.png ├── example_results_control.csv ├── yeast_1000.msp └── yeast_exp.mgf ├── include ├── LICENSE └── cxxopts.hpp ├── scripts ├── merge_pin_output.py └── test.csv └── src ├── DefineConstants.h ├── build_index.cpp ├── configuration.cpp ├── configuration.h ├── fragment_ion_index.cpp ├── fragment_ion_index.h ├── index_file_reader.cpp ├── index_file_reader.h ├── index_file_writer.cpp ├── index_file_writer.h ├── indexing_manager.cpp ├── indexing_manager.h ├── library.cpp ├── library.h ├── main.cpp ├── match.cpp ├── match.h ├── mgf_reader.cpp ├── mgf_reader.h ├── msp_reader.cpp ├── msp_reader.h ├── naive_search.cpp ├── precursor_index.cpp ├── precursor_index.h ├── quick_scan.cpp ├── scanner.cpp ├── scanner.h ├── scores.cpp ├── scores.h ├── search_index.cpp ├── search_manager.cpp ├── search_manager.h ├── settings.cpp ├── settings.h ├── spectral_search.cpp ├── spectral_search.h ├── spectrum.cpp ├── spectrum.h ├── test_SIMD.cpp ├── thread_pool.cpp └── thread_pool.h /.gitignore: -------------------------------------------------------------------------------- 1 | **/.idea 2 | **/cmake-* 3 | **/build 4 | **/pBuild 5 | **/.vscode 6 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13) # CMake version check 2 | project(simple_example) # Create project "simple_example" 3 | set(CMAKE_CXX_STANDARD 20) # Enable c++20 standard 4 | 5 | option(AVX_2 "Add compiler flags to support avx2 instructions" OFF) 6 | option(AVX_512 "Add compiler flags to support avx512 instruction" OFF) 7 | 8 | if (AVX_2) 9 | set(AVX_COMPILE_FLAGS "-mavx2 -march=skylake -DUSE_AVX_2=true") 10 | endif () 11 | 12 | if (AVX_512) 13 | set(AVX_COMPILE_FLAGS "-mavx2 -march=skylake-avx512 -mavx512f -DUSE_AVX_512=true") 14 | endif () 15 | 16 | 
set(CMAKE_CXX_FLAGS "-Ofast -pthread ${AVX_COMPILE_FLAGS} ") 17 | # Add main.cpp file of project root directory as source file 18 | set(SOURCE_FILES src/spectrum.cpp src/msp_reader.cpp src/scores.cpp src/library.cpp src/library.h src/mgf_reader.cpp src/mgf_reader.h src/spectral_search.h src/match.cpp src/match.h src/fragment_ion_index.cpp src/fragment_ion_index.h src/precursor_index.cpp src/precursor_index.h src/index_file_reader.h src/index_file_reader.cpp src/index_file_writer.cpp src/index_file_writer.h) #deleted src/spectral_search.cpp 19 | 20 | include_directories(include) # Added argument_parser header-only library 21 | 22 | 23 | # Add executable target with source files listed in SOURCE_FILES variable 24 | #add_executable(test_executable src/main.cpp ${SOURCE_FILES}) 25 | #add_executable(quick_scan src/quick_scan.cpp ${SOURCE_FILES} src/scanner.cpp src/scanner.h) 26 | add_executable(mistle-build src/build_index.cpp src/indexing_manager.cpp src/indexing_manager.h src/msp_reader.cpp src/msp_reader.h src/mgf_reader.cpp src/mgf_reader.h src/spectrum.h src/spectrum.cpp src/index_file_writer.cpp src/index_file_writer.h src/precursor_index.cpp src/precursor_index.h src/fragment_ion_index.cpp src/fragment_ion_index.h src/index_file_reader.cpp src/index_file_reader.h src/configuration.cpp src/configuration.h src/thread_pool.cpp src/thread_pool.h src/settings.cpp src/settings.h) 27 | add_executable(mistle-search src/search_index.cpp src/indexing_manager.cpp src/indexing_manager.h src/msp_reader.cpp src/msp_reader.h src/spectrum.h src/spectrum.cpp src/index_file_writer.cpp src/index_file_writer.h src/precursor_index.cpp src/precursor_index.h src/fragment_ion_index.cpp src/fragment_ion_index.h src/index_file_reader.cpp src/index_file_reader.h src/search_manager.cpp src/search_manager.h src/configuration.cpp src/configuration.h src/library.cpp src/library.h src/mgf_reader.cpp src/mgf_reader.h src/match.cpp src/match.h src/thread_pool.cpp src/thread_pool.h 
src/settings.cpp src/settings.h) 28 | #add_executable(naive-search src/naive_search.cpp src/library.cpp src/precursor_index.cpp src/msp_reader.cpp src/mgf_reader.cpp src/spectrum.cpp src/fragment_ion_index.cpp src/index_file_writer.cpp src/spectral_search.cpp src/scores.cpp src/match.cpp) 29 | #add_executable(test-SIMD src/test_SIMD.cpp) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (2022) [Yannek Nowatzky, Bundesanstalt für Materialforschung und -prüfung (BAM)] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mistle 2 | 3 | Mistle is a fast spectral search engine. 
It uses a fragment-indexing technique and SIMD intrinsics to match experimental MS2 spectra to large spectral libraries with high performance. Find out more about Mistle in our publication: 4 | 5 | >**Mistle: bringing spectral library predictions to metaproteomics with an efficient search index** 6 | > Yannek Nowatzky, Philipp Benner, Knut Reinert, Thilo Muth 7 | > Bioinformatics, Volume 39, Issue 6, June 2023, btad376, https://doi.org/10.1093/bioinformatics/btad376 8 | 9 | Please use the above citation if you are using Mistle. 10 | 11 | ## Requirements 12 | Tested only on Linux (Debian) for the specified versions: 13 | 14 | * C++20 15 | * CMake (version 3.19.3) 16 | * g++ (10.2.1) 17 | 18 | ## Build 19 | 20 | For building the project, please create (mkdir) a separate build directory. Change into the build directory and run: 21 | 22 | cmake /path/to/mistle/ 23 | cmake --build . 24 | 25 | To make use of the AVX2 or AVX512 SIMD instructions, configure the build with the CMake option -DAVX_2=ON or -DAVX_512=ON. Check if your CPU supports these. If necessary, adjust CMakeLists.txt according to the capabilities of your CPU. 26 | 27 | Optionally, export the directory where *mistle* was built as an executable PATH in the *~/.bashrc* file. Add the following line: 28 | 29 | export PATH="/home/$USER/path/to/mistle/build:$PATH" 30 | 31 | 32 | ## Usage 33 | 34 | ### Mistle build 35 | 36 | Build Mistle's fragment ion index from a spectral library. 37 | 38 | mistle-build -i /path/to/library/ -o /path/to/index/ [optional args] 39 | 40 | Required arguments are the input directory, which must contain spectral library files (.msp or .mgf format), and the output directory for the fragment index. 41 | 42 | ### Mistle search 43 | 44 | Search experimental mass spectra in Mistle's fragment ion index. 45 | 46 | 47 | mistle-search -s /path/to/search_file.mgf -i /path/to/index/ [optional args] 48 | 49 | Required arguments are the search file (.mgf or .msp format) and the path to the fragment index.
Additionally, the output directory and formats can be specified as well as various search parameters. Use the *-h* flag to print the help message for more information. Also, refer to the [EXAMPLE README](example/README.md) and the example directory to test the program. 50 | 51 | ## Output format 52 | 53 | Peptide spectrum matches (PSMs) are provided in tab separated format. 54 | The first line (comment tagged by #) names the exact shell command and parameters used to produce the output. 55 | 56 | The next line is the header listing all tracked attributes (tab separated). 57 | 58 | id spectrum charge hit_rank match peptide isomers similarity bias [...] 59 | 60 | A large number of scores and statistics are appended as additional columns (marked [...]). A detailed explanation of the scores can be found in the next section. 61 | 62 | Below the header, all matched experimental spectra are listed and indexed by their scan name and the rank of the matched library spectrum. (Rank R is appended with /R to the scan name). See example [output](example/example_results_control.csv). 63 | 64 | Alternatively, a pin-tab format that is readable by Percolator (Käll *et al.*, 2007) can be produced, listing the same scores as features. To obtain this output format, the user needs to specify the output path (*-o*) during mistle-search with the file extension *.pin*. Note that the library label needs to be set correctly at index construction (1: target, -1: decoy library) and the *results.pin* files of target and decoy search need to be concatenated or merged before using Percolator. It's recommended to use this Python [script](scripts/merge_pin_output.py) to merge the query results and correctly update delta scores. 65 | 66 | ## Scores 67 | 68 | *Similarity* is the preferred baseline score, which is a refined version of the normalized dot product based on square root transformed peak intensities.
A *bias* measurement highlights how biased the *similarity* is on a few matching peaks, and a *delta_similarity* score describes the *similarity* difference between the top hit and second-best hit. Additionally, an *annotation_similarity* version of these scores exists, which accounts only for peak intensities matching reference peaks. This is useful when the library consists of fewer annotated or predicted peaks and is less noisy than the query spectra. 69 | 70 | As a high-quality discriminant scoring function we suggest the *avg_bias_adjusted_similarity*, which is composed equally of the *similarity* and *annotation similarity* metrics. Specifically, a *bias-adjusted similarity* (*sim2*) is calculated as the product of *similarity* and *(1-bias)* and is averaged between the standard and annotation versions. This scoring function provides excellent discrimination between target and decoy matches. 71 | 72 | 73 | 74 | 75 | ## Known issues 76 | 77 | ### On linux 78 | 79 | Input files created on Windows may use \r\n (carriage return + line feed) as line ending. Linux and Mistle require \n as the exclusive line ending. 80 | Remove the \r character (char 13) using the following command line: 81 | * *tr -d '\r' < FILE.mgf > FILE_FIXED.mgf* 82 | 83 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # Mistle Example Usage 2 | 3 | A toy example is provided to test the program. A library of 1000 simulated mass spectra (predicted by Prosit (Gessulat et al., 2019)) of yeast peptides (*Saccharomyces cerevisiae*) is used for reference. 3 experimental spectra matching the species are selected from the 9MM FASP dataset (Tanca et al., 2013). 4 | 5 | ## Running the test 6 | 7 | Open a terminal and change into this directory (path/to/mistle/example). Create a new directory for the index with *mkdir index*.
To construct the fragment index from the example spectral library, run 8 | 9 | mistle-build -i yeast_1000.msp -o index/ -n 4 -t 1 10 | 11 | This should create 4 index partitions (binary format) in the index directory, a precursor index file, and a human-readable config.txt. Note that if the PATH to mistle-build and mistle-search is not exported, the user is required to specify the full path to the executables (e.g. */path/to/mistle/build/mistle-build [...]*). 12 | 13 | Next, perform an example search 14 | 15 | mistle-search -s yeast_exp.mgf -i index/ -o example_results.csv -p 10 -b 0.2 -t 1 --hits_per_spectrum 1 16 | 17 | This should produce an *example_results.csv* file in the current directory. Compare the output to the *example_results_control.csv*, which already resides in this directory. If they align, *mistle* is configured correctly and is ready to use. (Note that floating-point inaccuracy may occur when using different hardware or advanced vector extensions. Results and scores may deviate from the control in the last digit(s)). 18 | 19 | Below, the first match of the list is displayed by a mirror plot (experimental spectrum top; matched simulated spectrum bottom) generated using the Python *spectrum_utils* package. 
20 | 21 | Logo 22 | -------------------------------------------------------------------------------- /example/example_match.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BAMeScience/Mistle/ca43b8c3ad98163827a49d1ad16b17360cefb8bb/example/example_match.png -------------------------------------------------------------------------------- /example/example_results_control.csv: -------------------------------------------------------------------------------- 1 | #mistle-search -s yeast_exp.mgf -i index/ -o example_results.csv -p 10 -b 0.2 -t 1 --hits_per_spectrum 1 2 | id spectrum charge hit_rank match peptide isomers similarity bias annotation_similarity annotation_bias dot_product delta_dot delta_similarity delta_sim2 mass_difference peak_count_query peak_count_ref sim2 x_score x_score_dot x_lgamma x_lgamma_dot st_score st_score_dot 3 | 9MM_FASP.42839.42839.2/1 9MM_FASP.42839.42839.2 2 1 997 DVAAQDFINAYASFLQR 0.601264 0.283916 0.965076 0.283916 0.603536 0.56061 0.558355 0.415923 -0.000671387 1000 21 0.430556 8.53642e+18 8.56868e+18 41.8269 41.8307 0.732213 0.733672 4 | 9MM_FASP.40363.40363.2/1 9MM_FASP.40363.40363.2 2 1 998 TAGIQIVADDLTVTNPAR 0.603145 0.291692 0.911218 0.291692 0.580075 0.51632 0.531702 0.393204 -0.00201416 1000 18 0.427212 3.86156e+15 3.71386e+15 32.9995 32.9605 0.714507 0.704081 5 | 9MM_FASP.16147.16147.2/1 9MM_FASP.16147.16147.2 2 1 999 TNEAAGDGTTSATVLGR 0.652331 0.293515 0.935983 0.293515 0.639207 0.639207 0.652331 0.460862 0 1000 20 0.460862 1.58706e+18 1.55513e+18 38.9127 38.8924 0.791398 0.783524 6 | -------------------------------------------------------------------------------- /example/yeast_exp.mgf: -------------------------------------------------------------------------------- 1 | BEGIN IONS 2 | TITLE=9MM_FASP.42839.42839.2 3 | RTINSECONDS=16250.9127 4 | PEPMASS=964.979370117188 3021333.001953000203 5 | CHARGE=2+ 6 | 101.0707016 33087.515625 7 | 110.0712433 
4736.6293945313 8 | 112.0866013 6470.267578125 9 | 115.0863647 21151.966796875 10 | 116.0705643 5116.921875 11 | 118.8389282 11158.431640625 12 | 118.9499054 4830.1323242188 13 | 120.080574 47082.66015625 14 | 127.0864334 4406.6918945313 15 | 129.1019897 24503.693359375 16 | 131.4036865 5022.0922851563 17 | 136.0753479 20044.265625 18 | 142.0860596 5587.869140625 19 | 143.0812683 33124.98828125 20 | 155.0811462 5163.4438476563 21 | 158.0922546 13841.7548828125 22 | 160.1084595 8320.451171875 23 | 160.5261078 5044.71875 24 | 169.0962067 5223.8173828125 25 | 171.1124725 9784.7001953125 26 | 175.1188354 78650.90625 27 | 183.0763702 9102.408203125 28 | 183.149292 6998.7641601563 29 | 185.1283875 4452.4750976563 30 | 186.0867462 12476.5283203125 31 | 187.0678558 4350.3198242188 32 | 187.1074982 168854.46875 33 | 188.1114807 9722.388671875 34 | 197.1283569 8607.0908203125 35 | 199.1074982 4636.8251953125 36 | 200.1027679 35163.23828125 37 | 200.1387634 4821.775390625 38 | 201.1234436 9745.8994140625 39 | 207.1132355 5408.6245117188 40 | 211.1442719 4847.7451171875 41 | 215.102478 205652.546875 42 | 216.1076813 15140.7451171875 43 | 226.0823364 11055.17578125 44 | 226.1181183 7858.3627929688 45 | 227.0666809 12251.2333984375 46 | 228.1340485 10020.7353515625 47 | 229.1179047 7222.828125 48 | 233.1644745 14039.5283203125 49 | 235.1073761 26386.353515625 50 | 241.1179199 9526.1181640625 51 | 242.1497345 16059.6181640625 52 | 243.1462708 4499.3676757813 53 | 244.0925293 63368.921875 54 | 245.0948639 6055.5258789063 55 | 254.1498718 4388.76171875 56 | 258.1073608 4900.2783203125 57 | 261.1590881 9499.5302734375 58 | 263.102356 19258.388671875 59 | 268.1455994 5136.5786132813 60 | 269.1246643 9959.6552734375 61 | 270.144928 14568.9599609375 62 | 271.1398315 26787.990234375 63 | 286.138092 149259.984375 64 | 287.1423645 17651.451171875 65 | 288.1345215 7673.8784179688 66 | 298.1027832 17691.439453125 67 | 299.1706543 10069.3173828125 68 | 303.1766357 25486.693359375 69 | 
304.1318665 4636.7861328125 70 | 306.1441956 9795.7177734375 71 | 312.1547241 4983.0 72 | 315.129364 60298.390625 73 | 316.1318054 7792.0830078125 74 | 322.1389771 6632.0043945313 75 | 328.1326599 4369.7392578125 76 | 329.1812439 5224.4736328125 77 | 334.1390686 5000.8452148438 78 | 349.1504211 16053.3134765625 79 | 357.1764832 39490.07421875 80 | 358.1777649 5851.5483398438 81 | 369.1389465 6786.1235351563 82 | 373.1526794 5985.4814453125 83 | 374.1323242 7837.0854492188 84 | 374.2080994 4354.1879882813 85 | 375.2001343 9772.779296875 86 | 376.1856689 6498.8842773438 87 | 378.1285706 8469.3818359375 88 | 382.2089539 8304.3955078125 89 | 386.1663208 42721.83984375 90 | 387.1668701 5545.8603515625 91 | 388.1896057 4768.796875 92 | 391.1607666 25515.9765625 93 | 399.2348633 15695.8251953125 94 | 416.2614136 22494.701171875 95 | 417.1773987 6709.1762695313 96 | 419.2272949 4557.974609375 97 | 420.1894226 9260.6181640625 98 | 434.2037659 7486.6000976563 99 | 441.2110901 7759.9013671875 100 | 445.1200256 69070.4296875 101 | 446.2433472 5370.7470703125 102 | 451.1963501 4808.5693359375 103 | 459.2232361 8310.0439453125 104 | 462.193634 9719.5283203125 105 | 485.2359619 5535.1767578125 106 | 487.2197571 5813.396484375 107 | 488.2176514 5587.8901367188 108 | 505.2393188 5599.9204101563 109 | 507.2207947 4686.4897460938 110 | 546.3031006 6769.8666992188 111 | 561.2654419 11778.8291015625 112 | 563.328186 18782.712890625 113 | 600.2608032 10594.6923828125 114 | 601.2592773 4544.8671875 115 | 636.2729492 4648.42578125 116 | 650.361084 45236.2578125 117 | 651.3654785 9676.509765625 118 | 672.2941284 7039.6684570313 119 | 721.3981323 60447.5703125 120 | 722.4011841 17896.1328125 121 | 884.4619141 68769.625 122 | 885.4655151 33357.15234375 123 | 886.46698 5799.1528320313 124 | 955.4975586 63005.56640625 125 | 956.5010376 27270.22265625 126 | 957.5122681 4905.5834960938 127 | 1052.522339 21506.96484375 128 | 1053.520386 9743.3623046875 129 | 1069.541504 163773.625 130 | 
1070.543945 88356.46875 131 | 1071.550537 18056.533203125 132 | 1080.509766 11126.55859375 133 | 1081.516235 5718.029296875 134 | 1165.599854 5571.5815429688 135 | 1182.626831 79612.2578125 136 | 1183.624268 50932.83203125 137 | 1184.629395 12699.2744140625 138 | 1193.590332 8337.3564453125 139 | 1312.669067 4801.9868164063 140 | 1329.693115 64695.46875 141 | 1330.695923 45749.37890625 142 | 1331.692749 16048.2177734375 143 | 1427.700439 8999.4677734375 144 | 1428.711182 6104.9599609375 145 | 1444.722534 70359.890625 146 | 1445.723999 57531.28125 147 | 1446.71814 17378.087890625 148 | 1554.77771 4594.904296875 149 | 1555.751587 13335.068359375 150 | 1556.755737 11739.7900390625 151 | 1572.781494 28325.662109375 152 | 1573.781494 22906.359375 153 | 1574.783325 8739.1484375 154 | 1643.809448 14282.5986328125 155 | 1644.825073 10837.4716796875 156 | END IONS 157 | BEGIN IONS 158 | TITLE=9MM_FASP.40363.40363.2 159 | RTINSECONDS=15496.7056 160 | PEPMASS=928.001647949219 6510079.84375 161 | CHARGE=2+ 162 | 101.0708542 77888.9140625 163 | 110.0711594 19265.568359375 164 | 112.0866776 8511.8994140625 165 | 120.0806885 17434.61328125 166 | 129.0660095 12940.408203125 167 | 129.1021881 51164.7109375 168 | 130.2890015 15493.1015625 169 | 130.4004669 8281.0634765625 170 | 131.1177673 8266.388671875 171 | 136.0758362 15647.880859375 172 | 141.1023254 8588.7392578125 173 | 141.2971344 18972.61328125 174 | 141.4078522 9020.203125 175 | 143.1179199 126126.296875 176 | 145.097168 64125.0078125 177 | 153.8619843 7908.4111328125 178 | 155.0814362 9177.904296875 179 | 157.1086884 7781.240234375 180 | 158.0924683 11019.87109375 181 | 169.097168 13650.4208984375 182 | 169.1336365 12272.03125 183 | 171.1129761 59545.9921875 184 | 173.0922089 131033.53125 185 | 173.1264801 7183.7651367188 186 | 175.1191406 79029.640625 187 | 183.1134796 11350.1455078125 188 | 185.0920715 9108.482421875 189 | 185.1648865 10192.673828125 190 | 186.1235504 9753.572265625 191 | 188.1394806 9003.4716796875 192 
| 196.1446686 18823.2421875 193 | 197.1286163 32228.150390625 194 | 201.1239166 19130.03125 195 | 212.1031647 34801.296875 196 | 213.1598969 12924.912109375 197 | 214.1550903 76166.0234375 198 | 215.1400452 8345.6455078125 199 | 216.0982056 7945.4721679688 200 | 224.1395874 16281.8115234375 201 | 225.122879 9684.4462890625 202 | 229.1320038 8665.00390625 203 | 230.1137543 314596.1875 204 | 231.1168671 27428.09765625 205 | 241.1549072 7073.8876953125 206 | 242.1500092 110686.9375 207 | 243.1539612 7429.3100585938 208 | 254.1505127 11942.2900390625 209 | 259.1763916 8622.40234375 210 | 283.1408997 8339.05078125 211 | 283.1772461 9583.833984375 212 | 286.1399231 10952.8779296875 213 | 296.1983032 9854.3203125 214 | 297.1914673 11624.509765625 215 | 298.175415 16445.005859375 216 | 299.1714478 88843.1015625 217 | 300.175415 9889.9638671875 218 | 302.0975342 15737.2451171875 219 | 308.1587524 7444.9565429688 220 | 315.2029114 131341.859375 221 | 316.206604 14992.513671875 222 | 323.2087402 7304.1611328125 223 | 324.1923523 16106.66796875 224 | 325.1870728 31081.755859375 225 | 326.183075 86579.640625 226 | 327.1889038 7246.5234375 227 | 329.1815491 7393.8139648438 228 | 340.1617432 10642.8330078125 229 | 341.1447144 11160.146484375 230 | 341.2179871 15938.5361328125 231 | 343.2012939 83693.2890625 232 | 344.2012024 13296.505859375 233 | 358.1729126 18888.26171875 234 | 360.224823 15558.7197265625 235 | 367.2328186 7427.3828125 236 | 370.2089539 29496.71484375 237 | 382.2107239 7742.3227539063 238 | 397.2044067 7452.1977539063 239 | 398.2061462 9965.7998046875 240 | 401.1687622 13184.6484375 241 | 411.2567749 10920.513671875 242 | 412.2570801 18743.740234375 243 | 415.1811523 9001.271484375 244 | 425.2499695 10866.1943359375 245 | 426.2323303 12793.5400390625 246 | 427.1837463 7111.9663085938 247 | 436.2202759 12748.3720703125 248 | 440.2252502 42649.328125 249 | 443.2632751 10977.650390625 250 | 445.1200256 57425.515625 251 | 445.9109497 7194.9560546875 252 | 
453.2455444 42437.90234375 253 | 454.2271729 19238.376953125 254 | 457.2518005 33785.8515625 255 | 471.2563782 48694.91796875 256 | 472.262207 10564.0478515625 257 | 498.2193298 14104.1591796875 258 | 514.2525024 7893.763671875 259 | 516.2279053 12113.2705078125 260 | 539.3215332 9356.349609375 261 | 556.3502197 11642.4951171875 262 | 558.2995605 52882.86328125 263 | 559.3012085 14281.9990234375 264 | 566.3325806 17119.720703125 265 | 584.3406982 16211.431640625 266 | 597.2859497 19063.87109375 267 | 598.2902832 8193.8447265625 268 | 615.2963867 11352.3564453125 269 | 657.3684082 32348.044921875 270 | 698.336792 7151.2465820313 271 | 758.414978 53055.1796875 272 | 759.4221191 14875.1953125 273 | 818.3222656 13256.541015625 274 | 827.4797974 10391.4892578125 275 | 871.5007935 145674.875 276 | 872.5050049 58906.9921875 277 | 873.5021362 9140.0087890625 278 | 942.4938354 12114.091796875 279 | 969.5022583 10454.162109375 280 | 986.5270386 117214.0234375 281 | 987.5280151 52795.1796875 282 | 988.5264893 8640.2451171875 283 | 1084.534302 10026.9853515625 284 | 1101.552979 81097.75 285 | 1102.554688 52823.890625 286 | 1103.547485 8687.359375 287 | 1128.561157 11452.8505859375 288 | 1154.578247 10374.5087890625 289 | 1155.570435 8706.462890625 290 | 1172.592163 229647.375 291 | 1173.592407 125851.75 292 | 1174.594604 31559.689453125 293 | 1227.639038 14841.5625 294 | 1228.633301 7910.3549804688 295 | 1253.651978 13080.9677734375 296 | 1254.654053 7995.7016601563 297 | 1271.658203 261442.078125 298 | 1272.661621 145139.15625 299 | 1273.659424 35773.10546875 300 | 1340.716064 9193.5107421875 301 | 1384.742798 117004.96875 302 | 1385.744629 78891.1953125 303 | 1386.742676 24188.56640625 304 | 1469.788574 7112.1069335938 305 | 1494.796021 11901.818359375 306 | 1495.778198 39154.66796875 307 | 1496.779297 23381.884765625 308 | 1497.790405 7938.263671875 309 | 1512.80188 53807.27734375 310 | 1513.803589 48195.265625 311 | 1514.80127 16152.77734375 312 | END IONS 313 | BEGIN IONS 
314 | TITLE=9MM_FASP.16147.16147.2 315 | RTINSECONDS=6778.9345 316 | PEPMASS=810.894836425781 172010.068847699993 317 | CHARGE=2+ 318 | 101.0707703 1217.3948974609 319 | 102.0546494 1171.0164794922 320 | 106.9486847 583.6018066406 321 | 110.0708237 405.9848327637 322 | 111.0064316 301.827331543 323 | 111.4262161 295.3147277832 324 | 112.0869141 497.2803039551 325 | 113.0343399 1190.6411132813 326 | 116.0333099 338.2010192871 327 | 120.0800018 1300.7055664063 328 | 126.0643921 662.5599975586 329 | 126.1747437 415.7754211426 330 | 127.0862961 320.0195922852 331 | 129.0657196 2649.853515625 332 | 129.102005 2786.2429199219 333 | 133.5157471 317.4209289551 334 | 136.0754089 1103.1275634766 335 | 141.8504944 435.1179504395 336 | 143.0809021 958.8742675781 337 | 147.1132355 589.1596069336 338 | 149.8830109 375.5520324707 339 | 155.0826111 599.940246582 340 | 158.0915985 509.8851928711 341 | 169.0982056 368.4065246582 342 | 171.0762177 1581.8051757813 343 | 172.0719452 1258.7937011719 344 | 173.0920105 1573.6694335938 345 | 175.1190796 3150.8952636719 346 | 181.0608063 1131.0991210938 347 | 181.5852051 303.6040649414 348 | 181.5953674 315.2122497559 349 | 183.0770569 508.908416748 350 | 185.0924988 516.541809082 351 | 188.1027527 1764.9710693359 352 | 198.0875244 1088.6629638672 353 | 199.0714264 2894.9458007813 354 | 201.0868835 1389.2589111328 355 | 212.1040955 513.1270751953 356 | 215.1139832 532.7591552734 357 | 216.0977478 15811.8017578125 358 | 216.6444855 345.5026550293 359 | 217.0833435 602.8255615234 360 | 217.1008911 877.2660522461 361 | 220.3491211 316.3650817871 362 | 226.0816956 1178.1096191406 363 | 226.118576 611.6033325195 364 | 232.1404572 2523.6206054688 365 | 233.1648254 1126.3984375 366 | 242.1139069 897.2562866211 367 | 244.0923615 4751.0708007813 368 | 254.1131744 750.9210205078 369 | 258.1089783 1152.3553466797 370 | 272.1256409 552.3010864258 371 | 280.0917664 390.0112915039 372 | 282.1081238 466.878692627 373 | 294.1807861 980.2778320313 374 | 
297.1178589 826.6633911133 375 | 297.1839905 313.9739379883 376 | 298.1026611 1294.8156738281 377 | 299.9552612 316.6298217773 378 | 310.1011658 638.9633178711 379 | 311.1316223 479.184753418 380 | 315.129303 2370.4162597656 381 | 317.1424255 894.8659667969 382 | 325.1171875 753.084777832 383 | 327.1298828 1602.0756835938 384 | 328.1131592 3785.6862792969 385 | 328.1950378 747.895324707 386 | 330.846283 368.969543457 387 | 343.1257019 390.5220947266 388 | 343.1565247 385.1192321777 389 | 345.1400757 8732.6298828125 390 | 345.2227783 2003.0583496094 391 | 346.1449585 1156.5886230469 392 | 350.3141174 399.9468688965 393 | 357.1445007 509.2389526367 394 | 371.1553345 2326.8596191406 395 | 381.1363525 609.2717895508 396 | 398.1668091 969.7222290039 397 | 399.150116 2073.6362304688 398 | 405.2097778 697.6733398438 399 | 416.1773682 1853.4498291016 400 | 428.1824646 350.5044555664 401 | 444.2934875 1221.3486328125 402 | 444.7707214 764.6782836914 403 | 445.7382813 395.6271362305 404 | 445.9076843 754.2020263672 405 | 483.0623169 354.065032959 406 | 487.2117615 509.4663391113 407 | 536.208313 544.1396484375 408 | 545.3380737 1747.8859863281 409 | 546.3411865 562.4497680664 410 | 550.7709961 522.1591186523 411 | 616.3764038 1612.9791259766 412 | 617.3740845 382.4893188477 413 | 623.2409668 376.6261291504 414 | 703.4082031 2703.6765136719 415 | 704.4126587 830.151550293 416 | 804.4575806 3665.4167480469 417 | 805.4550171 1032.9884033203 418 | 905.5076904 1447.0906982422 419 | 918.5022583 585.7772216797 420 | 944.5067749 1127.22265625 421 | 962.5265503 9907.2578125 422 | 963.5253296 4811.33203125 423 | 964.5343018 996.1773681641 424 | 1077.550171 2585.9892578125 425 | 1078.551758 1037.4976806641 426 | 1090.558472 704.0826416016 427 | 1116.557373 1197.2247314453 428 | 1117.574219 465.5159301758 429 | 1134.574829 9586.7255859375 430 | 1135.577026 5311.3666992188 431 | 1136.590942 1415.9747314453 432 | 1161.610352 662.1987304688 433 | 1187.596802 1224.1402587891 434 | 
1205.609131 7781.3291015625 435 | 1206.610596 4323.6279296875 436 | 1207.613403 1575.6195068359 437 | 1276.649414 3899.642578125 438 | 1277.644653 2981.9809570313 439 | 1278.64978 850.1265258789 440 | 1387.694702 504.8869018555 441 | 1405.680786 840.313293457 442 | 1406.700195 1185.7927246094 443 | 1522.564087 399.8696899414 444 | END IONS 445 | -------------------------------------------------------------------------------- /include/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Jarryd Beck 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /scripts/merge_pin_output.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import argparse 4 | import random 5 | from matplotlib import pyplot as plt 6 | 7 | def parse_args(): 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("-t", "--target", 11 | help="target search results file (.pin) format", 12 | type=str,required=True) 13 | parser.add_argument("-d", "--decoy", 14 | help="decoy search results file (.pin) format", 15 | type=str,required=True) 16 | parser.add_argument("-o", "--output", 17 | help="output file (.pin) format", 18 | type=str, required=True) 19 | parser.add_argument("--score", 20 | help="discriminant score for target decoy competition", 21 | type=str, default="avg_bias_adjusted_similarity") 22 | parser.add_argument("--update_delta_scores", 23 | help="Update delta scores when target/decoy are 1st and 2nd ranked hit", 24 | action='store_true') 25 | parser.add_argument("--main_features", 26 | help="Track main features only. Otherwise all features will be tracked, which can reduce separation performance.", 27 | action='store_true') 28 | parser.add_argument("--drop_redundant_features", 29 | help="Track main features only. 
Otherwise all features will be tracked, which can reduce separation performance.", 30 | action='store_true') 31 | 32 | 33 | 34 | args = parser.parse_args() 35 | return args 36 | 37 | 38 | def update_delta_scores(df1, idx1, df2, idx2): 39 | if df1.at[idx1, "delta_avg"] > df1.at[idx1, "avg_bias_adjusted_similarity"] - df2.at[idx2, "avg_bias_adjusted_similarity"]: 40 | df1.at[idx1, "delta_similarity"] = df1.at[idx1, "similarity"] - df2.at[idx2, "similarity"] 41 | df1.at[idx1, "delta_dot"] = df1.at[idx1, "dot_product"] - df2.at[idx2, "dot_product"] 42 | df1.at[idx1, "delta_annotation_similarity"] = df1.at[idx1, "annotation_similarity"] - df2.at[idx2, "annotation_similarity"] 43 | df1.at[idx1, "delta_sim2"] = df1.at[idx1, "sim2"] - df2.at[idx2, "sim2"] 44 | df1.at[idx1, "delta_avg"] = df1.at[idx1, "avg_bias_adjusted_similarity"] - df2.at[idx2, "avg_bias_adjusted_similarity"] 45 | return 46 | 47 | def merge_files(args): 48 | 49 | print("+++ Merging target and decoy results (.pin format) +++") 50 | df = pd.read_csv(args.target, sep='\t', comment='#', low_memory=False) 51 | df_decoy = pd.read_csv(args.decoy, sep='\t', comment='#', low_memory=False) 52 | 53 | target_nans = df.isnull().any(axis=1).sum() 54 | decoy_nans = df_decoy.isnull().any(axis=1).sum() 55 | if target_nans > 0 or decoy_nans > 0: 56 | print(f"Waring: NaN values detected. 
Dropping {target_nans} target and {decoy_nans} decoy matches.") 57 | df.dropna(inplace=True) 58 | df_decoy.dropna(inplace=True) 59 | 60 | if not all(df["Label"].unique() == 1): 61 | print("Warning: Not all target labels match expected value of 1.") 62 | 63 | if not all(df_decoy["Label"].unique() == -1): 64 | print("Warning: Not all decoy labels match expected value of -1.") 65 | 66 | print(f"Detected {df.shape[0]} target and {df_decoy.shape[0]} decoy matches.") 67 | scans = df["ScanNr"].unique() 68 | 69 | for num in scans: 70 | decoy_match = df_decoy[df_decoy["ScanNr"] == num] 71 | if len(decoy_match) == 0: 72 | continue 73 | elif len(decoy_match) > 1: 74 | print("Error: multiple occurance of a ScanNr") 75 | exit(1) 76 | else: 77 | decoy_idx = decoy_match.index[0] 78 | decoy_match = decoy_match.iloc[0] 79 | 80 | target_match = df[df["ScanNr"] == num] 81 | target_idx = target_match.index[0] 82 | target_match = target_match.iloc[0] 83 | 84 | # Equal peptide -> Drop decoy 85 | if target_match["Peptide"].replace("L", "I") == decoy_match["Peptide"].replace("L", "I"): 86 | df_decoy.drop(decoy_idx, inplace=True) 87 | continue 88 | 89 | #print(target_idx, decoy_idx) 90 | # Compare score -> Keep higher scoring match 91 | if target_match[args.score] > decoy_match[args.score]: 92 | if args.update_delta_scores: 93 | update_delta_scores(df, target_idx, df_decoy, decoy_idx) 94 | df_decoy.drop(decoy_idx, inplace=True) 95 | else: 96 | if args.update_delta_scores: 97 | update_delta_scores(df_decoy, decoy_idx, df, target_idx) 98 | df.drop(target_idx, inplace=True) 99 | 100 | 101 | 102 | df = pd.concat([df, df_decoy], ignore_index=True) 103 | #df["sim2_half"] = df["sim2"] / 2.0 104 | #df["sim2_double"] = 2.0 * df["sim2"] 105 | 106 | #cols = ["PSMId", "Label", "ScanNr", "sim2", "sim2_half", "sim2_double", "Peptide", "Proteins"] 107 | #df = df[cols] 108 | 109 | if args.main_features: 110 | features = ["charge", "similarity", "bias", "delta_similarity", "sim2", "delta_sim2", 
"annotation_similarity", "annotation_bias", "annotation_sim2", "delta_annotation_similarity", "peak_count_ref", "avg_bias_adjusted_similarity", "delta_avg", "abs_mass_difference", "ppm_difference", "peptide_length", "precursor_mz"] 111 | col = ["PSMId", "Label", "ScanNr"] + features + ["Peptide", "Proteins"] 112 | df = df[col] 113 | if args.drop_redundant_features: 114 | df.drop(columns=["x_score", "x_score_dot"], inplace=True) 115 | #df.drop(columns=["fragment_standard_deviation", "fragment_weighted_standard_deviation"], inplace=True) 116 | #if args.experimental: 117 | # df["exp1"] = 118 | #df.drop(columns=["x_score", "x_score_dot"], inplace=True) 119 | #df.drop(columns=["fragment_standard_deviation", "fragment_weighted_standard_deviation"], inplace=True) 120 | df.to_csv(args.output, sep="\t", index=False) 121 | 122 | num_targets = sum(df["Label"] == 1) 123 | num_decoys = sum(df["Label"] == -1) 124 | 125 | 126 | print(f"Files merged successfully! {num_targets} targets and {num_decoys} decoys remaining after competition.") 127 | 128 | 129 | 130 | def main(): 131 | args = parse_args() 132 | merge_files(args) 133 | 134 | if __name__ == "__main__": 135 | main() 136 | 137 | 138 | # bug fix: Remove -inf values 139 | # sed -i 's/-inf/-9999/g' yeast_td.pin -------------------------------------------------------------------------------- /scripts/test.csv: -------------------------------------------------------------------------------- 1 | 2 | PSMId Label ScanNr charge similarity bias annotation_similarity annotation_bias avg_bias_adjusted_similarity dot_product delta_dot delta_similarity delta_annotation_similarity delta_sim2 delta_avg dot_contrast_angle similarity_contrast_angle annotation_contrast_angle mass_difference abs_mass_difference ppm_difference peptide_length precursor_mz peak_count_query peak_count_ref fragment_standard_deviation fragment_weighted_standard_deviation sim2 annotation_sim2 x_score x_score_dot x_lgamma x_lgamma_dot st_score st_score_dot Peptide 
Proteins 3 | 64 1 189 2 1.00149 0.220921 0.823551 0.220921 0.710924 0.873902 0.444778 0.342503 0.202489 0.304451 0.248828 0.676836 0.61602 -0.000854492 0.000854492 1.30494 11 654.814 1000 142 0.408765 0.421254 0.780238 0.641611 0.0 0.0 560.171 560.034 0.73769 0.727923 X.YGRPPDSHHSR.X Unknown 4 | 5 | 6 | charge similarity bias annotation_similarity annotation_bias avg_bias_adjusted_similarity dot_product delta_dot delta_similarity delta_annotation_similarity delta_sim2delta_avg dot_contrast_angle similarity_contrast_angle annotation_contrast_angle mass_difference abs_mass_difference ppm_difference peptide_length precursor_mz peak_count_query peak_count_ref fragment_standard_deviation fragment_weighted_standard_deviation sim2 annotation_sim2 x_score x_score_dot x_lgamma x_lgamma_dot st_score st_score_dot m0 7 | -0.0833 -0.2382 -0.1786 2.4159 -0.1786 -0.0354 1.3627 0.3950 -0.4690 -0.1501 0.1838 0.9446 0.0305 -1.3467 -0.4678 -0.0915 -0.0932 -0.4207 0.0715 0.1952 0.0000 1.7078 0.0120 0.0322 -0.7453 0.3660 0.1577 0.1706 0.0408 0.1718 0.3820 0.4089 -0.9830 8 | 9 | -------------------------------------------------------------------------------- /src/DefineConstants.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_DEFINECONSTANTS_H 2 | #define SIMPLE_EXAMPLE_DEFINECONSTANTS_H 3 | 4 | #define BIN_MIN_MZ 0 // spectrast uses 10 // why? 
5 | #define BIN_MAX_MZ 2000 6 | 7 | #define STANDARD_PARENT_UPPER_MZ 1500 8 | #define STANDARD_PARENT_LOWER_MZ 400 9 | 10 | //#define FLOAT_OUTPUT_PRECISION std::numeric_limits::max_digits10 11 | 12 | 13 | #endif //SIMPLE_EXAMPLE_DEFINECONSTANTS_H 14 | -------------------------------------------------------------------------------- /src/build_index.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "indexing_manager.h" 4 | 5 | using namespace std; 6 | 7 | 8 | cxxopts::ParseResult parseArgs(int argc, const char* argv[], std::vector &input_directories, const std::shared_ptr& config) { 9 | try { 10 | for (int i = 0; i < argc; ++i) { 11 | config->build_command += argv[i]; 12 | config->build_command += " "; 13 | } 14 | config->build_command.pop_back(); 15 | 16 | cxxopts::Options options("mistle-build", "Build mistle's fragment ion index for spectral matching"); 17 | 18 | options.positional_help("[optional args]").show_positional_help(); 19 | 20 | options.add_options() 21 | ("h, help", "Print this help message") 22 | ("i,input", "list of input files or directories containing mass spectra (.msp format)", cxxopts::value(), "PATH") 23 | ("o,output", "output directory where indices will be generated", cxxopts::value(), "PATH") 24 | ("n,num_indices", "number of buckets the fragment ion index will be split in", cxxopts::value()->default_value("64"), "NUM") 25 | ("min_pep_length", "Minimum peptide length for the reference spectrum to be loaded into the index", cxxopts::value()->default_value("7"), "NUM") 26 | ("label", "Give the library a label (1: target; -1: decoy)", cxxopts::value()->default_value("1"), "NUM") 27 | ("t,threads", "number of threads (experimental)\n - 1 thread for reading, other threads for processing. 
Has increased RAM costs (try using more threads or GLIBC_TUNABLES=glibc.malloc.tcache_count=0 for compensation)", cxxopts::value()->default_value("1"), "NUM"); 28 | 29 | options.parse_positional({"input", "output"}); 30 | 31 | auto result = options.parse(argc,argv); 32 | 33 | 34 | if (result.count("help")) 35 | { 36 | std::cout << options.help() << std::endl; 37 | exit(0); 38 | } 39 | if (result.count("threads")) { 40 | config->num_build_threads = result["threads"].as(); 41 | } 42 | if (result.count("num_indices")) { 43 | config->num_indices = result["num_indices"].as(); 44 | } 45 | if (result.count("input")) { 46 | // Parse list of input directories (separated by black space) 47 | std::string dir_list = result["input"].as(); 48 | std::string::size_type start_pos = 0; 49 | for (auto end_pos = 0; (end_pos = dir_list.find(' ', end_pos)) != std::string::npos; ++end_pos) 50 | { 51 | input_directories.push_back(dir_list.substr(start_pos, end_pos - start_pos)); 52 | start_pos = end_pos + 1; 53 | } 54 | 55 | input_directories.push_back(dir_list.substr(start_pos)); 56 | } else { 57 | std::cout << "Argument Error: Missing input directory." << std::endl; 58 | exit(1); 59 | } 60 | if (result.count("output")) { 61 | config->idx_path = result["output"].as(); 62 | } else { 63 | std::cout << "Argument Error: Missing output directory." 
<< std::endl; 64 | exit(1); 65 | } 66 | config->minimum_peptide_length = result["min_pep_length"].as(); 67 | config->label = result["label"].as(); 68 | 69 | 70 | return result; 71 | 72 | } 73 | catch (const cxxopts::OptionException& e) { 74 | std::cout << "Error parsing options: " << e.what() << std::endl; 75 | exit(1); 76 | } 77 | } 78 | 79 | 80 | int main(int argc, const char* argv[]) { 81 | cout << "+++ Mistle Build +++" << endl; 82 | 83 | /* 84 | * Args 85 | */ 86 | std::vector input_directories; 87 | std::shared_ptr config = std::make_shared(); 88 | 89 | parseArgs(argc, argv, input_directories, config); 90 | 91 | /* 92 | * Build indices 93 | */ 94 | 95 | 96 | auto start = chrono::high_resolution_clock::now(); 97 | indexing_manager im(input_directories, config); 98 | im.build_indices(); 99 | auto stop = chrono::high_resolution_clock::now(); 100 | auto duration = duration_cast(stop - start); 101 | cout << "Total time elapsed: " << duration.count() << " seconds" << endl; 102 | 103 | return 0; 104 | 105 | } -------------------------------------------------------------------------------- /src/configuration.cpp: -------------------------------------------------------------------------------- 1 | #include "configuration.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | unsigned int configuration::assign_to_index(float mz) { 10 | for (int i = 0; i < (num_indices - 1); ++i) { 11 | if (mz < sub_idx_limits[i]) { 12 | return i; 13 | } 14 | } 15 | return num_indices - 1; 16 | } 17 | 18 | bool configuration::save_configuration_to_file(const std::string& config_file_path) { 19 | 20 | std::ofstream f(config_file_path, std::ios::out); 21 | std::string delimiter = ";"; 22 | 23 | f << "Num indices: " << num_indices << "\n"; 24 | f << "Index limits: "; 25 | for (unsigned int lim : sub_idx_limits) { 26 | f << lim << delimiter; 27 | } 28 | f << "\n"; 29 | f << "Label: " << label << "\n"; 30 | f << "Min peptide length: " << minimum_peptide_length << 
"\n"; 31 | f << "Build command: " << build_command << "\n"; 32 | f.close(); 33 | return true; 34 | } 35 | 36 | bool configuration::load_configuration_from_file(const std::string& config_file_path) { 37 | 38 | std::ifstream f(config_file_path, std::ios::in); 39 | std::string delimiter = ";"; 40 | 41 | //First line 42 | std::string line; 43 | getline(f, line); 44 | 45 | if(line.rfind("Num indices: ", 0) == 0) { 46 | num_indices = std::stoi(line.substr(13, std::string::npos)); 47 | } else { 48 | std::cerr << "Wrong config format" << std::endl; 49 | return false; 50 | } 51 | 52 | //Second line 53 | getline(f, line); 54 | if(line.rfind("Index limits: ", 0) == 0) { 55 | std::stringstream ss(line.substr(14, std::string::npos)); 56 | std::string str; 57 | while(getline(ss, str, ';')) { 58 | sub_idx_limits.push_back(std::stoi(str)); 59 | } 60 | if (sub_idx_limits.size() != num_indices - 1) { 61 | std::cerr << "Num sub idx not matching" << std::endl; 62 | return false; 63 | } 64 | } else { 65 | std::cerr << "Wrong config format" << std::endl; 66 | return false; 67 | } 68 | 69 | //Third line (Label) 70 | getline(f, line); 71 | if(line.rfind("Label: ", 0) == 0) { 72 | label = std::stoi(line.substr(7, std::string::npos)); 73 | } else { 74 | label = 1; 75 | } 76 | 77 | f.close(); 78 | 79 | idx_path = config_file_path.substr(0,config_file_path.rfind('/') + 1); 80 | precursor_index_path = idx_path + "precursor_idx.bin"; 81 | for (int i = 0; i < num_indices; ++i) { 82 | sub_idx_file_names.push_back(idx_path + "frag_idx_" + std::to_string(i) + ".bin"); 83 | } 84 | 85 | 86 | 87 | return true; 88 | } 89 | -------------------------------------------------------------------------------- /src/configuration.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_CONFIGURATION_H 2 | #define SIMPLE_EXAMPLE_CONFIGURATION_H 3 | 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * Config class 10 | * 11 | * Handles all (meta-)information about 
index set-up and configuration 12 | * Can be set by arguments or loaded from file. 13 | */ 14 | 15 | class configuration { 16 | public: 17 | 18 | std::string idx_path = ""; 19 | std::string precursor_index_path; 20 | unsigned int num_indices = 24; 21 | int label = 1; 22 | 23 | 24 | unsigned int sub_idx_range; 25 | std::vector sub_idx_limits; 26 | std::vector sub_idx_file_names; 27 | unsigned int minimum_peptide_length; 28 | std::string build_command; 29 | 30 | //TODO parse more info and move to file_writer/reader 31 | bool save_configuration_to_file(const std::string& config_file_path); 32 | bool load_configuration_from_file(const std::string& config_file_path); 33 | 34 | 35 | unsigned int assign_to_index(float mz); 36 | 37 | 38 | /* 39 | * Build only 40 | */ 41 | 42 | int num_build_threads = 1; 43 | 44 | }; 45 | 46 | 47 | #endif //SIMPLE_EXAMPLE_CONFIGURATION_H 48 | -------------------------------------------------------------------------------- /src/fragment_ion_index.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "fragment_ion_index.h" 8 | #include "DefineConstants.h" 9 | #include "settings.h" 10 | 11 | using namespace std; 12 | 13 | 14 | fragment_ion_index::fragment_ion_index() { 15 | 16 | } 17 | 18 | 19 | fragment_ion_index::fragment_ion_index(precursor_index *parent_index) { 20 | 21 | /* 22 | * todo 23 | 24 | fragment_bins = vector(BIN_MAX_MZ + 1); //TODO remove/determine actual max #bins 25 | for (int i = 0; i < parent_index->get_size(); ++i) { 26 | spectrum *c_spectrum = parent_index->get_spectrum(i); 27 | 28 | 29 | //Iterate all peaks and save them as fragments in the corresponding ion mz bin 30 | for (int j = 0; j < c_spectrum->binned_peaks.size(); ++j) { 31 | int bin = c_spectrum->binned_peaks[j]; 32 | fragment frag(c_spectrum->id, c_spectrum->binned_intensities[j]); 33 | fragment_bins[bin].push_back(frag); 34 | } 35 | } 36 | 
*/ 37 | } 38 | 39 | 40 | 41 | 42 | 43 | fragment_ion_index::fragment_ion_index(string path) : file_path(path) { 44 | 45 | //load_index_from_file(file_path); 46 | load_index_from_binary_file(file_path); 47 | prepare_axv_access(); 48 | 49 | } 50 | 51 | bool fragment_ion_index::sort_index(std::unique_ptr& parent_index) { 52 | 53 | /* 54 | * Sort all bins according to parent rankings 55 | */ 56 | 57 | for (fragment_bin &bin : fragment_bins) { 58 | sort(bin.begin(), bin.end(), [&](fragment a, fragment b){ 59 | return parent_index->get_rank(a.parent_id) < parent_index->get_rank(b.parent_id); 60 | }); 61 | } 62 | 63 | return true; 64 | } 65 | 66 | bool fragment_ion_index::load_index_from_file(const std::string& path) { 67 | 68 | /* 69 | * Read index from file 70 | */ 71 | 72 | ifstream f(path, ios::in); 73 | string delimiter = ";"; 74 | 75 | fragment_bins.clear(); 76 | fragment_bins.resize(BIN_MAX_MZ + 1); 77 | 78 | string line; 79 | while (getline(f,line)) { 80 | size_t delim_pos = line.find(delimiter); 81 | size_t delim_right_pos = line.rfind(delimiter); 82 | 83 | unsigned int id = stoi(line.substr(0, delim_pos)); 84 | int mz_bin = stoi(line.substr(delim_pos + 1, delim_right_pos - delim_pos - 1)); 85 | float intensity = stof(line.substr(delim_right_pos + 1, string::npos)); 86 | 87 | fragment_bins[mz_bin].emplace_back(fragment(id, intensity)); 88 | 89 | } 90 | 91 | 92 | f.close(); 93 | return true; 94 | } 95 | 96 | bool fragment_ion_index::save_index_to_file(const string &path) { 97 | 98 | /* 99 | * TODO: OBSOLETE using binary file reader 100 | */ 101 | 102 | ofstream f(path, ios::out); 103 | //f.precision(FLOAT_OUTPUT_PRECISION); 104 | 105 | string delimiter = ";"; 106 | 107 | for (int i = 0; i < fragment_bins.size(); ++i) { 108 | 109 | fragment_bin bin = fragment_bins[i]; 110 | for (auto & j : bin) { 111 | f << j.parent_id << delimiter << i << delimiter << j.intensity << "\n"; 112 | } 113 | } 114 | 115 | f.close(); 116 | return true; 117 | } 118 | 119 | bool 
fragment_ion_index::load_index_from_binary_file(const string &path) { 120 | 121 | /* 122 | * Read index from binary file 123 | */ 124 | 125 | ifstream f(path, ios::binary | ios::in); 126 | 127 | fragment_bins.clear(); 128 | fragment_bins.resize(int((BIN_MAX_MZ - BIN_MIN_MZ) / settings::bin_size) + 1); 129 | 130 | while (!f.eof()) { //TODO might not actually end the loop correctly 131 | unsigned int id; 132 | float mz; 133 | float intensity; 134 | 135 | f.read((char *) &id, sizeof(unsigned int)); 136 | if (f.eof()) { //Double check 137 | 138 | break; 139 | } 140 | f.read((char *) &mz, sizeof(float)); 141 | f.read((char *) &intensity, sizeof(float)); 142 | 143 | if (settings::turn_off_fragment_intensities) { 144 | intensity = 1.f; 145 | } 146 | 147 | if (mz > BIN_MAX_MZ || mz < BIN_MIN_MZ) { 148 | continue; 149 | } 150 | int mz_bin = spectrum::get_mz_bin(mz); 151 | 152 | //if (BIN_MIN_MZ > 1) 153 | // std::cerr << "NEIJ: " << BIN_MIN_MZ << std::endl; 154 | 155 | // Same parent peaks falling into the same bin 156 | if (!fragment_bins[mz_bin].empty() && fragment_bins[mz_bin].back().parent_id == id) { 157 | fragment &frag = fragment_bins[mz_bin].back(); 158 | if (abs(frag.mz - mz) > settings::bin_size){ 159 | std::cerr << "NONO " << frag.mz << " " << mz << std::endl; 160 | exit(1); 161 | } 162 | //Track peak composition in fragment 163 | if (frag.peak_composition.empty()) { 164 | frag.peak_composition.emplace_back(frag.mz, frag.intensity); 165 | } 166 | frag.peak_composition.emplace_back(mz, intensity); 167 | 168 | if (frag.peak_composition.size() > 100) { 169 | std::cerr << "NOT LIKE THIS :( " << std::endl; 170 | } 171 | //Update overall intensity 172 | frag.intensity = sqrt(frag.intensity * frag.intensity + intensity * intensity); 173 | 174 | } 175 | else { 176 | fragment_bins[mz_bin].emplace_back(fragment(id, intensity, mz)); 177 | } 178 | 179 | } 180 | 181 | 182 | f.close(); 183 | return true; 184 | } 185 | 186 | bool 
fragment_ion_index::save_index_to_binary_file(const string &path) { 187 | 188 | ofstream f(path, ios::binary | ios::out); 189 | 190 | 191 | for (auto &bin : fragment_bins) { 192 | 193 | for (auto & j : bin) { 194 | f.write((char *) &j.parent_id, sizeof(unsigned int)); //TODO 195 | f.write((char *) &j.mz, sizeof(float)); 196 | f.write((char *) &j.intensity, sizeof(float)); 197 | } 198 | } 199 | 200 | f.close(); 201 | return true; 202 | } 203 | 204 | #if USE_AVX_512 205 | bool fragment_ion_index::prepare_axv_access() { 206 | frag_bins.clear(); 207 | frag_bins.resize(fragment_bins.size()); 208 | for (int i = 0; i < fragment_bins.size(); ++i) { 209 | __m512 intensity_x16; 210 | __m512i identity_x16; 211 | int ranks[16]; 212 | for (int j = 0; j < fragment_bins[i].size(); ++j) { 213 | frag_bins[i].intensities.push_back(fragment_bins[i][j].intensity); 214 | frag_bins[i].parent_ids.push_back(fragment_bins[i][j].parent_id); 215 | if(j % 16 == 0 && j > 0) { 216 | intensity_x16 = _mm512_loadu_ps(&frag_bins[i].intensities[j-16]); 217 | identity_x16 = _mm512_loadu_si512((__m256i*)&frag_bins[i].parent_ids[j-16]); 218 | frag_bins[i]._intensities.push_back(intensity_x16); 219 | frag_bins[i]._parent_ids.push_back(identity_x16); 220 | //frag_bins[i]._parent_ranks.push_back(_mm256_load_si256((__m256i*)& ranks)); 221 | } 222 | //ranks[j % 16] = (int) precursor_idx->get_rank(fragment_bins[i][j].parent_id); 223 | } 224 | assert(reinterpret_cast(frag_bins[i]._intensities.data()) % alignof(__m512) == 0); 225 | assert(reinterpret_cast(frag_bins[i]._parent_ids.data()) % alignof(__m512i) == 0); 226 | //assert(reinterpret_cast(frag_bins[i]._parent_ranks.data()) % alignof(__m256i) == 0); 227 | } 228 | return true; 229 | } 230 | #elif USE_AVX_2 231 | bool fragment_ion_index::prepare_axv_access() { 232 | frag_bins.clear(); 233 | frag_bins.resize(fragment_bins.size()); 234 | for (int i = 0; i < fragment_bins.size(); ++i) { 235 | __m256 intensity_x8; 236 | __m256i identity_x8; 237 | int ranks[8]; 
238 | for (int j = 0; j < fragment_bins[i].size(); ++j) { 239 | frag_bins[i].intensities.push_back(fragment_bins[i][j].intensity); 240 | frag_bins[i].parent_ids.push_back(fragment_bins[i][j].parent_id); 241 | if(j % 8 == 0 && j > 0) { 242 | intensity_x8 = _mm256_loadu_ps(&frag_bins[i].intensities[j-8]); 243 | //identity_x8 = _mm256_loadu_si256((__m256i*)&frag_bins[i].parent_ids[j-8]); 244 | frag_bins[i]._intensities.push_back(intensity_x8); 245 | //frag_bins[i]._parent_ids.push_back(identity_x8); 246 | //frag_bins[i]._parent_ranks.push_back(_mm256_load_si256((__m256i*)& ranks)); 247 | } 248 | //ranks[j % 8] = (int) precursor_idx->get_rank(fragment_bins[i][j].parent_id); 249 | } 250 | assert(reinterpret_cast(frag_bins[i]._intensities.data()) % alignof(__m256) == 0); 251 | //assert(reinterpret_cast(frag_bins[i]._parent_ids.data()) % alignof(__m256i) == 0); 252 | //assert(reinterpret_cast(frag_bins[i]._parent_ranks.data()) % alignof(__m256i) == 0); 253 | } 254 | return true; 255 | } 256 | 257 | #else 258 | bool fragment_ion_index::prepare_axv_access() { 259 | return false; 260 | } 261 | #endif 262 | 263 | 264 | bool fragment_ion_index::load_preliminary_index_from_binary_file(const string &path) { 265 | /* 266 | * Read index from binary file 267 | */ 268 | 269 | ifstream f(path, ios::binary | ios::in); 270 | 271 | fragment_bins.clear(); 272 | fragment_bins.resize(1); 273 | 274 | while (!f.eof()) { //TODO might not actually end the loop correctly 275 | unsigned int id; 276 | float mz; 277 | float intensity; 278 | 279 | f.read((char *) &id, sizeof(unsigned int)); 280 | f.read((char *) &mz, sizeof(float)); 281 | f.read((char *) &intensity, sizeof(float)); 282 | 283 | fragment_bins[0].emplace_back(fragment(id, intensity, mz)); 284 | 285 | } 286 | 287 | 288 | f.close(); 289 | return true; 290 | } -------------------------------------------------------------------------------- /src/fragment_ion_index.h: 
-------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_FRAGMENT_ION_INDEX_H 2 | #define SIMPLE_EXAMPLE_FRAGMENT_ION_INDEX_H 3 | #include 4 | #include 5 | #include "precursor_index.h" 6 | 7 | 8 | struct fragment { 9 | unsigned int parent_id; 10 | float intensity; 11 | float mz; 12 | 13 | // If multiple peaks are contributing to the fragment - keep track of composition 14 | std::vector> peak_composition = {}; // 15 | 16 | fragment(unsigned int parent_id, float intensity) : parent_id(parent_id), intensity(intensity) {}; 17 | fragment(unsigned int parent_id, float intensity, float mz) : parent_id(parent_id), intensity(intensity), mz(mz) {}; 18 | }; 19 | 20 | 21 | typedef std::vector fragment_bin; 22 | 23 | 24 | 25 | struct __attribute__ ((aligned (32))) fragment_binn { 26 | __attribute__ ((aligned (32))) std::vector intensities; 27 | __attribute__ ((aligned (32))) std::vector parent_ids; 28 | #if USE_AVX_512 29 | __attribute__ ((aligned (32))) std::vector<__m512> _intensities; 30 | __attribute__ ((aligned (32))) std::vector<__m512i> _parent_ids; 31 | 32 | #elif USE_AVX_2 33 | __attribute__ ((aligned (32))) std::vector<__m256> _intensities; 34 | __attribute__ ((aligned (32))) std::vector<__m256i> _parent_ids; 35 | __attribute__ ((aligned (32))) std::vector<__m256i> _parent_ranks; 36 | #endif 37 | }; 38 | 39 | class fragment_ion_index { 40 | public: 41 | 42 | std::shared_ptr precursor_idx; 43 | std::string file_path; 44 | std::vector fragment_bins; 45 | __attribute__ ((aligned (32))) std::vector frag_bins; 46 | 47 | 48 | fragment_ion_index(); 49 | explicit fragment_ion_index(precursor_index *parent_index); 50 | explicit fragment_ion_index(std::string path); 51 | 52 | bool sort_index(std::unique_ptr& parent_index); 53 | 54 | 55 | bool prepare_axv_access(); 56 | bool load_index_from_file(const std::string& path); 57 | bool load_index_from_binary_file(const std::string& path); 58 | bool 
load_preliminary_index_from_binary_file(const std::string& path); 59 | bool save_index_to_file(const std::string& path); 60 | bool save_index_to_binary_file(const std::string& path); 61 | 62 | }; 63 | 64 | 65 | #endif //SIMPLE_EXAMPLE_FRAGMENT_ION_INDEX_H 66 | -------------------------------------------------------------------------------- /src/index_file_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "index_file_reader.h" 2 | #include 3 | #include 4 | 5 | 6 | bool index_file_reader::read_file_into_precursor_index(const std::string &file_path, 7 | const std::shared_ptr& precursor_idx) { 8 | 9 | 10 | std::ifstream f(file_path, std::ios::in); 11 | std::string delimiter = ";"; 12 | std::string line; 13 | 14 | if (!getline(f, line)) { 15 | return false; 16 | } 17 | 18 | if (line.rfind("Num: ", 0) != 0) { 19 | std::cerr << "Incorrect file format" << std::endl; 20 | return false; 21 | } 22 | 23 | //Read header 24 | unsigned int size = std::stoi(line.substr(5, std::string::npos)); //TODO check 4 or 5 25 | precursor_idx->set_size(size); 26 | 27 | // Parse precursors line by line 28 | //precursor_idx->add_precursor_record(p); 29 | while (getline(f, line)) { 30 | 31 | size_t delim_pos = line.find(delimiter); 32 | unsigned int id = std::stoi(line.substr(0, delim_pos)); 33 | 34 | size_t length = line.find(delimiter, delim_pos + 1) - delim_pos; 35 | unsigned int rank = std::stoi(line.substr(delim_pos + 1, length - 1)); 36 | 37 | delim_pos = delim_pos + length; 38 | length = line.find(delimiter, delim_pos + 1) - delim_pos; 39 | float mz = std::stof(line.substr(delim_pos + 1, length - 1)); 40 | 41 | delim_pos = delim_pos + length; 42 | length = line.find(delimiter, delim_pos + 1) - delim_pos; 43 | int charge = std::stoi(line.substr(delim_pos + 1, length - 1)); 44 | std::string peptide = line.substr(delim_pos + length + 1, std::string::npos); 45 | 46 | precursor_idx->add_precursor_record(precursor(id, rank, mz, charge, 
peptide)); 47 | } 48 | if (precursor_idx->get_size() != size) { 49 | std::cerr << "Wrong number of precursors" << std::endl; 50 | } 51 | 52 | return true; 53 | } 54 | -------------------------------------------------------------------------------- /src/index_file_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_INDEX_FILE_READER_H 2 | #define SIMPLE_EXAMPLE_INDEX_FILE_READER_H 3 | 4 | 5 | #include 6 | #include "precursor_index.h" 7 | 8 | class index_file_reader { 9 | public: 10 | static bool read_file_into_precursor_index(const std::string &file_path, const std::shared_ptr& precursor_idx); 11 | }; 12 | 13 | 14 | #endif //SIMPLE_EXAMPLE_INDEX_FILE_READER_H 15 | -------------------------------------------------------------------------------- /src/index_file_writer.cpp: -------------------------------------------------------------------------------- 1 | #include "index_file_writer.h" 2 | #include "DefineConstants.h" 3 | #include 4 | #include 5 | 6 | 7 | bool index_file_writer::stream_peaks_to_file(std::fstream &f, unsigned int parent_id, const std::shared_ptr& spec) { 8 | 9 | std::string delimiter = ";"; 10 | for (int i = 0; i < spec->binned_peaks.size(); ++i) { 11 | int bin = spec->binned_peaks[i]; 12 | float intensity = spec->binned_intensities[i]; 13 | 14 | f << parent_id << delimiter << bin << delimiter << intensity << "\n"; 15 | 16 | } 17 | 18 | 19 | return true; 20 | } 21 | 22 | bool index_file_writer::save_precursor_index(const std::string& file_path, std::vector &precursors) { 23 | 24 | std::ofstream f(file_path, std::ofstream::out); 25 | std::string delimiter = ";"; 26 | //f.precision(FLOAT_OUTPUT_PRECISION); 27 | 28 | //Have num precursors as header (needed for efficient parsing) 29 | f << "Num: " << precursors.size() << "\n"; 30 | 31 | /* 32 | * ENCODING: ID;RANK;MZ;CHARGE;PEPTIDE 33 | */ 34 | for (precursor &p : precursors) { 35 | f << p.id << delimiter << p.rank << delimiter << p.mz << 
delimiter << p.charge << delimiter << p.peptide << "\n"; 36 | } 37 | 38 | f.close(); 39 | 40 | return true; 41 | } 42 | 43 | bool index_file_writer::save_matches_to_file(const std::string &file_path, std::vector &matches) { 44 | std::fstream outfile; 45 | std::string delimiter = ";"; 46 | 47 | 48 | outfile.open(file_path, std::ios::out); 49 | if (!outfile.good()) 50 | return false; 51 | 52 | // Add header 53 | outfile << "spectrum"+delimiter+"match"+delimiter+"peptide"+delimiter+"dot-product"+delimiter+"mass-difference\n"; 54 | 55 | // Go through matches and parse relevant information for each 56 | for (int i = 0; i < matches.size(); ++i) { 57 | match psm = matches[i]; 58 | //TODO 59 | outfile << psm.query_spectrum->name << delimiter << psm.matched_spectrum->name << delimiter << psm.matched_spectrum->peptide << delimiter << psm.dot_product << delimiter << psm.mass_difference << "\n"; 60 | } 61 | 62 | outfile.close(); 63 | return true; 64 | } 65 | 66 | bool index_file_writer::stream_peaks_to_binary_file(std::fstream &f, unsigned int parent_id, 67 | const std::shared_ptr &spec) { 68 | for (int i = 0; i < spec->intensities.size(); ++i) { 69 | float mz = spec->peak_positions[i]; 70 | float intensity = spec->intensities[i]; 71 | 72 | f.write((char *) &parent_id, sizeof(unsigned int)); 73 | f.write((char *) &mz, sizeof(float)); 74 | f.write((char *) &intensity, sizeof(float)); 75 | 76 | } 77 | 78 | 79 | return true; 80 | } 81 | 82 | bool index_file_writer::save_precursor_index_to_binary_file(const std::string &file_path, 83 | std::vector &precursors) { 84 | 85 | std::fstream f(file_path, std::ios::binary | std::ofstream::out); 86 | 87 | 88 | //Save #of precursors as first element (needed for efficient parsing) 89 | unsigned int size = precursors.size(); 90 | f.write((char *) &size, sizeof(unsigned int)); 91 | 92 | /* 93 | * ENCODING: ID;RANK;MZ;CHARGE;SIZE_OF_PEPTIDE;PEPTIDE (No semi-colons in binary) 94 | */ 95 | for (precursor &p : precursors) { 96 | f.write((char *) 
&p.id, sizeof(unsigned int)); 97 | f.write((char *) &p.rank, sizeof(unsigned int)); 98 | f.write((char *) &p.mz, sizeof(float)); 99 | f.write((char *) &p.charge, sizeof(int)); 100 | size_t pep_size = p.peptide.size(); 101 | f.write((char *) &pep_size, sizeof(pep_size)); 102 | f.write(p.peptide.c_str(), pep_size); 103 | } 104 | 105 | f.close(); 106 | 107 | return true; 108 | } 109 | -------------------------------------------------------------------------------- /src/index_file_writer.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_INDEX_FILE_WRITER_H 2 | #define SIMPLE_EXAMPLE_INDEX_FILE_WRITER_H 3 | 4 | 5 | #include 6 | #include "spectrum.h" 7 | #include "precursor_index.h" 8 | #include "match.h" 9 | 10 | class index_file_writer { 11 | public: 12 | 13 | //TODO static std::string delimiter; 14 | 15 | static bool stream_peaks_to_file(std::fstream &f, unsigned int parent_id, const std::shared_ptr& spec); 16 | static bool stream_peaks_to_binary_file(std::fstream &f, unsigned int parent_id, const std::shared_ptr& spec); 17 | static bool save_precursor_index(const std::string& file_path, std::vector &precursors); 18 | static bool save_precursor_index_to_binary_file(const std::string& file_path, std::vector &precursors); 19 | static bool save_matches_to_file(const std::string& file_path, std::vector &matches); 20 | }; 21 | 22 | 23 | #endif //SIMPLE_EXAMPLE_INDEX_FILE_WRITER_H 24 | -------------------------------------------------------------------------------- /src/indexing_manager.cpp: -------------------------------------------------------------------------------- 1 | #include "indexing_manager.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "msp_reader.h" 7 | #include "mgf_reader.h" 8 | #include "index_file_writer.h" 9 | #include "DefineConstants.h" 10 | #include "fragment_ion_index.h" 11 | 12 | using namespace std; 13 | 14 | indexing_manager::indexing_manager() { 15 | cout << "Empty 
Constructor not in use" << endl; 16 | exit(1); 17 | } 18 | 19 | indexing_manager::indexing_manager(string path) { 20 | cout << "! Using IndexingManager without config parameter is deprecated" << endl; 21 | exit(1); 22 | /* 23 | * Init 24 | * TODO delete (eventually) 25 | */ 26 | 27 | precursorIndex = make_unique(); 28 | 29 | config->sub_idx_range = (STANDARD_PARENT_UPPER_MZ - STANDARD_PARENT_LOWER_MZ) / config->num_indices; 30 | for (int i = 1; i < config->num_indices; ++i) { //Starting from 1 31 | cout << "LIMIT: " << STANDARD_PARENT_LOWER_MZ + config->sub_idx_range * i << endl; 32 | config->sub_idx_limits.push_back(STANDARD_PARENT_LOWER_MZ + config->sub_idx_range * i); 33 | } 34 | 35 | 36 | for (const auto & entry : std::filesystem::directory_iterator(path)) { 37 | if (entry.path().extension() == ".msp") { 38 | lib_files.push_back(entry); 39 | } 40 | } 41 | } 42 | 43 | 44 | indexing_manager::indexing_manager(std::vector &input_paths, std::shared_ptr config) : input_paths(input_paths), config(config) { 45 | 46 | /* 47 | * Init 48 | */ 49 | 50 | precursorIndex = make_unique(); 51 | pool = std::make_shared(config->num_build_threads - 1); 52 | 53 | config->sub_idx_range = (STANDARD_PARENT_UPPER_MZ - STANDARD_PARENT_LOWER_MZ) / config->num_indices; 54 | for (int i = 1; i < config->num_indices; ++i) { //Starting from 1 55 | //cout << "LIMIT: " << STANDARD_PARENT_LOWER_MZ + config->sub_idx_range * i << endl; 56 | config->sub_idx_limits.push_back(STANDARD_PARENT_LOWER_MZ + config->sub_idx_range * i); 57 | } 58 | 59 | 60 | for (std::string &path : input_paths) { 61 | 62 | if (!std::filesystem::exists(path)) { 63 | std::cerr << "Bad file" << std::endl; 64 | std::cerr << path << " is broken or does not exist!" 
<< std::endl; 65 | exit(1); 66 | } 67 | 68 | if (std::filesystem::is_directory(path)) { 69 | for (const auto & entry : std::filesystem::directory_iterator(path)) { 70 | if (entry.path().extension() == ".msp") { 71 | lib_files.push_back(entry.path()); 72 | file_format = MSP; 73 | } else if (entry.path().extension() == ".mgf") { 74 | lib_files.push_back(entry.path()); 75 | file_format = MGF; 76 | } 77 | } 78 | } else { 79 | std::filesystem::path file_path = path; 80 | if (file_path.extension() == ".msp") { 81 | file_format = MSP; 82 | lib_files.push_back(file_path); 83 | } else if (file_path.extension() == ".mgf") { 84 | file_format = MGF; 85 | lib_files.push_back(file_path); 86 | } else { 87 | std::cerr << "Unsupported file extension" << std::endl; 88 | std::cerr << file_path << " file is not supported" << std::endl; 89 | exit(1); 90 | } 91 | } 92 | 93 | } 94 | if (!std::filesystem::exists(config->idx_path) || !std::filesystem::is_directory(config->idx_path)) { 95 | std:cerr << "Bad output directory" << std::endl; 96 | std::cerr << config->idx_path << " is not a directory or does not exist!" << std::endl; 97 | exit(1); 98 | } 99 | if (!config->idx_path.ends_with('/')) { 100 | config->idx_path += "/"; 101 | } 102 | 103 | } 104 | 105 | 106 | bool indexing_manager::build_indices() { 107 | 108 | /* 109 | * Prepare in/output 110 | */ 111 | set_up_output_streams(); 112 | 113 | 114 | /* 115 | * Parsing files and creating preliminary indices 116 | */ 117 | 118 | cout << "Parsing "<< lib_files.size() <<" library files ..." << endl; 119 | auto start = chrono::high_resolution_clock::now(); 120 | for (int i = 0; i < lib_files.size(); ++i) { 121 | //cout << "Parsing library file no. 
" << i << " (" << lib_files[i].path().filename() << ")" << endl; 122 | parse_file(i); //TODO has multi-threading (experimental) 123 | } 124 | if (pool->get_size() > 0) { 125 | std::cout << "Waiting for threads to finish processing" << std::endl; 126 | pool->add_thread(); //Have "main" thread help out with the computation 127 | pool->wait_for_all_threads(); 128 | } 129 | auto stop = chrono::high_resolution_clock::now(); 130 | auto duration = duration_cast(stop - start); 131 | cout << "Loading Time: " << duration.count() << " seconds" << endl; 132 | 133 | /* 134 | * Storing and rebuilding indices 135 | */ 136 | 137 | //Precursor index 138 | cout << "Sorting precursors index" << endl; 139 | precursorIndex->sort_index(); 140 | cout << "Saving ..." << endl; 141 | //precursorIndex->save_index_to_file(config->idx_path + "precursor_idx.csv"); 142 | precursorIndex->save_index_to_binary_file(config->idx_path + "precursor_idx.bin"); 143 | 144 | config->save_configuration_to_file(config->idx_path + "config.txt"); 145 | 146 | //Closing output streams and reopening them as input streams 147 | for (int i = 0; i < output_streams.size(); ++i) { 148 | output_streams[i].close(); 149 | } 150 | 151 | cout << "Sorting fragment ion indices" << endl; 152 | for (int i = 0; i < config->sub_idx_file_names.size(); ++i) { 153 | string file_name = config->idx_path + config->sub_idx_file_names[i]; 154 | 155 | fragment_ion_index frag_index; 156 | frag_index.load_preliminary_index_from_binary_file(file_name); 157 | frag_index.sort_index(precursorIndex); 158 | frag_index.save_index_to_binary_file(file_name); 159 | } 160 | cout << "Done" << endl; 161 | 162 | return true; 163 | } 164 | 165 | bool indexing_manager::set_up_output_streams() { 166 | 167 | for (int i = 0; i < config->num_indices; ++i) { 168 | string file_name = config->idx_path + "frag_idx_" + to_string(i) + ".bin"; 169 | //cout << file_name << endl; 170 | config->sub_idx_file_names.push_back("frag_idx_" + to_string(i) + ".bin"); 171 | 
output_streams.emplace_back(fstream(file_name, std::ios::binary | std::ofstream::out)); 172 | } 173 | 174 | 175 | return true; 176 | } 177 | 178 | bool indexing_manager::parse_file(unsigned int file_num) { 179 | string file_path = lib_files[file_num].string(); 180 | 181 | ifstream f(file_path, ios::in); 182 | //f.precision(FLOAT_OUTPUT_PRECISION); 183 | 184 | string buffer; 185 | 186 | /* 187 | * Main loop reading library and creating preliminary indices on the fly 188 | */ 189 | 190 | while (!f.eof()) { 191 | 192 | //Read spectrum from file and pre-processing 193 | bool read_successfully = false; 194 | if (file_format == MSP) { 195 | read_successfully = msp_reader::read_next_entry_into_buffer(f, buffer); 196 | } else if (file_format == MGF) { 197 | read_successfully = mgf_reader::read_next_entry_into_buffer(f, buffer); 198 | } 199 | if (!read_successfully) 200 | continue; 201 | 202 | auto read_and_stream = [this, buffer]() { 203 | 204 | shared_ptr tmp_spectrum; 205 | if (file_format == MSP) { 206 | tmp_spectrum = msp_reader::read_spectrum_from_buffer(buffer); 207 | } else if (file_format == MGF) { 208 | tmp_spectrum = mgf_reader::read_spectrum_from_buffer(buffer); 209 | } 210 | 211 | if (tmp_spectrum->peptide.length() < config->minimum_peptide_length) { 212 | return; 213 | } 214 | 215 | 216 | //Lock for recording and streaming 217 | std::lock_guard guard(pool->mtx); 218 | 219 | //Save bookmark in precursor index 220 | precursor &bookmark = precursorIndex->record_new_precursor(tmp_spectrum); 221 | 222 | //Stream (binned) peaks into corresponding sub-index file 223 | 224 | unsigned int idx_num = config->assign_to_index(bookmark.mz); 225 | index_file_writer::stream_peaks_to_binary_file(output_streams[idx_num], bookmark.id, tmp_spectrum); 226 | //std::cout << bookmark.mz << std::endl; 227 | 228 | }; 229 | 230 | 231 | if (pool->get_size() > 0) { 232 | pool->enqueue(read_and_stream); 233 | } else { 234 | read_and_stream(); 235 | } 236 | } 237 | 238 | return true; 239 | 
} 240 | 241 | bool indexing_manager::parse_file_buffered(unsigned int file_num) { 242 | string file_path = lib_files[file_num].string(); 243 | 244 | ifstream f(file_path, ios::in); 245 | //f.precision(FLOAT_OUTPUT_PRECISION); //TODO 246 | 247 | unsigned int buffer_size = 40960;//1048576; //Byte //TODO fix if too small 248 | unsigned int carryover_pos = 0; 249 | string buffer; 250 | buffer.resize(buffer_size); 251 | 252 | /* 253 | * Main loop reading library and creating preliminary indices on the fly 254 | */ 255 | while (!f.eof()) { 256 | 257 | //Read large char buffer 258 | f.read(&buffer[carryover_pos], buffer_size - carryover_pos); 259 | unsigned int last_pos = buffer.rfind("Name:"); 260 | unsigned int current_pos = buffer.find("Name:"); 261 | 262 | // Parse spectra within the buffer 263 | while (current_pos != last_pos) { 264 | unsigned int next_pos = buffer.find("Name:", current_pos + 1); 265 | shared_ptr tmp_spectrum = msp_reader::read_spectrum_from_buffer(buffer.substr(current_pos, next_pos - current_pos)); 266 | 267 | //Save bookmark in precursor index 268 | precursor &bookmark = precursorIndex->record_new_precursor(tmp_spectrum); 269 | 270 | //Stream (binned) peaks into corresponding sub-index file 271 | unsigned int idx_num = config->assign_to_index(bookmark.mz); 272 | index_file_writer::stream_peaks_to_binary_file(output_streams[idx_num], bookmark.id, tmp_spectrum); 273 | current_pos = next_pos; 274 | } 275 | 276 | carryover_pos = buffer_size - current_pos; 277 | buffer.replace(0, carryover_pos, buffer.substr(current_pos, std::string::npos)); //keep ms2 carryover in the buffer 278 | current_pos = 0; //TODO why is this 0 in the first place 279 | //buffer = buffer.substr(current_pos, std::string::npos)); 280 | //buffer.resize(buffer_size); 281 | } 282 | return true; 283 | } 284 | -------------------------------------------------------------------------------- /src/indexing_manager.h: 
-------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_INDEXING_MANAGER_H 2 | #define SIMPLE_EXAMPLE_INDEXING_MANAGER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "configuration.h" 9 | #include "precursor_index.h" 10 | #include "thread_pool.h" 11 | 12 | enum FILE_FORMAT { 13 | MSP, MGF 14 | }; 15 | 16 | class indexing_manager { 17 | 18 | std::vector input_paths; 19 | std::vector lib_files; 20 | FILE_FORMAT file_format; 21 | 22 | 23 | //Precursor Index 24 | std::unique_ptr precursorIndex; 25 | 26 | /* 27 | * (Sub-) Indices 28 | */ 29 | 30 | std::shared_ptr config = std::make_shared(); 31 | std::vector output_streams; 32 | 33 | 34 | /* 35 | * Threading 36 | */ 37 | 38 | std::shared_ptr pool; 39 | 40 | public: 41 | indexing_manager(); 42 | explicit indexing_manager(std::string path); 43 | indexing_manager(std::vector &input_paths, std::shared_ptr config); 44 | 45 | 46 | bool build_indices(); 47 | bool set_up_output_streams(); 48 | bool parse_file(unsigned int file_num); 49 | bool parse_file_buffered(unsigned int file_num); 50 | 51 | 52 | 53 | 54 | }; 55 | 56 | 57 | #endif //SIMPLE_EXAMPLE_INDEXING_MANAGER_H 58 | -------------------------------------------------------------------------------- /src/library.cpp: -------------------------------------------------------------------------------- 1 | #include "library.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include "msp_reader.h" 7 | #include "mgf_reader.h" 8 | #include "settings.h" 9 | 10 | using namespace std; 11 | 12 | library::library() { 13 | 14 | } 15 | 16 | library::~library() { 17 | /*for (int i = 0; i < spectrum_list.size(); ++i) { 18 | delete spectrum_list[i]; 19 | }*/ 20 | spectrum_list.clear(); 21 | } 22 | 23 | 24 | library::library(string &path) { 25 | if (path[path.length() - 1] == '/' || path[path.length() - 1] == '\\') { 26 | cout << "Loading library from directory:" << endl; 27 | load_library_from_directory(path); 28 | } 
29 | cout << "Loading library from single file:" << endl; 30 | load_spectra_from_file(path); 31 | } 32 | 33 | bool library::construct(string &path) { 34 | if (path[path.length() - 1] == '/' || path[path.length() - 1] == '\\') { 35 | cout << "Loading library from directory:" << endl; 36 | load_library_from_directory(path); 37 | } 38 | cout << "Loading library from single file:" << endl; 39 | load_spectra_from_file(path); 40 | 41 | return true; 42 | } 43 | 44 | bool library::load_library_from_directory(string &path) { 45 | 46 | if (settings::load_batches) { 47 | cerr << "Warning: Batch search for multiple files not fully implemented" << endl; 48 | exit(1); 49 | } 50 | for (const auto & entry : std::filesystem::directory_iterator(path)) { 51 | load_spectra_from_file(entry.path().string()); 52 | //cout << spectrum_list.size() << endl; 53 | } 54 | return true; 55 | } 56 | 57 | bool library::load_spectra_from_file(string path) { 58 | 59 | string extension = path.substr(path.rfind('.') + 1, string::npos); 60 | cout << "Loading spectra from file: " << path << endl; 61 | if (extension == "msp") { 62 | if (!msp_reader::read_file(path, spectrum_list)) { 63 | cout << "Error reading file: " << path << endl; 64 | return false; 65 | } 66 | } 67 | else if (extension == "mgf") { 68 | if (settings::load_batches) { 69 | file_stream.open(path, ios::in); 70 | last_batch = mgf_reader::read_file_batch(file_stream, spectrum_list, settings::batch_size); 71 | } 72 | else if (!mgf_reader::read_file(path, spectrum_list)) { 73 | cout << "Error reading file: " << path << endl; 74 | return false; 75 | } 76 | } 77 | else { 78 | cout << "Unknown file extension" << endl; 79 | return false; 80 | } 81 | std::cout << "\t" << spectrum_list.size() << " scans loaded" << std::endl; 82 | 83 | return true; 84 | } 85 | 86 | 87 | 88 | 89 | bool library::build_library_index() { 90 | cout << "Building precursor index" << endl; 91 | precursor_idx = new precursor_index(); 92 | //TODO refactoring get up to date 93 
| cout << "Building fragment ion index" << endl; 94 | fragment_ion_idx = new class fragment_ion_index(precursor_idx); 95 | is_indexed = true; 96 | return false; 97 | } 98 | 99 | library::library(vector> &spectra) { 100 | spectrum_list = spectra; 101 | } 102 | 103 | bool library::load_next_batch() { 104 | last_batch = mgf_reader::read_file_batch(file_stream, spectrum_list, settings::batch_size); 105 | return last_batch; 106 | } 107 | 108 | -------------------------------------------------------------------------------- /src/library.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_LIBRARY_H 2 | #define SIMPLE_EXAMPLE_LIBRARY_H 3 | #include 4 | #include 5 | #include "spectrum.h" 6 | #include "precursor_index.h" 7 | #include "fragment_ion_index.h" 8 | 9 | 10 | class library { 11 | public: 12 | library(); 13 | library(std::string &path); 14 | library(std::vector> &spectra); 15 | bool construct(std::string &path); 16 | ~library(); 17 | 18 | bool load_spectra_from_file(std::string path); 19 | bool load_library_from_directory(std::string &path); 20 | bool load_next_batch(); 21 | 22 | bool build_library_index(); 23 | 24 | std::vector> spectrum_list; 25 | precursor_index* precursor_idx; 26 | fragment_ion_index* fragment_ion_idx; 27 | bool is_indexed = false; 28 | bool last_batch = true; 29 | private: 30 | std::fstream file_stream; 31 | 32 | }; 33 | 34 | 35 | #endif //SIMPLE_EXAMPLE_LIBRARY_H 36 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "spectrum.h" 6 | #include "msp_reader.h" 7 | #include "scores.h" 8 | #include "spectral_search.h" 9 | #include "library.h" 10 | //#include 11 | //#include 12 | 13 | //using namespace OpenMS; 14 | //using namespace OpenMSExternal; 15 | 16 | using namespace std; 17 | 18 | int main() { 
19 | cout << "Welcome, welcome" << endl; 20 | //FeatureMap fm; 21 | //Feature feature; 22 | //PeakSpectrum p; 23 | 24 | //fm.push_back(feature); 25 | 26 | 27 | //string msp_file = "/home/ynowatzk/data/pyro_fur/PyroFur_reproduced.msp"; 28 | string msp_file = "/home/ynowatzk/data/9MM/msp/"; 29 | string mgf_file = "/home/ynowatzk/data/9MM/mgf/9MM_FASP.mgf"; // 30 | 31 | 32 | //library *search_lib = new library(mgf_file); 33 | 34 | auto start = chrono::high_resolution_clock::now(); 35 | library *lib = new library(msp_file); 36 | auto stop = chrono::high_resolution_clock::now(); 37 | auto duration = duration_cast(stop - start); 38 | cout << "Loading Time: " << duration.count() << " seconds" << endl; 39 | 40 | lib->build_library_index(); 41 | 42 | 43 | /* 44 | * SEARCH 45 | */ 46 | 47 | //TODO recosntruct 48 | 49 | //spectral_search search(search_lib, lib); 50 | 51 | /* 52 | //Rescoring of spectrast results 53 | cout << "Reading in" << endl; 54 | search.read_results_from_file(R"(C:\Users\ynowatzk\Desktop\data\pyrococcus_furiosus\results\reproduced\sp5_neighbors_matches.tsv)"); 55 | cout << "Rescoring" << endl; 56 | search.rescore_matches(); 57 | cout << "Saving" << endl; 58 | search.save_results_to_file(R"(C:\Users\ynowatzk\Desktop\data\pyrococcus_furiosus\results\reproduced\sp5_neighbors_rescored.tsv)"); 59 | */ 60 | 61 | 62 | //cout << "Searching fragment ion index" << endl; 63 | //start = chrono::high_resolution_clock::now(); 64 | //search.search_target_library(); 65 | //stop = chrono::high_resolution_clock::now(); 66 | //duration = duration_cast(stop - start); 67 | 68 | //cout << "Search Time: " << duration.count() << " seconds" << endl; 69 | //search.save_results_to_file("FIIndex.csv"); 70 | //exit(12); 71 | 72 | /* 73 | auto start = chrono::high_resolution_clock::now(); 74 | search.search_target_library(); 75 | auto stop = chrono::high_resolution_clock::now(); 76 | auto duration = duration_cast(stop - start); 77 | 78 | cout << "Search Time: " << duration.count() << 
" seconds" << endl; 79 | 80 | 81 | vector matches = search.get_results(); 82 | 83 | for (int i = 0; i < 10; ++i) { 84 | cout << matches[i].query_spectrum->name << " " << matches[i].matched_spectrum->peptide << " " << matches[i].dot_product << endl; 85 | } 86 | 87 | search.save_results_to_file("./out.csv"); 88 | 89 | */ 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /src/match.cpp: -------------------------------------------------------------------------------- 1 | #include "match.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | 7 | match::match() { 8 | 9 | } 10 | 11 | match::match(unsigned int query_id, unsigned int target_id) : query_id(query_id), target_id(target_id) { 12 | 13 | } 14 | 15 | /* 16 | * Deprecated constructors 17 | */ 18 | 19 | match::match(std::shared_ptr search_spectrum, std::shared_ptr matched_spectrum) : query_spectrum(search_spectrum), matched_spectrum(matched_spectrum) { 20 | mass_difference = matched_spectrum->precursor_mass - search_spectrum->precursor_mass; 21 | } 22 | 23 | match::match(std::shared_ptr search_spectrum, std::shared_ptr matched_spectrum, float dot_product, int hit_rank) : query_spectrum(search_spectrum), matched_spectrum(matched_spectrum), dot_product(dot_product), hit_rank(hit_rank) { 24 | mass_difference = matched_spectrum->precursor_mass - search_spectrum->precursor_mass; 25 | } 26 | 27 | match::match(unsigned int query_id, unsigned int target_id, float dot_product, float mass_difference, int hit_rank) : query_id(query_id), target_id(target_id), dot_product(dot_product), mass_difference(mass_difference), hit_rank(hit_rank) { 28 | 29 | } 30 | 31 | match::match(unsigned int query_id, unsigned int target_id, float similarity_score, float dot_product, 32 | float mass_difference, int hit_rank) : query_id(query_id), target_id(target_id), similarity(similarity_score), dot_product(dot_product), mass_difference(mass_difference), hit_rank(hit_rank) { 33 | 34 | 
} 35 | -------------------------------------------------------------------------------- /src/match.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_MATCH_H 2 | #define SIMPLE_EXAMPLE_MATCH_H 3 | 4 | #include "spectrum.h" 5 | #include 6 | 7 | class match { 8 | public: 9 | std::shared_ptr query_spectrum; 10 | std::shared_ptr matched_spectrum; 11 | 12 | unsigned int query_id; 13 | unsigned int target_id; 14 | 15 | float mass_difference; 16 | float abs_mass_difference; 17 | float ppm_difference; 18 | unsigned int charge; 19 | std::vector isomers; //tracking homologous peptides 20 | 21 | // Default scores 22 | float similarity; 23 | float bias; 24 | float dot_product; 25 | 26 | 27 | //Annotation scores 28 | float annotation_similarity; 29 | float annotation_bias; 30 | float annotation_sim2; 31 | 32 | //Contrast angles 33 | float dot_contrast_angle; 34 | float similarity_contrast_angle; 35 | float annotation_contrast_angle; 36 | 37 | //Additional score 38 | float delta_dot; 39 | float delta_similarity; 40 | float delta_annotation_sim; 41 | float delta_sim2; 42 | int peak_count_query; 43 | int peak_count_target; 44 | float peak_mz_standard_deviation; 45 | float peak_mz_weighted_standard_deviation; 46 | 47 | //Advanced scores 48 | float avg_bias_adj_similarity; 49 | float delta_avg; 50 | float sim2; 51 | float spectraST_score; 52 | float spectraST_score_dot; 53 | double x_hunter_score; //all peaks, not just top 20 54 | double x_lgamma; 55 | double x_hunter_score_dot; //all peaks, not just top 20 56 | double x_lgamma_dot; 57 | 58 | int hit_rank; 59 | int query_index; //todo implement 60 | int num_matched_peaks; 61 | 62 | match(); 63 | match(unsigned int query_id, unsigned int target_id); 64 | 65 | /* 66 | * Deprecated constructors 67 | */ 68 | match(std::shared_ptr search_spectrum, std::shared_ptr matched_spectrum); 69 | match(std::shared_ptr search_spectrum, std::shared_ptr matched_spectrum, float dot_product, 
int hit_rank); 70 | match(unsigned int query_id, unsigned int target_id, float dot_product, float mass_difference, int hit_rank); 71 | match(unsigned int query_id, unsigned int target_id, float similarity_score, float dot_product, float mass_difference, int hit_rank); 72 | 73 | }; 74 | 75 | 76 | #endif //SIMPLE_EXAMPLE_MATCH_H 77 | -------------------------------------------------------------------------------- /src/mgf_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "mgf_reader.h" 2 | #include 3 | #include 4 | #include 5 | #include "settings.h" 6 | 7 | using namespace std; 8 | 9 | mgf_reader::mgf_reader() { 10 | 11 | } 12 | 13 | bool mgf_reader::read_file(string path, vector> &output_spectra) { 14 | 15 | fstream infile; 16 | 17 | infile.open(path, ios::in); 18 | if (!infile) { 19 | return false; 20 | } 21 | 22 | std::shared_ptr c_spectrum = std::make_shared(); 23 | while (!infile.eof()) { 24 | string line; 25 | getline(infile, line); 26 | 27 | if (line == "END IONS") { 28 | // Post-process and save the current spectrum 29 | //c_spectrum->intensity_bin_spanning_factor = -1.f; //TODO figure out if neighbor_spanning here 30 | //c_spectrum->bin_peaks(true,true); //TODO comment out 31 | if (settings::apply_topX_in_window_denoising) 32 | c_spectrum->denoise_mz_window(settings::peaks_per_window, settings::window_size); //TODO this exists only for .mgf search file spectra 33 | c_spectrum->bin_peaks_sparse(true, true); 34 | c_spectrum->root_scale_intensities(); 35 | c_spectrum->normalize_intensities(); //TODO put into one somehow 36 | output_spectra.push_back(c_spectrum); 37 | continue; 38 | } 39 | 40 | if (line == "BEGIN IONS") { 41 | c_spectrum = std::make_shared(); 42 | continue; 43 | } 44 | 45 | // split up line to identify comment tags 46 | string tag, value; 47 | size_t separator_pos = line.find('='); 48 | 49 | if (separator_pos != string::npos) { 50 | tag = line.substr(0, separator_pos); 51 | value = 
line.substr(separator_pos + 1, string::npos); 52 | 53 | //parse information 54 | if (tag == "TITLE") { 55 | c_spectrum->name = value; 56 | } else if (tag == "PEPMASS") { 57 | c_spectrum->precursor_mass = stof(value); //todo check if that's actually true 58 | } else if (tag == "RTINSECONDS") { 59 | 60 | } else if (tag == "CHARGE") { 61 | c_spectrum->charge = stoi(value); 62 | } 63 | } 64 | else { // TODO make sure this works with long numbers. Otherwise implement like in msp reader 65 | // No separator: Assume peak information is noted down in the line 66 | 67 | 68 | if (line.empty()) 69 | continue; 70 | 71 | std::size_t space_pos = line.find(' '); 72 | if (space_pos == string::npos) 73 | continue; 74 | 75 | 76 | float pos = stof(line.substr(0, space_pos)); 77 | c_spectrum->peak_positions.push_back(pos); 78 | 79 | 80 | float intensity = stof(line.substr(space_pos, string::npos)); 81 | c_spectrum->intensities.push_back(intensity); 82 | 83 | 84 | /* 85 | * alternatively run (but it is slower) 86 | istringstream ss(line); 87 | float pos, intensity; 88 | ss >> pos >> intensity; 89 | */ 90 | } 91 | } 92 | infile.close(); 93 | return true; 94 | } 95 | 96 | bool mgf_reader::read_next_entry_into_buffer(ifstream &f, string &buffer) { //TODO 97 | /* 98 | * Requires open filestream and reads until end of a new entry 99 | */ 100 | 101 | buffer.clear(); 102 | 103 | string line; 104 | 105 | if(!getline(f, line)) { 106 | return false; 107 | } 108 | 109 | if (line == "BEGIN IOONS") { 110 | cout << line << endl; 111 | cerr << "entry does not start with BEGIN IONS" << endl; 112 | return false; 113 | } 114 | buffer.append(line + "\n"); 115 | 116 | while (getline(f, line)) { 117 | 118 | buffer.append(line + "\n"); 119 | if (line.rfind("END IONS", 0) == 0) { 120 | return true; 121 | } 122 | } 123 | 124 | 125 | return true; 126 | } 127 | 128 | 129 | shared_ptr mgf_reader::read_spectrum_from_buffer(const string& buffer) { //TODO 130 | 131 | std::string line, tag, value; 132 | shared_ptr 
c_spectrum = std::make_shared(); 133 | 134 | stringstream ss(buffer); 135 | while (getline(ss, line)) { // what if no colon -> colon_pos == string::npos 136 | if (ss.eof()) 137 | return nullptr; 138 | 139 | // parse information 140 | if (line == "END IONS") { 141 | //c_spectrum->bin_peaks_sparse(true, true); // TODO YES NO? 142 | c_spectrum->root_scale_intensities(); 143 | c_spectrum->normalize_intensities(); 144 | return c_spectrum; 145 | } 146 | if (line == "BEGIN IONS") { 147 | continue; 148 | } 149 | 150 | // split up line to identify comment tags 151 | size_t separator_pos = line.find('='); 152 | 153 | if (separator_pos != string::npos) { 154 | tag = line.substr(0, separator_pos); 155 | value = line.substr(separator_pos + 1, string::npos); 156 | 157 | //parse information 158 | if (tag == "TITLE") { 159 | c_spectrum->name = value; 160 | c_spectrum->peptide = value.substr(0, value.find('/')); //TODO Does this work? 161 | } else if (tag == "PEPMASS") { 162 | c_spectrum->precursor_mass = stof(value); //todo check if that's actually true 163 | } else if (tag == "RTINSECONDS") { 164 | 165 | } else if (tag == "CHARGE") { 166 | c_spectrum->charge = stoi(value); 167 | } 168 | } 169 | else { // TODO make sure this works with long numbers. 
Otherwise implement like in msp reader 170 | // No separator: Assume peak information is noted down in the line 171 | 172 | 173 | if (line.empty()) 174 | continue; 175 | 176 | std::size_t space_pos = line.find_first_of(" \t"); //Finds space or tab 177 | if (space_pos == string::npos) 178 | continue; 179 | 180 | 181 | float pos = stof(line.substr(0, space_pos)); 182 | c_spectrum->peak_positions.push_back(pos); 183 | 184 | 185 | float intensity = stof(line.substr(space_pos, string::npos)); 186 | c_spectrum->intensities.push_back(intensity); 187 | 188 | } 189 | } 190 | return nullptr; 191 | } 192 | 193 | 194 | 195 | bool mgf_reader::read_file_batch(fstream &infile, vector> &output_spectra, int batch_size) { 196 | 197 | if (!infile) { 198 | cerr << "Error reading file" << endl; 199 | exit(1); 200 | } 201 | 202 | int count = 0; 203 | std::shared_ptr c_spectrum = std::make_shared(); 204 | while (!infile.eof()) { 205 | string line; 206 | getline(infile, line); 207 | 208 | if (line == "END IONS") { 209 | // Post-process and save the current spectrum 210 | //c_spectrum->intensity_bin_spanning_factor = -1.f; //TODO figure out if neighbor_spanning here 211 | //c_spectrum->bin_peaks(true,true); //TODO comment out 212 | c_spectrum->bin_peaks_sparse(true, true); 213 | c_spectrum->root_scale_intensities(); 214 | c_spectrum->normalize_intensities(); //TODO put into one somehow 215 | output_spectra.push_back(c_spectrum); 216 | ++count; 217 | if (count==batch_size) { 218 | return false; //Indicating more batches to come 219 | } 220 | continue; 221 | } 222 | 223 | if (line == "BEGIN IONS") { 224 | c_spectrum = std::make_shared(); 225 | continue; 226 | } 227 | 228 | // split up line to identify comment tags 229 | string tag, value; 230 | size_t separator_pos = line.find('='); 231 | 232 | if (separator_pos != string::npos) { 233 | tag = line.substr(0, separator_pos); 234 | value = line.substr(separator_pos + 1, string::npos); 235 | 236 | //parse information 237 | if (tag == "TITLE") { 
238 | c_spectrum->name = value; 239 | } else if (tag == "PEPMASS") { 240 | c_spectrum->precursor_mass = stof(value); //todo check if that's actually true 241 | } else if (tag == "RTINSECONDS") { 242 | 243 | } else if (tag == "CHARGE") { 244 | c_spectrum->charge = stoi(value); 245 | } 246 | } 247 | else { // TODO make sure this works with long numbers. Otherwise implement like in msp reader 248 | // No separator: Assume peak information is noted down in the line 249 | 250 | 251 | if (line.empty()) 252 | continue; 253 | 254 | std::size_t space_pos = line.find(' '); 255 | if (space_pos == string::npos) 256 | continue; 257 | 258 | 259 | float pos = stof(line.substr(0, space_pos)); 260 | c_spectrum->peak_positions.push_back(pos); 261 | 262 | 263 | float intensity = stof(line.substr(space_pos, string::npos)); 264 | c_spectrum->intensities.push_back(intensity); 265 | 266 | 267 | /* 268 | * alternatively run (but it is slower) 269 | istringstream ss(line); 270 | float pos, intensity; 271 | ss >> pos >> intensity; 272 | */ 273 | } 274 | } 275 | infile.close(); 276 | return true; //Indicating last batch was reached 277 | 278 | } 279 | -------------------------------------------------------------------------------- /src/mgf_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_MGF_READER_H 2 | #define SIMPLE_EXAMPLE_MGF_READER_H 3 | 4 | #include 5 | #include "spectrum.h" 6 | 7 | 8 | 9 | class mgf_reader { 10 | public: 11 | mgf_reader(); 12 | 13 | static bool read_file(std::string path, std::vector> &output_spectra); 14 | static bool read_file_batch(std::fstream &infile, std::vector> &output_spectra, int batch_size); 15 | static bool read_next_entry_into_buffer(std::ifstream &f, std::string &buffer); 16 | static std::shared_ptr read_spectrum_from_buffer(const std::string& buffer); 17 | 18 | }; 19 | 20 | 21 | #endif //SIMPLE_EXAMPLE_MGF_READER_H 22 | 
-------------------------------------------------------------------------------- /src/msp_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "msp_reader.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | std::fstream msp_reader::infile; 12 | 13 | 14 | bool msp_reader::read_file(string &path, vector> &output_spectra) { 15 | 16 | infile.open(path, ios::in); 17 | if (!infile) { 18 | cerr << "Could not open file at " << path << endl; 19 | return false; 20 | } 21 | 22 | string line; 23 | while (!infile.eof()) { 24 | string tag, value; 25 | std::shared_ptr c_spectrum = std::make_shared(); 26 | while (tag != "Num peaks") { // what if no colon -> colon_pos == string::npos 27 | if (infile.eof()) 28 | return false; 29 | getline(infile, line); 30 | 31 | // split up line to identify comment tags 32 | size_t colon_pos = line.find(':'); 33 | 34 | tag = line.substr(0, colon_pos); 35 | value = line.substr(colon_pos + 2, string::npos); 36 | 37 | // parse information 38 | if (tag == "Name") { 39 | c_spectrum = std::make_shared(); 40 | c_spectrum->name = value; 41 | c_spectrum->peptide = value.substr(0, value.find('/')); 42 | c_spectrum->charge = stoi(value.substr(value.rfind('/') + 1, string::npos)); 43 | } else if (tag == "MW") { 44 | c_spectrum->precursor_mass = stof(value); 45 | } else if (tag == "Comment") { 46 | 47 | } 48 | } 49 | //else case: tag = Num peak_positions 50 | int num_peaks = stoi(value); 51 | for (int i = 0; i < num_peaks; ++i) { 52 | getline(infile, value, '\t'); 53 | float peak = stof(value); 54 | getline(infile, value, '\t'); 55 | float intensity = stof(value); 56 | 57 | getline(infile, value); 58 | 59 | 60 | 61 | 62 | // parse peak_positions and intensities 63 | 64 | /*std::size_t tab_pos = line.find('\t'); 65 | float peak = stof(line.substr(0, tab_pos)); 66 | float intensity = stof(line.substr(tab_pos + 1, line.find('\t'))); 67 | */ 68 | 
69 | 70 | c_spectrum->peak_positions.push_back(peak); 71 | c_spectrum->intensities.push_back(intensity); 72 | } 73 | //c_spectrum->intensity_bin_spanning_factor = -1.f; //TODO figure out if neighbor_spanning here 74 | c_spectrum->bin_peaks(true,true); 75 | //c_spectrum->bin_peaks_sparse(true, true); TODO uncomment!!! 76 | output_spectra.push_back(c_spectrum); 77 | 78 | } 79 | 80 | 81 | infile.close(); 82 | return true; 83 | } 84 | 85 | bool msp_reader::read_spectra_from_positions(string &path, vector &precursor_list, vector &output_spectra) { 86 | 87 | output_spectra.reserve(precursor_list.size()); 88 | infile.open(path); 89 | for (int i = 0; i < precursor_list.size(); ++i) { 90 | 91 | unsigned long start = precursor_list[i]->offset_begin; 92 | unsigned long end = precursor_list[i]->offset_end; 93 | 94 | infile.seekg(start); 95 | std::string s; 96 | //cout << precursor_list[i]->name << endl; 97 | //cout << start << " " << end << " " << end - start << endl; 98 | //if (end > 1844674407) 99 | // continue; 100 | s.resize(end - start); 101 | infile.read(&s[0], end - start); 102 | 103 | output_spectra.push_back(read_spectrum_from_buffer(s).get()); //TODO .get() is messing with pointers again!!! 
REFACTOR 104 | 105 | delete output_spectra.back(); 106 | output_spectra.pop_back(); 107 | } 108 | 109 | return false; 110 | } 111 | 112 | shared_ptr msp_reader::read_spectrum_from_buffer(const string& buffer) { 113 | 114 | string line, tag, value; 115 | shared_ptr c_spectrum = std::make_shared(); 116 | 117 | stringstream ss(buffer); 118 | while (tag != "Num peaks") { // what if no colon -> colon_pos == string::npos 119 | if (ss.eof()) 120 | return nullptr; 121 | getline(ss, line); 122 | // split up line to identify comment tags 123 | size_t colon_pos = line.find(':'); 124 | 125 | tag = line.substr(0, colon_pos); 126 | value = line.substr(colon_pos + 2, string::npos); 127 | 128 | // parse information 129 | if (tag == "Name") { 130 | c_spectrum = std::make_shared(); 131 | c_spectrum->name = value; 132 | c_spectrum->peptide = value.substr(0, value.find('/')); 133 | c_spectrum->charge = stoi(value.substr(value.rfind('/') + 1, string::npos)); 134 | } else if (tag == "MW") { 135 | c_spectrum->precursor_mass = stof(value); 136 | } else if (tag == "Comment") { 137 | 138 | } 139 | } 140 | //else case: tag = Num peak_positions 141 | int num_peaks = stoi(value); 142 | for (int i = 0; i < num_peaks; ++i) { 143 | getline(ss, value, '\t'); 144 | //std::cout << "X" << value << "X" << std::endl; 145 | 146 | float peak = stof(value); 147 | getline(ss, value, '\n'); 148 | 149 | //std::cout << "X" << value << "X" << std::endl; 150 | float intensity = stof(value); 151 | //std::cout << intensity << std::endl; 152 | 153 | //getline(ss, value); 154 | 155 | 156 | 157 | 158 | // parse peak_positions and intensities 159 | 160 | /*std::size_t tab_pos = line.find('\t'); 161 | float peak = stof(line.substr(0, tab_pos)); 162 | float intensity = stof(line.substr(tab_pos + 1, line.find('\t'))); 163 | */ 164 | 165 | 166 | c_spectrum->peak_positions.push_back(peak); 167 | c_spectrum->intensities.push_back(intensity); 168 | } 169 | //c_spectrum->intensity_bin_spanning_factor = -1.f; //TODO figure 
out if neighbor_spanning here 170 | //c_spectrum->bin_peaks(true,true); 171 | //c_spectrum->bin_peaks_sparse(true, true); 172 | c_spectrum->root_scale_intensities(); 173 | c_spectrum->normalize_intensities(); 174 | return c_spectrum; 175 | } 176 | 177 | 178 | bool msp_reader::read_file_precursors(string &path, vector &precursor_list) { 179 | 180 | infile.open(path, ios::in); 181 | if (!infile) { 182 | cerr << "Could not open file at " << path << endl; 183 | return false; 184 | } 185 | 186 | string line; 187 | while (!infile.eof()) { 188 | string tag, value; 189 | precursor *parent; 190 | while (tag != "Num peaks") { // what if no colon -> colon_pos == string::npos 191 | if (infile.eof()) 192 | return false; 193 | getline(infile, line); 194 | 195 | // split up line to identify comment tags 196 | size_t colon_pos = line.find(':'); 197 | 198 | tag = line.substr(0, colon_pos); 199 | value = line.substr(colon_pos + 2, string::npos); 200 | 201 | // parse information 202 | if (tag == "Name") { 203 | parent = new precursor(); 204 | parent->offset_begin = infile.tellg(); 205 | parent->offset_begin -= (line.length() + 1); 206 | parent->name = value; 207 | //parent->peptide = value.substr(0, value.find('/')); 208 | parent->charge = stoi(value.substr(value.rfind('/') + 1, string::npos)); 209 | } else if (tag == "MW") { 210 | parent->mz = stof(value); 211 | } else if (tag == "Comment") { 212 | 213 | } 214 | } 215 | //else case: tag = Num peak_positions 216 | int num_peaks = stoi(value); 217 | for (int i = 0; i < num_peaks; ++i) { 218 | //Skip lines 219 | getline(infile, value); 220 | } 221 | parent->offset_end = infile.tellg(); 222 | precursor_list.push_back(parent); 223 | } 224 | infile.clear(); 225 | infile.seekg(0, ios::end); 226 | precursor_list.back()->offset_end = infile.tellg(); 227 | 228 | 229 | infile.close(); 230 | return true; 231 | } 232 | 233 | bool msp_reader::read_file_precursors_efficient(string &path, vector &precursor_list) { 234 | 235 | 236 | 237 | 
infile.open(path, ios::in); 238 | if (!infile) { 239 | cerr << "Could not open file at " << path << endl; 240 | return false; 241 | } 242 | 243 | 244 | precursor *parent = new precursor(); 245 | string line; 246 | unsigned int id = 0; 247 | while (!infile.eof()) { 248 | getline(infile, line); 249 | 250 | if (line.rfind("Name:", 0) == 0) { // rightmost match, but starting at pos 0 (or earlier), i.e. prefix 251 | 252 | unsigned long start = infile.tellg(); 253 | start -= (line.length() + 1); 254 | 255 | //Push back previous element 256 | parent->offset_end = start - 1; 257 | precursor_list.push_back(parent); 258 | 259 | //Init new element 260 | parent = new precursor(); 261 | parent->id = id; 262 | parent->offset_begin = start; 263 | parent->charge = stoi(line.substr(line.rfind('/') + 1, string::npos)); 264 | ++id; 265 | } 266 | 267 | if (line.rfind("MW:", 0) == 0) { 268 | parent->mz = stof(line.substr(4, string::npos)); 269 | } 270 | 271 | } 272 | 273 | // Adding last element 274 | infile.clear(); 275 | infile.seekg(0, ios::end); 276 | parent->offset_end = infile.tellg(); 277 | precursor_list.push_back(parent); 278 | 279 | // Deleting first element (dummy) 280 | precursor_list.erase(precursor_list.begin()); //inefficient (but called once) 281 | 282 | infile.close(); 283 | 284 | return true; 285 | } 286 | 287 | bool msp_reader::read_next_entry_into_buffer(ifstream &f, string &buffer) { 288 | /* 289 | * Requires open filestream and reads until end a new entry 290 | */ 291 | 292 | buffer.clear(); 293 | 294 | string line; 295 | 296 | if(!getline(f, line)) { 297 | return false; 298 | } 299 | 300 | if (line.rfind("Name:", 0) != 0) { 301 | cout << line << endl; 302 | cerr << "entry does not start with Name:" << endl; 303 | return false; 304 | } 305 | buffer.append(line + "\n"); 306 | 307 | while (getline(f, line)) { 308 | if (line.rfind("Name:", 0) == 0) { // rightmost match, but starting at pos 0 (or earlier), i.e. 
prefix 309 | 310 | //Jump back to line beginning and return buffer 311 | f.seekg(-(line.length() + 1), ios::cur); 312 | return true; 313 | 314 | } 315 | 316 | buffer.append(line + "\n"); 317 | 318 | } 319 | 320 | 321 | return true; 322 | } 323 | 324 | bool msp_reader::continue_read_file(vector> &output_spectra) { 325 | 326 | 327 | return false; 328 | } 329 | 330 | -------------------------------------------------------------------------------- /src/msp_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_MSP_READER_H 2 | #define SIMPLE_EXAMPLE_MSP_READER_H 3 | 4 | #include "spectrum.h" 5 | #include "precursor_index.h" 6 | #include 7 | 8 | 9 | class msp_reader { 10 | 11 | static std::fstream infile; 12 | 13 | 14 | public: 15 | 16 | static bool read_file(std::string &path, std::vector> &output_spectra); 17 | static bool continue_read_file(std::vector> &output_spectra); 18 | static bool read_file_precursors(std::string &path, std::vector &precursor_list); 19 | static bool read_file_precursors_efficient(std::string &path, std::vector &precursor_list); 20 | 21 | static bool read_spectra_from_positions(std::string &path, std::vector &precursor_list, std::vector &output_spectra); 22 | static bool read_next_entry_into_buffer(std::ifstream &f, std::string &buffer); 23 | static std::shared_ptr read_spectrum_from_buffer(const std::string& buffer); 24 | 25 | 26 | }; 27 | 28 | 29 | #endif //SIMPLE_EXAMPLE_MSP_READER_H 30 | -------------------------------------------------------------------------------- /src/naive_search.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "spectrum.h" 4 | #include "msp_reader.h" 5 | #include "scores.h" 6 | #include "spectral_search.h" 7 | #include "library.h" 8 | 9 | 10 | using namespace std; 11 | 12 | int main() { 13 | cout << "This is a naive search" << endl; 14 | //FeatureMap fm; 15 | //Feature feature; 16 
| //PeakSpectrum p; 17 | 18 | //fm.push_back(feature); 19 | 20 | 21 | //string msp_file = "/home/ynowatzk/data/pyro_fur/PyroFur_reproduced.msp"; 22 | string msp_file = "/home/ynowatzk/data/9MM/msp/"; 23 | string mgf_file = "/home/ynowatzk/data/9MM/mgf/9MM_FASP.mgf"; // 24 | 25 | 26 | shared_ptr search_lib = make_shared(mgf_file); 27 | 28 | auto start = chrono::high_resolution_clock::now(); 29 | shared_ptr lib = make_shared(msp_file); 30 | auto stop = chrono::high_resolution_clock::now(); 31 | auto duration = duration_cast(stop - start); 32 | cout << "Loading Time: " << duration.count() << " seconds" << endl; 33 | 34 | 35 | 36 | 37 | /* 38 | * Search 39 | */ 40 | 41 | start = chrono::high_resolution_clock::now(); 42 | spectral_search search(search_lib, lib); 43 | search.search_target_library(); 44 | stop = chrono::high_resolution_clock::now(); 45 | duration = duration_cast(stop - start); 46 | cout << "Loading Time: " << duration.count() << " seconds" << endl; 47 | 48 | search.save_results_to_file("./naive_search_results.csv"); 49 | //lib->build_library_index(); 50 | 51 | 52 | /* 53 | * SEARCH 54 | */ 55 | 56 | /* 57 | //Rescoring of spectrast results 58 | cout << "Reading in" << endl; 59 | search.read_results_from_file(R"(C:\Users\ynowatzk\Desktop\data\pyrococcus_furiosus\results\reproduced\sp5_neighbors_matches.tsv)"); 60 | cout << "Rescoring" << endl; 61 | search.rescore_matches(); 62 | cout << "Saving" << endl; 63 | search.save_results_to_file(R"(C:\Users\ynowatzk\Desktop\data\pyrococcus_furiosus\results\reproduced\sp5_neighbors_rescored.tsv)"); 64 | */ 65 | 66 | 67 | //cout << "Searching fragment ion index" << endl; 68 | //start = chrono::high_resolution_clock::now(); 69 | //search.search_target_library(); 70 | //stop = chrono::high_resolution_clock::now(); 71 | //duration = duration_cast(stop - start); 72 | 73 | //cout << "Search Time: " << duration.count() << " seconds" << endl; 74 | //search.save_results_to_file("FIIndex.csv"); 75 | //exit(12); 76 | 77 | /* 
78 | auto start = chrono::high_resolution_clock::now(); 79 | search.search_target_library(); 80 | auto stop = chrono::high_resolution_clock::now(); 81 | auto duration = duration_cast(stop - start); 82 | 83 | cout << "Search Time: " << duration.count() << " seconds" << endl; 84 | 85 | 86 | vector matches = search.get_results(); 87 | 88 | for (int i = 0; i < 10; ++i) { 89 | cout << matches[i].query_spectrum->name << " " << matches[i].matched_spectrum->peptide << " " << matches[i].dot_product << endl; 90 | } 91 | 92 | search.save_results_to_file("./out.csv"); 93 | 94 | */ 95 | return 0; 96 | } 97 | 98 | -------------------------------------------------------------------------------- /src/precursor_index.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "precursor_index.h" 6 | #include "index_file_writer.h" 7 | #include "index_file_reader.h" 8 | 9 | 10 | using namespace std; 11 | 12 | int precursor_index::get_size() { 13 | return precursors.size(); 14 | } 15 | 16 | /*spectrum *precursor_index::get_spectrum(int i) { 17 | return spectra[i]; 18 | }*/ 19 | 20 | float precursor_index::get_max_precursor_mass() { 21 | return precursors[ranking.back()].mz; 22 | } 23 | /* 24 | //TODO add self chosen bounds 25 | int precursor_index::get_lower_bound(int charge, float min_mass) { 26 | 27 | int lb = std::lower_bound(spectra.begin(), spectra.end(), make_pair(charge, min_mass), [](spectrum *s, pair charge_mass_tuple) { 28 | return *s < charge_mass_tuple; 29 | }) - spectra.begin(); 30 | 31 | return lb; 32 | } 33 | 34 | 35 | int precursor_index::get_upper_bound(int charge, float max_mass) { 36 | 37 | int ub = std::upper_bound(spectra.begin(), spectra.end(), make_pair(charge, max_mass), [](pair charge_mass_tuple, spectrum *s) { 38 | return !(*s <= charge_mass_tuple); 39 | }) - spectra.begin(); 40 | 41 | return ub - 1; 42 | } 43 | */ 44 | 45 | precursor_index::precursor_index() { 46 | 47 
| } 48 | 49 | int precursor_index::get_lower_bound(int charge, float min_mass) { 50 | int lb = std::lower_bound(ranking.begin(), ranking.end(), make_pair(charge, min_mass), [&](unsigned int rank, pair charge_mass_tuple) { 51 | return precursors[rank] < charge_mass_tuple; 52 | }) - ranking.begin(); 53 | 54 | return lb; 55 | } 56 | 57 | 58 | int precursor_index::get_upper_bound(int charge, float max_mass) { 59 | int ub = std::upper_bound(ranking.begin(), ranking.end(), make_pair(charge, max_mass), [&](pair charge_mass_tuple, unsigned int rank) { 60 | return !(precursors[rank] <= charge_mass_tuple); 61 | }) - ranking.begin(); 62 | 63 | return ub - 1; 64 | } 65 | 66 | bool precursor_index::sort_index() { 67 | 68 | if (!(id_counter == precursors.size() && id_counter == ranking.size())) { 69 | cerr << "Number of recorded precursors does not match precursor ids :: Required to warrant correct mapping of id to precursors" << endl; 70 | return false; 71 | } 72 | 73 | 74 | sort(ranking.begin(), ranking.end(), [&](unsigned int a, unsigned int b) { 75 | return precursors[a] < precursors[b]; 76 | }); 77 | 78 | for (int i = 0; i < ranking.size(); ++i) { 79 | unsigned int id = ranking[i]; 80 | precursors[id].rank = i; 81 | } 82 | 83 | return true; 84 | } 85 | 86 | precursor &precursor_index::get_precursor(unsigned int id) { 87 | return precursors[id]; 88 | } 89 | 90 | precursor &precursor_index::get_precursor_by_rank(unsigned int id) { 91 | return precursors[ranking[id]]; 92 | } 93 | 94 | precursor &precursor_index::record_new_precursor(const shared_ptr& spec) { 95 | precursors.emplace_back(precursor(id_counter, spec->precursor_mass, spec->charge, spec->peptide)); 96 | ranking.push_back(id_counter); 97 | ++id_counter; 98 | return precursors.back(); 99 | } 100 | 101 | precursor &precursor_index::record_new_precursor(float mz, int charge, std::string peptide) { 102 | precursors.emplace_back(precursor(id_counter, mz, charge, peptide)); 103 | ranking.push_back(id_counter); 104 | 
++id_counter; 105 | return precursors.back(); 106 | } 107 | 108 | unsigned int precursor_index::get_rank(unsigned int id) { 109 | return precursors[id].rank; 110 | } 111 | 112 | bool precursor_index::save_index_to_file(const string &file_path) { 113 | 114 | //Saving spectrum bookmarks (precursor info) 115 | index_file_writer::save_precursor_index(file_path, precursors); 116 | 117 | 118 | return true; 119 | } 120 | 121 | bool precursor_index::load_index_from_file(const string &file_path) { 122 | //index_file_reader::read_file_into_precursor_index(file_path, make_shared(*this)); 123 | 124 | std::ifstream f(file_path, std::ios::in); 125 | std::string delimiter = ";"; 126 | std::string line; 127 | 128 | if (!getline(f, line)) { 129 | return false; 130 | } 131 | 132 | if (line.rfind("Num: ", 0) != 0) { 133 | std::cerr << "Incorrect file format" << std::endl; 134 | return false; 135 | } 136 | 137 | //Read header 138 | unsigned int size = std::stoi(line.substr(5, std::string::npos)); //TODO check 4 or 5 139 | set_size(size); 140 | 141 | // Parse precursors line by line 142 | //precursor_idx->add_precursor_record(p); 143 | while (getline(f, line)) { 144 | 145 | size_t delim_pos = line.find(delimiter); 146 | unsigned int id = std::stoi(line.substr(0, delim_pos)); 147 | 148 | size_t length = line.find(delimiter, delim_pos + 1) - delim_pos; 149 | unsigned int rank = std::stoi(line.substr(delim_pos + 1, length - 1)); 150 | 151 | delim_pos = delim_pos + length; 152 | length = line.find(delimiter, delim_pos + 1) - delim_pos; 153 | float mz = std::stof(line.substr(delim_pos + 1, length - 1)); 154 | 155 | delim_pos = delim_pos + length; 156 | length = line.find(delimiter, delim_pos + 1) - delim_pos; 157 | int charge = std::stoi(line.substr(delim_pos + 1, length - 1)); 158 | std::string peptide = line.substr(delim_pos + length + 1, std::string::npos); 159 | 160 | add_precursor_record(precursor(id, rank, mz, charge, peptide)); 161 | } 162 | 163 | f.close(); 164 | 165 | if 
(get_size() != size) { 166 | std::cerr << "Wrong number of precursors" << std::endl; 167 | } 168 | 169 | //TODO delete this if not used anymore 170 | for (auto & precursor : precursors) { 171 | to_rank.push_back(precursor.rank); 172 | } 173 | 174 | return true; 175 | } 176 | 177 | bool precursor_index::add_precursor_record(const precursor& p) { 178 | precursors.emplace_back(p); //TODO test this 179 | ranking[p.rank] = p.id; 180 | return true; 181 | } 182 | 183 | bool precursor_index::set_size(unsigned int size) { 184 | precursors.reserve(size); 185 | ranking.resize(size); 186 | 187 | return true; 188 | } 189 | 190 | bool precursor_index::save_index_to_binary_file(const string &file_path) { 191 | 192 | //Saving spectrum bookmarks (precursor info) 193 | index_file_writer::save_precursor_index_to_binary_file(file_path, precursors); 194 | 195 | return true; 196 | 197 | } 198 | 199 | bool precursor_index::load_index_from_binary_file(const string &file_path) { 200 | ifstream f(file_path, ios::binary | ios::in); 201 | 202 | //Read header 203 | unsigned int size; 204 | f.read((char *) &size, sizeof(unsigned int)); 205 | set_size(size); 206 | 207 | unsigned int id, rank; 208 | float mz; 209 | int charge; 210 | size_t pep_size; 211 | std::string peptide; 212 | while (f.read((char *) &id, sizeof(unsigned int))) { 213 | f.read((char *) &rank, sizeof(unsigned int)); 214 | f.read((char *) &mz, sizeof(float)); 215 | f.read((char *) &charge, sizeof(int)); 216 | f.read((char *) &pep_size, sizeof(size_t)); 217 | peptide.resize(pep_size); 218 | f.read((char *) &peptide[0], pep_size); 219 | 220 | add_precursor_record(precursor(id, rank, mz, charge, peptide)); 221 | } 222 | 223 | f.close(); 224 | 225 | if (get_size() != size) { 226 | std::cerr << "Wrong number of precursors" << std::endl; 227 | std::cout << get_size() << " " << size << endl; 228 | } 229 | 230 | //TODO delete this if not used anymore 231 | for (auto & precursor : precursors) { 232 | to_rank.push_back(precursor.rank); 
/*
 * Bookmark describing one library spectrum by its precursor ion.
 * Precursors order by charge first and by precursor m/z second; the same
 * ordering is available against a plain (charge, m/z) pair for range queries.
 * All numeric members start at zero, so a precursor that is filled in step by
 * step never exposes indeterminate values.
 */
struct precursor {
    /*
     * Key values
     */
    unsigned int id = 0;
    unsigned int rank = 0; //unsure if needed here
    float mz = 0.f;
    int charge = 0;
    std::string peptide;

    /*
     * special cases
     */

    unsigned long offset_begin = 0; // start offset of the entry in its source file
    unsigned long offset_end = 0;   // end offset of the entry in its source file
    std::string name;

    precursor() = default;
    precursor(unsigned int id, float mass, int charge, std::string peptide="") : id(id), mz(mass), charge(charge), peptide(std::move(peptide)) {}
    precursor(unsigned int id, unsigned int rank, float mass, int charge, std::string peptide="") : id(id), rank(rank), mz(mass), charge(charge), peptide(std::move(peptide)) {}

    // Strict ordering: lower charge first, then lower m/z
    bool operator<(const precursor &other) const {
        return charge < other.charge || (charge == other.charge && mz < other.mz);
    }

    // Same ordering against a (charge, m/z) pair
    bool operator<(std::pair<int, float> charge_mass_tuple) const {
        return charge < charge_mass_tuple.first || (charge == charge_mass_tuple.first && mz < charge_mass_tuple.second);
    }

    // Like operator< but also true when charge matches and m/z is equal
    bool operator<=(std::pair<int, float> charge_mass_tuple) const {
        return charge < charge_mass_tuple.first || (charge == charge_mass_tuple.first && mz <= charge_mass_tuple.second);
    }


};
precursor& record_new_precursor(float mz, int charge, std::string peptide); 60 | bool add_precursor_record(const precursor& p); 61 | 62 | bool sort_index(); 63 | bool save_index_to_file(const std::string &file_path); 64 | bool save_index_to_binary_file(const std::string &file_path); 65 | bool load_index_from_file(const std::string &file_path); 66 | bool load_index_from_binary_file(const std::string &file_path); 67 | 68 | 69 | int get_size(); 70 | bool set_size(unsigned int size); 71 | int get_lower_bound(int charge, float min_mass); 72 | int get_upper_bound(int charge, float max_mass); 73 | float get_max_precursor_mass(); 74 | 75 | precursor& get_precursor(unsigned int id); 76 | precursor& get_precursor_by_rank(unsigned int id); 77 | unsigned int get_rank(unsigned int id); 78 | 79 | }; 80 | 81 | 82 | #endif //SIMPLE_EXAMPLE_PRECURSOR_INDEX_H 83 | -------------------------------------------------------------------------------- /src/quick_scan.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "scanner.h" 4 | #include "msp_reader.h" 5 | #include "library.h" 6 | #include "spectral_search.h" 7 | 8 | using namespace std; 9 | 10 | int main() { 11 | 12 | cout << "Hello Quick Scan" << endl; 13 | 14 | string directory = "/home/ynowatzk/data/9MM/msp/Brevibacillus+laterosporus.msp"; 15 | scanner *sc = new scanner(); 16 | 17 | cout << "Scanning input:" << endl; 18 | auto start = chrono::high_resolution_clock::now(); 19 | sc->scan_file(directory); 20 | auto stop = chrono::high_resolution_clock::now(); 21 | auto duration = duration_cast(stop - start); 22 | cout << "Scan Time: " << duration.count() << " seconds" << endl; 23 | 24 | sc->analyze(); 25 | sc->print_scan_results(); 26 | sc->save_precursor_distribution_to_file("./precursors.txt"); 27 | 28 | 29 | cout << "Loading spectra from saved positions" << endl; 30 | start = chrono::high_resolution_clock::now(); 31 | 
msp_reader::read_spectra_from_positions(directory, sc->parents, sc->specs); 32 | stop = chrono::high_resolution_clock::now(); 33 | duration = duration_cast(stop - start); 34 | cout << "Loading Time: " << duration.count() << " seconds" << endl; 35 | 36 | 37 | // COMPARE SEARCH RESULTS to make sure reading worked 38 | string mgf_file = "/home/ynowatzk/data/9MM/mgf/9MM_FASP.mgf"; 39 | library *search_lib = new library(mgf_file); 40 | library *lib = new library(sc->specs); 41 | lib->build_library_index(); 42 | 43 | spectral_search search(search_lib, lib); 44 | cout << "Searching fragment ion index" << endl; 45 | start = chrono::high_resolution_clock::now(); 46 | search.search_target_library(); 47 | stop = chrono::high_resolution_clock::now(); 48 | duration = duration_cast(stop - start); 49 | 50 | cout << "Search Time: " << duration.count() << " seconds" << endl; 51 | search.save_results_to_file("FIIndex2.csv"); 52 | 53 | 54 | return 0; 55 | } -------------------------------------------------------------------------------- /src/scanner.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "scanner.h" 5 | #include "msp_reader.h" 6 | #include "mgf_reader.h" 7 | 8 | using namespace std; 9 | 10 | scanner::scanner() { 11 | 12 | } 13 | 14 | bool scanner::scan_directory(string path) { 15 | cout << "Scanning directory: " << path << endl; 16 | for (const auto & entry : std::filesystem::directory_iterator(path)) { 17 | scan_file(entry.path().string()); 18 | } 19 | return true; 20 | } 21 | 22 | bool scanner::scan_file(string path) { 23 | cout << "Scanning: " << path << endl; 24 | 25 | 26 | string extension = path.substr(path.rfind('.') + 1, string::npos); 27 | 28 | if (extension == "msp") { 29 | if (!msp_reader::read_file_precursors_efficient(path, parents)) { 30 | cout << "Error reading file: " << path << endl; 31 | return false; 32 | } 33 | 34 | ++no_files_read; 35 | int size = 
std::filesystem::file_size(path) / 1024; 36 | cout << "File size " << size << " KB" << endl; 37 | kb_lib_size += size; 38 | } 39 | else if (extension == "mgf") { 40 | /*if (!mgf_reader::read_file(path, soondeleted())) { 41 | cout << "Error reading file: " << path << endl; 42 | return false; 43 | }*/ 44 | cout << ".mgf quick scan not implemented yet" << endl; 45 | return false; 46 | } 47 | else { 48 | cout << "Unknown file extension" << endl; 49 | return false; 50 | } 51 | 52 | return true; 53 | 54 | 55 | } 56 | 57 | bool scanner::analyze() { 58 | 59 | sort(parents.begin(), parents.end(), [](const precursor *a, const precursor *b) { 60 | return *a < *b; 61 | }); 62 | 63 | return true; 64 | } 65 | 66 | bool scanner::save_precursor_distribution_to_file(string path, string delimiter) { 67 | 68 | fstream outfile; 69 | outfile.open(path, ios::out); 70 | 71 | if (!outfile.good()) 72 | return false; 73 | 74 | // Add header 75 | outfile << "mz"+delimiter+"charge" << endl; 76 | 77 | // Go through matches and parse relevant information for each 78 | for (int i = 0; i < parents.size(); ++i) { 79 | precursor *p = parents[i]; 80 | outfile << p->mz << delimiter << p->charge << endl; 81 | } 82 | 83 | outfile.close(); 84 | return true; 85 | 86 | return false; 87 | } 88 | 89 | bool scanner::print_scan_results() { 90 | 91 | cout << "Readable files detected: " << no_files_read << endl; 92 | cout << "Total size: " << kb_lib_size / 1024 << " MB (" << float(kb_lib_size) / float(1024*1024) << " GB)" << endl; 93 | 94 | return false; 95 | } 96 | -------------------------------------------------------------------------------- /src/scanner.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_SCANNER_H 2 | #define SIMPLE_EXAMPLE_SCANNER_H 3 | 4 | #include 5 | #include "spectrum.h" 6 | 7 | class scanner { 8 | private: 9 | 10 | int no_files_read = 0; 11 | int kb_lib_size = 0; 12 | 13 | 14 | public: 15 | scanner(); 16 | bool 
scan_directory(std::string path); 17 | bool scan_file(std::string path); 18 | 19 | bool analyze(); 20 | bool save_precursor_distribution_to_file(std::string path, std::string delimiter="\t"); 21 | 22 | bool print_scan_results(); 23 | 24 | std::vector parents; 25 | std::vector specs; 26 | }; 27 | 28 | 29 | #endif //SIMPLE_EXAMPLE_SCANNER_H 30 | -------------------------------------------------------------------------------- /src/scores.cpp: -------------------------------------------------------------------------------- 1 | #include "scores.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | 7 | float scores::dot_product(vector &target_bins, vector &other_bins) { 8 | float dot = 0.f; 9 | int num_bins = 0; 10 | 11 | for (int i = 0; i < target_bins.size(); ++i) { 12 | dot += target_bins[i] * other_bins[i]; 13 | /*if (target_bins[i] * other_bins[i] > 0) { 14 | //cout << target_bins[i] << " * " << other_bins[i] << " = " << target_bins[i] * other_bins[i] << endl; 15 | ++num_bins; 16 | }*/ 17 | } //TODO try and compare runtime for iterator 18 | 19 | /*float m1 = 0; 20 | for (float f:target_bins) { 21 | m1 += f*f; 22 | } 23 | m1 = sqrt(m1); 24 | 25 | float m2 = 0; 26 | for (float f: other_bins) { 27 | m2 += f*f; 28 | } 29 | m2 = sqrt(m2); 30 | 31 | cout << m1 << " " << m2 << " " << dot << " " << dot / (m1 * m2) << endl;*/ 32 | //cout << " no. 
" << num_bins << " "; 33 | return dot; 34 | } 35 | -------------------------------------------------------------------------------- /src/scores.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_SCORES_H 2 | #define SIMPLE_EXAMPLE_SCORES_H 3 | #include 4 | 5 | class scores { 6 | public: 7 | static float dot_product(std::vector &target_bins, std::vector &other_bins); 8 | }; 9 | 10 | 11 | #endif //SIMPLE_EXAMPLE_SCORES_H 12 | -------------------------------------------------------------------------------- /src/search_index.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "settings.h" 6 | #include "search_manager.h" 7 | #include "thread_pool.h" 8 | 9 | using namespace std; 10 | 11 | cxxopts::ParseResult parseArgs(int argc, const char* argv[]) { 12 | try { 13 | for (int i = 0; i < argc; ++i) { 14 | settings::search_command += argv[i]; 15 | settings::search_command += " "; 16 | } 17 | settings::search_command.pop_back(); 18 | 19 | cxxopts::Options options("mistle-search", "Search experimental mass spectra in mistle fragment ion index"); 20 | 21 | options.positional_help("[optional args]").show_positional_help(); 22 | 23 | options.add_options() 24 | ("h, help", "Print this help message") 25 | ("s,search", "search file or directory ", cxxopts::value(), "PATH") 26 | ("i,index", "index directory (must contain config.txt and binary index files)", cxxopts::value(), "PATH") 27 | ("o,output", "output path", cxxopts::value()->default_value("./results.csv"), "NAME") 28 | ("t,threads", "number of threads", cxxopts::value()->default_value("1"), "NUM") 29 | ("p,ppm_tolerance", "precursor mz tolerance given in ppm", cxxopts::value()->default_value("10"), "NUM") 30 | ("m,mz_tolerance", "precursor mz tolerance (absolut value in Da)", cxxopts::value(), "NUM") 31 | ("b,bin_size", "bin size for fragment ion binning (in Da)", 
cxxopts::value()->default_value("1"), "NUM") 32 | ("hits_per_spectrum", "number of output matches per input spectrum", cxxopts::value()->default_value("1"), "NUM") 33 | ("reduce_noise_in_window", "Apply noise reduction with the top X peaks in window w approach (default: off)", cxxopts::value()->default_value("false")) 34 | ("peaks_per_window", "number of peaks per window", cxxopts::value()->default_value("5"), "NUM") 35 | ("window_size", "window size", cxxopts::value()->default_value("100.0"), "NUM"); 36 | //("neighbors", "number of neighboring bins intensity is carried over (on search spectrum peaks)", cxxopts::value()->default_value("0"), "NUM") 37 | //("neighbors_intensity_factor", "fraction [0, 1] of intensity carried over to neighboring bin(s)", cxxopts::value()->default_value("0.5"), "NUM") 38 | //("B,batch_size", "number of mass spectra loaded in a batch", cxxopts::value(), "NUM"); 39 | 40 | 41 | options.parse_positional({"search", "index"}); 42 | 43 | auto result = options.parse(argc,argv); 44 | 45 | 46 | if (result.count("help")) { 47 | std::cout << options.help() << std::endl; 48 | exit(0); 49 | } 50 | if (result.count("search")) { 51 | settings::search_path = result["search"].as(); 52 | } else { 53 | std::cerr << "Missing input: -s/--search" << std::endl; 54 | exit(1); 55 | } 56 | if (result.count("index")) { 57 | settings::index_path = result["index"].as(); 58 | if (!settings::index_path.ends_with('/')) { 59 | settings::index_path += "/"; 60 | } 61 | } else { 62 | std::cerr << "Missing input: -i/--index" << std::endl; 63 | exit(1); 64 | } 65 | settings::output_path = result["output"].as(); 66 | 67 | if (result.count("threads")) { 68 | settings::num_threads = result["threads"].as(); 69 | settings::parallel = (settings::num_threads > 1); 70 | } 71 | if (result.count("hits_per_spectrum")) { 72 | settings::num_hit_ranks = result["hits_per_spectrum"].as(); 73 | } 74 | if (result.count("mz_tolerance")) { 75 | settings::mz_tolerance = 
result["mz_tolerance"].as(); 76 | settings::use_ppm_tolerance = false; 77 | } 78 | if (result.count("ppm_tolerance")) { 79 | settings::use_ppm_tolerance = true; 80 | settings::ppm_tolerance = result["ppm_tolerance"].as(); 81 | settings::ppm_factor = settings::ppm_tolerance / 1000000.0f; 82 | if (result.count("mz_tolerance")) { 83 | cerr << "Precursor mz tolerance given in ppm and Dalton. Please choose one." << endl; 84 | exit(1); 85 | } 86 | } 87 | if (result.count("bin_size")) { 88 | settings::bin_size = result["bin_size"].as(); 89 | } 90 | if (result.count("bin_size")) { 91 | settings::bin_size = result["bin_size"].as(); 92 | } 93 | if (result.count("reduce_noise_in_window")) { 94 | settings::apply_topX_in_window_denoising = true; 95 | settings::peaks_per_window = result["peaks_per_window"].as(); 96 | settings::window_size = result["window_size"].as(); 97 | } 98 | 99 | /*if (result.count("neighbors")) { 100 | settings::neighbors = result["neighbors"].as(); 101 | settings::neighbors_intensity_factor = result["neighbors_intensity_factor"].as(); 102 | } 103 | if (result.count("batch_size")) { 104 | settings::batch_size = result["batch_size"].as(); 105 | settings::load_batches = true; 106 | }*/ 107 | 108 | return result; 109 | 110 | } 111 | catch (const cxxopts::OptionException& e) { 112 | std::cout << "error parsing options: " << e.what() << std::endl; 113 | exit(1); 114 | } 115 | } 116 | 117 | int main(int argc, const char* argv[]) { 118 | 119 | cout << "+++ Mistle Search +++" << endl; 120 | 121 | 122 | /* 123 | * Args 124 | */ 125 | 126 | #if USE_AVX_2 127 | std::cout << "USING AVX2" << endl; 128 | #endif 129 | #if USE_AVX_512 130 | std::cout << "USING AVX512" << endl; 131 | #endif 132 | 133 | parseArgs(argc, argv); 134 | 135 | auto start = chrono::high_resolution_clock::now(); 136 | 137 | 138 | /* 139 | * Preparation and Search 140 | */ 141 | 142 | search_manager sm(settings::search_path, settings::index_path); 143 | 144 | std::cout << "Preparing libraries and 
indices ..." << std::endl; 145 | sm.prepare_search_library(); 146 | std::cout << "Loading precursor index" << std::endl; 147 | auto check_point = chrono::high_resolution_clock::now(); 148 | sm.prepare_precursor_index(); 149 | std::cout << "Loading time (index): " << duration_cast(chrono::high_resolution_clock::now() - check_point).count() << " seconds" << std::endl; 150 | 151 | std::cout << "Searching fragment-ion-indices" << endl; 152 | sm.perform_searches(); 153 | std::cout << "Merging overlapping results" << std::endl; 154 | sm.merge_matches(); 155 | //std::cout << "Writing results to file" << std::endl; 156 | 157 | if (settings::output_path.ends_with(".pin")) { 158 | sm.save_search_results_in_pin_format(settings::output_path); 159 | } else { 160 | sm.save_search_results_to_file(settings::output_path); 161 | } 162 | 163 | 164 | cout << "Inner search time elapsed: " << sm.get_time_spent_in_inner_search() << " seconds" << endl; 165 | 166 | 167 | auto stop = chrono::high_resolution_clock::now(); 168 | auto duration = duration_cast(stop - start); 169 | cout << "Total time elapsed: " << duration.count() << " seconds" << endl; 170 | 171 | return 0; 172 | 173 | } -------------------------------------------------------------------------------- /src/search_manager.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_SEARCH_MANAGER_H 2 | #define SIMPLE_EXAMPLE_SEARCH_MANAGER_H 3 | 4 | #include 5 | #include 6 | #include "library.h" 7 | #include "configuration.h" 8 | #include "match.h" 9 | #include "thread_pool.h" 10 | 11 | 12 | class search_manager { 13 | 14 | std::string search_file_path; 15 | std::string index_directory_path; 16 | std::shared_ptr config; 17 | 18 | bool last_batch = true; 19 | library search_library; //TODO probably replace by simple list of spectra 20 | 21 | //Mapped ms2 ids to sub-index where they might occur 22 | std::vector> mapped_search_ids; //TODO name right (bucket = subindex) 23 | 24 | /* 
25 | * Indices 26 | */ 27 | std::shared_ptr precursor_idx; 28 | std::shared_ptr frag_idx; 29 | 30 | /* 31 | * Threading 32 | */ 33 | 34 | std::shared_ptr pool; 35 | 36 | /* 37 | * Results 38 | */ 39 | long starting_time; 40 | long total_time_elapsed; 41 | std::vector matches; 42 | 43 | /* 44 | * Timer 45 | */ 46 | 47 | std::chrono::duration inner_search_duration; 48 | 49 | public: 50 | 51 | search_manager(std::string search_file_path, std::string index_directory_path); 52 | 53 | bool prepare_search_library(); 54 | bool prepare_precursor_index(); 55 | bool perform_searches(); 56 | bool perform_searches_parallel(); 57 | bool merge_matches(); //todo probably going over ids back to front and popping matches in the back 58 | bool save_search_results_to_file(const std::string &file_path); 59 | bool save_search_results_in_pin_format(const std::string &file_path); 60 | 61 | long get_time_spent_in_inner_search(); 62 | 63 | private: 64 | /* 65 | * Search: 3-fold implementation, depending on cpu instruction level (STANDARD, AVX2, AVX512) 66 | */ 67 | bool search_spectrum(unsigned int search_id); 68 | std::vector order_of_scores(std::vector &scores); 69 | float rescore_spectrum(unsigned int search_id, unsigned int target_id); 70 | bool rescore_match_old(match &psm); 71 | bool rescore_match(match &psm); 72 | bool prepare_next_batch(); 73 | 74 | 75 | float sigma; 76 | float max_normal; 77 | static float normal_pdf(float x, float mean, float standard_deviation); 78 | static float normal_pdf_scaled(float x, float mean, float standard_deviation); 79 | static float contrast_angle(float dot); 80 | static long long unsigned int factorial(int n); 81 | static bool is_peptide_isomer(std::string &peptide, std::string &other); 82 | }; 83 | 84 | 85 | #endif //SIMPLE_EXAMPLE_SEARCH_MANAGER_H 86 | -------------------------------------------------------------------------------- /src/settings.cpp: -------------------------------------------------------------------------------- 1 | #include 
"settings.h" 2 | 3 | bool settings::parallel = false; 4 | int settings::num_threads = 4; 5 | int settings::batch_size = 100000; 6 | bool settings::load_batches = false; 7 | 8 | 9 | float settings::mz_tolerance = 3.0f; // By default unused 10 | bool settings::use_ppm_tolerance = true; 11 | float settings::ppm_tolerance = 10.0f; 12 | float settings::ppm_factor = 10.0f / 1000000.0f; 13 | float settings::bin_size = 1.0f; 14 | int settings::num_hit_ranks = 2; 15 | 16 | int settings::neighbors = 0; 17 | float settings::neighbors_intensity_factor = 0.5f; 18 | 19 | std::string settings::search_path; 20 | std::string settings::index_path; 21 | std::string settings::output_path; 22 | 23 | 24 | /* 25 | * Debugging and testing 26 | */ 27 | 28 | std::string settings::search_command; 29 | bool settings::save_search_command = true; 30 | bool settings::turn_off_fragment_intensities = false; //when loading fragment, set intensity to 1 31 | bool settings::apply_topX_in_window_denoising = false; 32 | 33 | int settings::peaks_per_window = 5; 34 | float settings::window_size = 100.f; 35 | -------------------------------------------------------------------------------- /src/settings.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_SETTINGS_H 2 | #define SIMPLE_EXAMPLE_SETTINGS_H 3 | 4 | 5 | #include 6 | 7 | class settings { 8 | 9 | public: 10 | /* 11 | * Run parameters 12 | */ 13 | 14 | static bool parallel; 15 | static int num_threads; 16 | static bool load_batches; 17 | static int batch_size; 18 | 19 | /* 20 | * Search properties 21 | */ 22 | static float mz_tolerance; 23 | static bool use_ppm_tolerance; 24 | static float ppm_tolerance; 25 | static float ppm_factor; 26 | //todo add parameters like neighbors bin scoring. sqrt normalization etc. 
27 | static float bin_size; 28 | static int num_hit_ranks; 29 | 30 | static int neighbors; 31 | static float neighbors_intensity_factor; 32 | 33 | //Names and strings 34 | static std::string search_path; 35 | static std::string index_path; 36 | static std::string output_path; 37 | 38 | /* 39 | * Debugging and testing 40 | */ 41 | static bool save_search_command; 42 | static std::string search_command; 43 | static bool turn_off_fragment_intensities; 44 | 45 | static bool apply_topX_in_window_denoising; 46 | static int peaks_per_window; 47 | static float window_size; 48 | 49 | }; 50 | 51 | 52 | #endif //SIMPLE_EXAMPLE_SETTINGS_H 53 | -------------------------------------------------------------------------------- /src/spectral_search.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "spectral_search.h" 5 | #include "scores.h" 6 | 7 | using namespace std; 8 | 9 | spectral_search::spectral_search() { 10 | 11 | } 12 | 13 | spectral_search::spectral_search(std::shared_ptr query_lib) : query_lib(query_lib) { 14 | 15 | } 16 | 17 | spectral_search::spectral_search(std::shared_ptr query_lib, std::shared_ptr target_lib) : 18 | query_lib(query_lib), target_lib(target_lib) 19 | { 20 | 21 | } 22 | 23 | bool spectral_search::search_target_library(std::shared_ptr target_lib) { 24 | this->target_lib = target_lib; 25 | return search_target_library(); 26 | } 27 | 28 | bool spectral_search::is_candidate_suitable(shared_ptr candidate_spectrum, shared_ptrquery_spectrum) { 29 | bool has_equal_charge = candidate_spectrum->charge == query_spectrum->charge; 30 | bool is_in_mass_range = abs(candidate_spectrum->precursor_mass - query_spectrum->precursor_mass) <= mz_tolerance; 31 | return has_equal_charge && is_in_mass_range; 32 | } 33 | 34 | vector spectral_search::get_results() { 35 | return search_results; 36 | } 37 | 38 | bool spectral_search::save_results_to_file(string path, string delimiter) { 39 | 
40 | fstream outfile; 41 | outfile.open(path, ios::out); 42 | 43 | if (!outfile.good()) 44 | return false; 45 | 46 | // Add header 47 | outfile << "spectrum"+delimiter+"match"+delimiter+"peptide"+delimiter+"dot-product"+delimiter+"mass-difference\n"; 48 | 49 | // Go through matches and parse relevant information for each 50 | for (int i = 0; i < search_results.size(); ++i) { 51 | match psm = search_results[i]; 52 | outfile << psm.query_spectrum->name << delimiter << psm.matched_spectrum->name << delimiter << psm.matched_spectrum->peptide << delimiter << psm.dot_product << delimiter << psm.mass_difference << endl; 53 | } 54 | 55 | outfile.close(); 56 | return true; 57 | } 58 | 59 | bool spectral_search::read_results_from_file(string path, char delimiter, bool read_dot, bool has_header) { 60 | 61 | /* 62 | * Requires file format to match delimiter separated result file as it is generated by this program 63 | * At the very least the format has to respect: name (of query_spectrum) delimiter match (name of library spectrum) 64 | */ 65 | 66 | search_results.clear(); 67 | fstream infile; 68 | 69 | infile.open(path, ios::in); 70 | if (!infile) { 71 | cerr << "Could not open file at " << path << endl; 72 | } 73 | if (read_dot) { 74 | cerr << "Reading dot-product not implemented. Do so. Proceeding without." 
<< endl; 75 | } 76 | 77 | string line; 78 | if (has_header) { 79 | getline(infile, line); 80 | } 81 | 82 | string name, match_name; 83 | while (!infile.eof()) { 84 | getline(infile, name, delimiter); 85 | getline(infile, match_name, delimiter); 86 | if (read_dot) { 87 | 88 | } 89 | getline(infile, line); 90 | 91 | // Find spectra in the libraries according to the names 92 | 93 | shared_ptr query_spectrum, matched_spectrum; 94 | for (shared_ptr &s : query_lib->spectrum_list) { //TODO run-time optimize if this takes to long 95 | if (s->name == name) { 96 | query_spectrum = s; 97 | } 98 | } 99 | for (shared_ptr &s : target_lib->spectrum_list) { //TODO run-time optimize if this takes to long 100 | if (s->name == match_name) { 101 | matched_spectrum = s; 102 | } 103 | } 104 | search_results.emplace_back(query_spectrum, matched_spectrum, -1.f, -1); 105 | 106 | 107 | } 108 | 109 | return false; 110 | } 111 | 112 | bool spectral_search::rescore_matches() { 113 | 114 | for (match &m : search_results) { 115 | m.dot_product = scores::dot_product(m.query_spectrum->bins, m.matched_spectrum->bins); 116 | } 117 | 118 | return true; 119 | } 120 | 121 | bool spectral_search::search_target_library() { 122 | search_results.clear(); 123 | if (target_lib->is_indexed) { 124 | return search_fragment_ion_index(); 125 | } 126 | cout << "Begin searching target library" << endl; 127 | 128 | for (int i = 0; i < query_lib->spectrum_list.size(); ++i) { 129 | shared_ptr query_spectrum = query_lib->spectrum_list[i]; 130 | 131 | if (i % 1000 == 0) { 132 | cout << "progress: " << i << " of " << query_lib->spectrum_list.size() << " " << (float(i) / query_lib->spectrum_list.size()) * 100 << " %" << endl; 133 | } 134 | 135 | float max_dot = -1.f; 136 | std::shared_ptr best_candidate = make_shared(); 137 | //Naive exhaustive spectral_search 138 | for (shared_ptr &candidate_spectrum : target_lib->spectrum_list) { 139 | if (is_candidate_suitable(candidate_spectrum, query_spectrum)) { 140 | float dot = 
scores::dot_product(query_spectrum->bins, candidate_spectrum->bins); 141 | if (dot >= max_dot) {//todo What if equal 142 | best_candidate = candidate_spectrum; 143 | max_dot = dot; 144 | } 145 | } 146 | } 147 | if (max_dot >= 0.0) { // if any match was found (i.e. any spectra in mz range) 148 | match best_match(query_spectrum, best_candidate, max_dot, 1); 149 | search_results.push_back(best_match); 150 | } 151 | 152 | } 153 | 154 | return true; 155 | } 156 | 157 | bool spectral_search::search_fragment_ion_index() { 158 | 159 | precursor_index *precursor_index = target_lib->precursor_idx; 160 | fragment_ion_index *fragment_ion_index = target_lib->fragment_ion_idx; 161 | 162 | 163 | cout << "Searching fragment ion index" << endl; 164 | 165 | for (int i = 0; i < query_lib->spectrum_list.size(); ++i) { 166 | // Show progress 167 | if (i % 1000 == 0) { 168 | cout << "progress: " << i << " of " << query_lib->spectrum_list.size() << " " << (float(i) / query_lib->spectrum_list.size()) * 100 << " %" << endl; 169 | } 170 | 171 | shared_ptr query_spectrum = query_lib->spectrum_list[i]; 172 | 173 | // Determine range of candidate spectra 174 | int lower_index = precursor_index->get_lower_bound(query_spectrum->charge,query_spectrum->precursor_mass - mz_tolerance); 175 | int upper_index = precursor_index->get_upper_bound(query_spectrum->charge,query_spectrum->precursor_mass + mz_tolerance); 176 | 177 | if (lower_index < 0 || upper_index < 0 || lower_index > upper_index) { // No matching precursor masses 178 | continue; 179 | } 180 | 181 | // Init candidate scores 182 | vector dot_scores(upper_index - lower_index + 1, 0.f); 183 | 184 | // Update scores by matching all peaks using the fragment ion index 185 | for (int j = 0; j < query_spectrum->binned_peaks.size(); ++j) { 186 | 187 | // Open ion mass bin for corresponding peak 188 | fragment_bin ion_bin = fragment_ion_index->fragment_bins[query_spectrum->binned_peaks[j]]; 189 | 190 | // Determine starting point of lowest 
(candidate) parent index inside bin 191 | int lower_index_inside_bin = std::lower_bound(ion_bin.begin(), ion_bin.end(), lower_index, [](fragment f, int idx) { 192 | return f.parent_id < idx; 193 | }) - ion_bin.begin(); 194 | 195 | //Update scores for all parents with fragments in the range 196 | for (int k = lower_index_inside_bin; k < ion_bin.size() && ion_bin[k].parent_id <= upper_index; ++k) { 197 | fragment f = ion_bin[k]; 198 | dot_scores[f.parent_id - lower_index] += f.intensity * query_spectrum->binned_intensities[j]; 199 | } 200 | 201 | } 202 | 203 | // Prepare best-scoring PSM 204 | int max_elem = max_element(dot_scores.begin(), dot_scores.end()) - dot_scores.begin(); 205 | float dot = dot_scores[max_elem]; 206 | int parent_index = max_elem + lower_index; 207 | std::cerr << "currently unavailable" << std::endl; 208 | //TODO match top_match(query_spectrum, precursor_index->get_spectrum(parent_index), dot, 1); 209 | 210 | //TODO search_results.push_back(top_match); 211 | } 212 | return true; 213 | } -------------------------------------------------------------------------------- /src/spectral_search.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_SPECTRAL_SEARCH_H 2 | #define SIMPLE_EXAMPLE_SPECTRAL_SEARCH_H 3 | #include "library.h" 4 | #include "match.h" 5 | #include "fragment_ion_index.h" 6 | 7 | 8 | class spectral_search { 9 | std::shared_ptr query_lib; 10 | std::shared_ptr target_lib; 11 | 12 | std::vector search_results; 13 | float mz_tolerance=3.0; 14 | 15 | public: 16 | spectral_search(); 17 | explicit spectral_search(std::shared_ptr query_lib); 18 | spectral_search(std::shared_ptr query_lib, std::shared_ptr target_lib); 19 | 20 | /* 21 | * Searching query library against a target library 22 | */ 23 | bool search_target_library(); 24 | bool search_target_library(std::shared_ptr target_lib); 25 | bool search_fragment_ion_index(); 26 | 27 | std::vector get_results(); 28 | bool 
save_results_to_file(std::string path, std::string delimiter="\t"); 29 | bool read_results_from_file(std::string path, char delimiter='\t', bool read_dot=false, bool has_header=true); 30 | 31 | /* 32 | * (Re-)scoring of matches 33 | */ 34 | bool rescore_matches(); 35 | 36 | private: 37 | bool is_candidate_suitable(std::shared_ptr candidate_spectrum, std::shared_ptr query_spectrum); //Checking if charge and mz conditions are fullfilled to warrant a closer look 38 | }; 39 | 40 | 41 | #endif //SIMPLE_EXAMPLE_SPECTRAL_SEARCH_H 42 | -------------------------------------------------------------------------------- /src/spectrum.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "spectrum.h" 5 | #include "DefineConstants.h" 6 | #include "settings.h" 7 | 8 | using namespace std; 9 | 10 | 11 | spectrum::spectrum() { 12 | 13 | } 14 | 15 | 16 | bool spectrum::bin_peaks(bool root_rescale, bool normalize) { 17 | 18 | //TODO: OBSOLETE use sparse binning instead 19 | 20 | num_bins = int((BIN_MAX_MZ - BIN_MIN_MZ) / settings::bin_size) + 1; // TODO maybe smart 21 | 22 | bins = vector(num_bins, 0.f); 23 | 24 | for (int i = 0; i < peak_positions.size(); ++i) { 25 | int bin = get_mz_bin(peak_positions[i]); 26 | if (bin < 0 || bin > bins.size() - 1) { // TODO spectraST light ion, cut_off. bin < 180 ?? 27 | //TODO what would spectrast do? 28 | //cout << "Warning peak out of bin range :: discarding intensity" << endl; 29 | bin = 0; 30 | 31 | continue; 32 | } 33 | /*if (abs(float(bin) - precursor_mass) < 10) //TODO why is this a thing? 
34 | continue;*/ 35 | if (remove_charge_reduced_precursor && spectrast_isNearPrecursor(peak_positions[i])) { 36 | continue; 37 | } 38 | 39 | float intensity = intensities[i]; 40 | if (root_rescale) 41 | intensity = sqrt(intensity); 42 | 43 | bins[bin] = sqrt(bins[bin] * bins[bin] + intensity + intensity); //sqrt-accumulate if multiple peaks fall into the sam bin 44 | if (intensity_bin_spanning_factor > 0.f) { 45 | float neighbor_intensity = intensity * intensity_bin_spanning_factor; 46 | if (bin > 0) { 47 | bins[bin-1] = sqrt(intensity * intensity + neighbor_intensity * neighbor_intensity); 48 | } 49 | if (bin < bins.size()) { 50 | bins[bin+1] = sqrt(intensity * intensity + neighbor_intensity * neighbor_intensity); 51 | } 52 | } 53 | } 54 | 55 | if (normalize) { 56 | /*float min_cut = 0.01f; //TODO undirty or delete this 57 | for (float &i : bins) { 58 | if (i m_parentMz - 60.0 && mz < m_parentMz + 20.0) { 74 | return (true); 75 | } 76 | } else {*/ 77 | 78 | int lowCharge = charge; //TODO this is the tool I am dealing with 79 | int highCharge = charge; 80 | // if (m_parentCharge == 0) { 81 | // remove all possible charge-reduced precursors for precursor charge up to 6. 
82 | lowCharge = 3; 83 | highCharge = 6; 84 | //} 85 | 86 | for (int parentCharge = lowCharge; parentCharge <= highCharge; parentCharge++) { 87 | double parentMass = precursor_mass * parentCharge; 88 | for (int c = parentCharge; c >= 1; c--) { 89 | if (mz >= (parentMass - 20.0) / (double)c && mz <= (parentMass + 6.0) / (double)c) { 90 | return (true); 91 | } 92 | } 93 | } 94 | return (false); 95 | } 96 | 97 | 98 | bool spectrum::normalize_bins(float magnitude) { 99 | if (magnitude < 0.f) { // Default, magnitude not specified 100 | magnitude = 0.f; 101 | for (float &i : bins) { 102 | magnitude += i*i; 103 | } 104 | magnitude = sqrt(magnitude); 105 | } 106 | for (float &i : bins) { 107 | i = i / magnitude; 108 | } 109 | 110 | return true; 111 | } 112 | 113 | bool spectrum::bin_peaks_sparse(bool root_rescale, bool normalize) { 114 | 115 | binned_peaks.clear(); 116 | binned_intensities.clear(); 117 | num_bins = int((BIN_MAX_MZ - BIN_MIN_MZ) / settings::bin_size) + 1; 118 | 119 | for (int i = 0; i < peak_positions.size(); ++i) { 120 | 121 | if (peak_positions[i] < BIN_MIN_MZ || peak_positions[i] > BIN_MAX_MZ) { 122 | continue; 123 | } 124 | 125 | if (remove_charge_reduced_precursor && spectrast_isNearPrecursor(peak_positions[i])) { 126 | continue; 127 | } 128 | //Determine bin 129 | int bin = get_mz_bin(peak_positions[i]); 130 | 131 | //Retrieve and scale intensity 132 | float intensity = intensities[i]; 133 | if (root_rescale) 134 | intensity = sqrt(intensity); 135 | 136 | //Update existing bin (if possible) 137 | add_intensity_to_bin(bin, intensity); 138 | for (int j = 1; j <= settings::neighbors; ++j) { //Adding intensity fraction to neighboring bins 139 | add_intensity_to_bin(bin + j, intensity * pow(settings::neighbors_intensity_factor, j)); 140 | add_intensity_to_bin(bin - j, intensity * pow(settings::neighbors_intensity_factor, j)); 141 | } 142 | 143 | } 144 | 145 | //Normalize sparse bins 146 | if (normalize) { 147 | return normalize_sparse_bins(); 148 | } 149 | 
return true; 150 | } 151 | 152 | bool spectrum::normalize_sparse_bins(float magnitude) { 153 | if (magnitude < 0.f) { // Default, magnitude not specified 154 | magnitude = 0.f; 155 | for (float &i : binned_intensities) { 156 | magnitude += i*i; 157 | } 158 | magnitude = sqrt(magnitude); 159 | } 160 | for (float &i : binned_intensities) { 161 | i = i / magnitude; 162 | } 163 | 164 | return true; 165 | } 166 | 167 | bool spectrum::operator<(const spectrum &other) const { 168 | return charge < other.charge || (charge == other.charge && precursor_mass < other.precursor_mass); 169 | } 170 | 171 | bool spectrum::operator<(pair charge_mass_tuple) const { 172 | return charge < charge_mass_tuple.first || (charge == charge_mass_tuple.first && precursor_mass < charge_mass_tuple.second); 173 | } 174 | 175 | bool spectrum::operator<=(pair charge_mass_tuple) const { 176 | return charge < charge_mass_tuple.first || (charge == charge_mass_tuple.first && precursor_mass <= charge_mass_tuple.second); 177 | } 178 | 179 | bool spectrum::root_scale_intensities() { 180 | for (float &intensity : intensities) { 181 | intensity = sqrt(intensity); 182 | } 183 | return true; 184 | } 185 | 186 | bool spectrum::normalize_intensities() { 187 | float magnitude = 0.f; 188 | for (float &i : intensities) { 189 | magnitude += i*i; 190 | } 191 | magnitude = sqrt(magnitude); 192 | 193 | for (float &i : intensities) { 194 | i = i / magnitude; 195 | } 196 | 197 | return true; 198 | } 199 | 200 | int spectrum::get_mz_bin(float mz) { 201 | int bin = int((mz - BIN_MIN_MZ) / settings::bin_size); 202 | return bin; 203 | } 204 | 205 | bool spectrum::add_intensity_to_bin(int bin, float intensity) { 206 | if (bin < 0 || bin > num_bins - 1) { 207 | return false; 208 | } 209 | const vector::iterator &bin_iter = std::find(binned_peaks.begin(), binned_peaks.end(), bin); 210 | if (bin_iter != binned_peaks.end()) { 211 | int j = bin_iter - binned_peaks.begin(); 212 | binned_intensities[j] = sqrt(binned_intensities[j] 
* binned_intensities[j] + intensity * intensity); 213 | } else { 214 | binned_peaks.push_back(bin); 215 | binned_intensities.push_back(intensity); 216 | } 217 | return true; 218 | } 219 | 220 | bool spectrum::denoise_mz_window(int topX, float window_size) { 221 | 222 | //TODO: this is a naive sliding window implementation. To find pairs of peaks with distance > window size could optimize this process -> in the future 223 | 224 | auto start = *std::min_element(peak_positions.begin(), peak_positions.end()); 225 | for (int mz = int(start); mz <= int(BIN_MAX_MZ - window_size) + 1; ++mz) { 226 | float mz_max = float(mz) + window_size; 227 | 228 | std::vector peak_idx; 229 | std::vector peak_int; 230 | std::vector delete_idx; 231 | 232 | for (int i = 0; i < peak_positions.size(); ++i) { 233 | if (peak_positions[i] >= float(mz)) { 234 | if (peak_positions[i] > mz_max) { 235 | break; 236 | } 237 | peak_idx.push_back(i); 238 | peak_int.push_back(intensities[i]); 239 | } 240 | } 241 | 242 | if (peak_idx.size() > topX) { 243 | vector peak_int_sorted = peak_int; // Exists only to retrieve intensity of X-th highest peak 244 | std::sort(peak_int_sorted.begin(), peak_int_sorted.end(),greater <>()); 245 | 246 | float ref_intensity = peak_int_sorted[topX - 1]; 247 | for (int i = 0; i < peak_idx.size(); ++i) { 248 | if (peak_int[i] < ref_intensity) { 249 | delete_idx.push_back(peak_idx[i]); 250 | } 251 | } 252 | 253 | //Do the deletion (starting from the highest index to avoid a shift in vector indices) 254 | std::sort(delete_idx.begin(), delete_idx.end(),greater <>()); 255 | 256 | for (int i : delete_idx) { 257 | peak_positions.erase(peak_positions.begin() + i); 258 | intensities.erase(intensities.begin() + i); 259 | } 260 | } 261 | 262 | } 263 | 264 | 265 | return false; 266 | } 267 | 268 | bool spectrum::normalize_intensity_vector(vector &intensities) { 269 | float magnitude = 0.f; 270 | for (float &i : intensities) { 271 | magnitude += i*i; 272 | } 273 | magnitude = 
sqrt(magnitude); 274 | 275 | for (float &i : intensities) { 276 | i = i / magnitude; 277 | } 278 | 279 | return true; 280 | } 281 | 282 | bool spectrum::normalize_intensity_vector(vector> &peaks) { 283 | float magnitude = 0.f; 284 | for (auto &p : peaks) { 285 | magnitude += p.second * p.second; 286 | } 287 | magnitude = sqrt(magnitude); 288 | 289 | for (auto &p : peaks) { 290 | p.second = p.second / magnitude; 291 | } 292 | 293 | return true; 294 | } 295 | 296 | -------------------------------------------------------------------------------- /src/spectrum.h: -------------------------------------------------------------------------------- 1 | #ifndef SPECTRAL_SEARCH_ENGINE_SPECTRUM_H 2 | #define SPECTRAL_SEARCH_ENGINE_SPECTRUM_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | 10 | class spectrum { 11 | public: 12 | int id; 13 | std::string name; 14 | std::string peptide; 15 | //string species; 16 | float precursor_mass; 17 | int charge; 18 | 19 | int search_counter = 0; 20 | 21 | /* 22 | * Raw peaks 23 | * pos[i] corresponds to intensity[i] 24 | */ 25 | 26 | std::vector peak_positions; 27 | std::vector intensities; 28 | 29 | /* 30 | * Binning and rescaling intensities 31 | */ 32 | std::vector bins; //vector over all bins (including zeros) 33 | std::vector binned_peaks; //vector listing existing peaks binned 34 | std::vector binned_intensities; //vector listing intensities corresponding to binned peaks 35 | int num_bins; 36 | 37 | //factor of intensity carried over to neighboring bins to account for mz-shifts 38 | float intensity_bin_spanning_factor = -0.5f; //set to -1.f to turn off (negative) 39 | bool remove_charge_reduced_precursor = false; //TODO uses spectrast magic function 40 | 41 | spectrum(); 42 | 43 | bool bin_peaks(bool root_rescale=false, bool normalize=false); 44 | bool bin_peaks_sparse(bool root_rescale=false, bool normalize=false); 45 | 46 | bool denoise_mz_window(int topX, float window_size); 47 | 48 | bool root_scale_intensities(); 49 | bool 
/*
 * Prints a short preview of a result vector to stdout: the first (up to) ten values on one line,
 * followed by the last (up to) ten values in reverse order on a second line. Every value is
 * followed by a single space.
 *
 * Vectors with fewer than ten elements are printed completely on both lines instead of reading
 * past the end of the vector.
 */
void print_res(std::vector<float> &res) {
    // Never show more elements than the vector actually holds
    const std::size_t shown = res.size() < 10 ? res.size() : 10;

    // Head of the vector
    for (std::size_t i = 0; i < shown; ++i) {
        std::cout << res[i] << " ";
    }
    std::cout << std::endl;

    // Tail of the vector, last element first
    for (std::size_t i = 0; i < shown; ++i) {
        std::cout << res[res.size() - 1 - i] << " ";
    }
    std::cout << std::endl;
}
/*
 * Density of the normal distribution with mean m and standard deviation s, evaluated at x.
 */
float normal_pdf(float x, float m, float s)
{
    static const float inv_sqrt_2pi = 0.3989422804014327;

    // Distance from the mean in units of the standard deviation
    const float z = (x - m) / s;
    const float exponent = -0.5f * z * z;

    return inv_sqrt_2pi / s * std::exp(exponent);
}

/*
 * Normal density at x divided by the density at the mean, i.e. scaled so that the peak value is 1.
 */
float normal_pdf_div(float x, float m, float s)
{
    const float density_at_x = normal_pdf(x, m, s);
    const float density_at_mean = normal_pdf(m, m, s);
    return density_at_x / density_at_mean;
}
115 | 116 | stop = chrono::high_resolution_clock::now(); 117 | duration = duration_cast(stop - start); 118 | cout << "Total time elapsed: " << duration.count() << " microseconds" << endl; 119 | 120 | 121 | 122 | 123 | print_res(res); 124 | res.resize(n, 0); 125 | 126 | cout << "Test m256:" << endl; 127 | start = chrono::high_resolution_clock::now(); 128 | //__m256 _scalar256 = _mm256_set1_ps(0.5f); 129 | __m256 _scalar256 = _mm256_set_ps(0.5f,0.5f,0.5f,0.5f,0.5f,0.5f,0.5f,0.5f); 130 | multiply256(_scalar256, vec[1], res); 131 | stop = chrono::high_resolution_clock::now(); 132 | duration = duration_cast(stop - start); 133 | cout << "Total time elapsed: " << duration.count() << " microseconds" << endl; 134 | 135 | print_res(res); 136 | } 137 | -------------------------------------------------------------------------------- /src/thread_pool.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "thread_pool.h" 3 | 4 | thread_pool::thread_pool(size_t n) : size(n) { 5 | start(); 6 | busy_threads = 0; 7 | } 8 | 9 | thread_pool::~thread_pool() { 10 | stop(); 11 | } 12 | 13 | void thread_pool::start() { 14 | for (size_t i = 0; i < size; ++i) { 15 | threads.emplace_back(&thread_pool::thread_waiting_loop, this); //pushback std::thread 16 | } 17 | } 18 | 19 | void thread_pool::stop() { 20 | { 21 | std::unique_lock lock(mtx_queue); 22 | request_stop = true; 23 | } 24 | 25 | event_cond.notify_all(); 26 | join_all(); 27 | 28 | } 29 | 30 | 31 | 32 | void thread_pool::enqueue(std::function task) { 33 | { 34 | std::unique_lock lock(mtx_queue); 35 | tasks.emplace(task); 36 | } 37 | event_cond.notify_one(); 38 | } 39 | 40 | void thread_pool::join_all() { 41 | for (auto &t : threads) { 42 | t.join(); 43 | } 44 | } 45 | 46 | void thread_pool::wait_for_all_threads() { 47 | std::unique_lock lock(mtx_queue); 48 | finished_cond.wait(lock, [this] { return (tasks.empty() && (busy_threads == 0)); }); 49 | } 50 | 51 | void 
thread_pool::thread_waiting_loop() { 52 | while (true) { 53 | 54 | std::function task; 55 | { //New scope: Lock and wait for action 56 | 57 | std::unique_lock lock(mtx_queue); //TODO check std:acquire defer lock 58 | event_cond.wait(lock, [this] { return !tasks.empty() || request_stop; }); 59 | if (request_stop && tasks.empty()) 60 | break; 61 | 62 | ++busy_threads; 63 | task = std::move(tasks.front()); 64 | tasks.pop(); 65 | 66 | } 67 | task(); 68 | 69 | mtx_queue.lock(); 70 | --busy_threads; 71 | finished_cond.notify_one(); 72 | mtx_queue.unlock(); 73 | 74 | } 75 | } 76 | 77 | size_t thread_pool::get_size() const { 78 | return size; 79 | } 80 | 81 | void thread_pool::add_thread() { 82 | threads.emplace_back(&thread_pool::thread_waiting_loop, this); 83 | ++size; 84 | } 85 | 86 | /* 87 | template 88 | void thread_pool::enqueue(F f, Args &&... args) { 89 | tasks.emplace( std::bind(f, std::forward(args)...) ); 90 | }*/ 91 | -------------------------------------------------------------------------------- /src/thread_pool.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_EXAMPLE_THREAD_POOL_H 2 | #define SIMPLE_EXAMPLE_THREAD_POOL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | class thread_pool { 13 | 14 | std::vector threads; 15 | std::queue> tasks; 16 | 17 | std::mutex mtx_queue; 18 | std::condition_variable event_cond; 19 | std::condition_variable finished_cond; 20 | bool request_stop = false; 21 | 22 | size_t size; 23 | int busy_threads = 0; 24 | 25 | public: 26 | explicit thread_pool(size_t n); 27 | ~thread_pool(); 28 | void start(); 29 | void stop(); 30 | 31 | 32 | void add_thread(); 33 | void wait_for_all_threads(); 34 | void join_all(); 35 | 36 | //template 37 | //void enqueue(F f, Args&&... 
args); 38 | 39 | void enqueue(std::function task); 40 | std::mutex mtx; 41 | 42 | 43 | void thread_waiting_loop(); 44 | 45 | //Getter 46 | size_t get_size() const; 47 | private: 48 | 49 | }; 50 | 51 | 52 | #endif //SIMPLE_EXAMPLE_THREAD_POOL_H 53 | --------------------------------------------------------------------------------