├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── example
├── README.md
├── example_match.png
├── example_results_control.csv
├── yeast_1000.msp
└── yeast_exp.mgf
├── include
├── LICENSE
└── cxxopts.hpp
├── scripts
├── merge_pin_output.py
└── test.csv
└── src
├── DefineConstants.h
├── build_index.cpp
├── configuration.cpp
├── configuration.h
├── fragment_ion_index.cpp
├── fragment_ion_index.h
├── index_file_reader.cpp
├── index_file_reader.h
├── index_file_writer.cpp
├── index_file_writer.h
├── indexing_manager.cpp
├── indexing_manager.h
├── library.cpp
├── library.h
├── main.cpp
├── match.cpp
├── match.h
├── mgf_reader.cpp
├── mgf_reader.h
├── msp_reader.cpp
├── msp_reader.h
├── naive_search.cpp
├── precursor_index.cpp
├── precursor_index.h
├── quick_scan.cpp
├── scanner.cpp
├── scanner.h
├── scores.cpp
├── scores.h
├── search_index.cpp
├── search_manager.cpp
├── search_manager.h
├── settings.cpp
├── settings.h
├── spectral_search.cpp
├── spectral_search.h
├── spectrum.cpp
├── spectrum.h
├── test_SIMD.cpp
├── thread_pool.cpp
└── thread_pool.h
/.gitignore:
--------------------------------------------------------------------------------
1 | **/.idea
2 | **/cmake-*
3 | **/build
4 | **/pBuild
5 | **/.vscode
6 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build script for Mistle: produces the two command-line tools
# `mistle-build` (index construction) and `mistle-search` (spectral search).
cmake_minimum_required(VERSION 3.13)
project(mistle LANGUAGES CXX)

# Mistle is written against C++20. Fail at configure time instead of silently
# falling back to an older standard when the compiler cannot provide it.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

option(AVX_2 "Add compiler flags to support avx2 instructions" OFF)
option(AVX_512 "Add compiler flags to support avx512 instruction" OFF)

# Select the SIMD flag set. When both options are ON, AVX-512 wins (the AVX-512
# flag set already contains -mavx2); only one USE_AVX_* macro is ever defined.
if(AVX_512)
  set(AVX_COMPILE_FLAGS "-mavx2 -march=skylake-avx512 -mavx512f -DUSE_AVX_512=true")
elseif(AVX_2)
  set(AVX_COMPILE_FLAGS "-mavx2 -march=skylake -DUSE_AVX_2=true")
endif()

# Append to CMAKE_CXX_FLAGS rather than overwriting it, so flags passed by the
# user (-DCMAKE_CXX_FLAGS=... or the CXXFLAGS environment variable) survive.
# The flags stay in CMAKE_CXX_FLAGS (not target_compile_options) so that their
# position relative to the per-configuration flags is unchanged.
string(APPEND CMAKE_CXX_FLAGS " -Ofast ${AVX_COMPILE_FLAGS}")

# Threading support; with THREADS_PREFER_PTHREAD_FLAG the imported target
# passes -pthread to both the compile and the link step where available.
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

# Translation units (and their headers) shared by both executables.
set(MISTLE_COMMON_SOURCES
  src/configuration.cpp
  src/configuration.h
  src/fragment_ion_index.cpp
  src/fragment_ion_index.h
  src/index_file_reader.cpp
  src/index_file_reader.h
  src/index_file_writer.cpp
  src/index_file_writer.h
  src/indexing_manager.cpp
  src/indexing_manager.h
  src/mgf_reader.cpp
  src/mgf_reader.h
  src/msp_reader.cpp
  src/msp_reader.h
  src/precursor_index.cpp
  src/precursor_index.h
  src/settings.cpp
  src/settings.h
  src/spectrum.cpp
  src/spectrum.h
  src/thread_pool.cpp
  src/thread_pool.h
)

# mistle-build: constructs the fragment ion index from a spectral library.
add_executable(mistle-build
  src/build_index.cpp
  ${MISTLE_COMMON_SOURCES}
)

# mistle-search: searches experimental spectra against a previously built index.
add_executable(mistle-search
  src/search_index.cpp
  src/library.cpp
  src/library.h
  src/match.cpp
  src/match.h
  src/search_manager.cpp
  src/search_manager.h
  ${MISTLE_COMMON_SOURCES}
)

# Per-target usage requirements: the bundled header-only cxxopts argument
# parser lives in include/, and both tools need the threading library.
foreach(mistle_target IN ITEMS mistle-build mistle-search)
  target_include_directories(${mistle_target}
    PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
  target_link_libraries(${mistle_target} PRIVATE Threads::Threads)
endforeach()

# Note: src/main.cpp, src/quick_scan.cpp, src/naive_search.cpp and
# src/test_SIMD.cpp are experimental entry points that are not built.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (2022) [Yannek Nowatzky, Bundesanstalt für Materialforschung und -prüfung (BAM)]
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Mistle
2 |
3 | Mistle is a fast spectral search engine. It uses a fragment-indexing technique and SIMD intrinsics to match experimental MS2 spectra to large spectral libraries at a high performance. Find out more about Mistle in our publication:
4 |
5 | >**Mistle: bringing spectral library predictions to metaproteomics with an efficient search index**
6 | > Yannek Nowatzky, Philipp Benner, Knut Reinert, Thilo Muth
7 | > Bioinformatics, Volume 39, Issue 6, June 2023, btad376, https://doi.org/10.1093/bioinformatics/btad376
8 |
9 | Please use the above citation, if you are using Mistle.
10 |
11 | ## Requirements
12 | Tested only on linux (debian) for the specified versions:
13 |
14 | * C++20
15 | * CMake (version 3.19.3)
16 | * g++ (10.2.1)
17 |
18 | ## Build
19 |
20 | For building the project, please create (mkdir) a separate build directory. Change into the build directory and run:
21 |
22 | cmake /path/to/mistle/
23 | cmake --build .
24 |
25 | To make use of the AVX2 or AVX-512 SIMD instructions, configure the build with the CMake option -DAVX_2=ON or -DAVX_512=ON (e.g. *cmake -DAVX_2=ON /path/to/mistle/*). Check first whether your CPU supports these instruction sets. If necessary, adjust CMakeLists.txt to match your CPU.
26 |
27 | Optionally, export the directory where *mistle* was built as an executable PATH in the *~/.bashrc* file. Add the following line:
28 |
29 | export PATH="/home/$USER/path/to/mistle/build:$PATH"
30 |
31 |
32 | ## Usage
33 |
34 | ### Mistle build
35 |
36 | Build Mistle's fragment ion index from spectral library.
37 |
38 | mistle-build -i /path/to/library/ -o /path/to/index/ [optional args]
39 |
40 | Required arguments are the input directory, which must contain spectral library files (.msp or .mgf format), and the output directory for the fragment index.
41 |
42 | ### Mistle search
43 |
44 | Search experimental mass spectra in Mistle's fragment ion index.
45 |
46 |
47 | mistle-search -s /path/to/search_file.mgf -i /path/to/index/ [optional args]
48 |
49 | Required arguments are the search file (.mgf or .msp format) and the path to the fragment index. Additionally, output directory and formats can be specified as well as various search parameters. Use *-h* flag to print the help message for more information. Also, refer to the [EXAMPLE README](example/README.md) and the example directory to test the program.
50 |
51 | ## Output format
52 |
53 | Peptide spectrum matches (PSMs) are provided in tab separated format.
54 | First line (comment tagged by #) names the exact shell command and parameters used to produce the output.
55 |
56 | The next line is the header listing all tracked attributes (tab separated).
57 |
58 | id spectrum charge hit_rank match peptide isomers similarity bias [...]
59 |
60 | A large number of scores and statistics are appended as additional columns (marked [...]). A detailed explanation of the scores can be found in the next section.
61 |
62 | Below the header, all matched experimental spectra are listed and indexed by their scan name and the rank of the matched library spectrum (rank R is appended as /R to the scan name). See the example [output](example/example_results_control.csv).
63 |
64 | Alternatively, a pin-tab format that is readable by Percolator (Käll *et al.*, 2007) can be produced, listing the same scores as features. To obtain this output format, the user needs to specify the output path (*-o*) during mistle-search with the file extension *.pin*. Note that the library label needs to be set correctly at index construction (1: target, -1: decoy library) and the *results.pin* files of target and decoy search need to be concatenated or merged before using Percolator. It is recommended to use this Python [script](scripts/merge_pin_output.py) to merge the query results and correctly update delta scores.
65 |
66 | ## Scores
67 |
68 | *Similarity* is the preferred baseline score, which is a refined version of the normalized dot product based on square root transformed peak intensities. A *bias* measurement highlights how biased the *similarity* is on a few matching peaks, and a *delta_similarity* score describes the *similarity* difference between the top hit and second-best hit. Additionally, an *annotation_similarity* version of these scores exists, which accounts only for peak intensities matching reference peaks. This is useful when the library consists of fewer annotated or predicted peaks and is less noisy than the query spectra.
69 |
70 | As a high-quality discriminant scoring function we suggest the *avg_bias_adjusted_similarity*, which is composed equally of the *similarity* and *annotation similarity* metrics. Specifically, a *bias-adjusted similarity* (*sim2*) is calculated by the product of *similarity* and *(1-bias)* and is averaged between standard and annotation version. This scoring function provides excellent discrimination between target and decoy matches.
71 |
72 |
73 |
74 |
75 | ## Known issues
76 |
77 | ### On linux
78 |
79 | Input files coming from Windows distributions may have lines ending with \r\n (carriage return followed by line feed). Linux and Mistle require \n as the exclusive line ending.
80 | Remove the \r character (char 13) using the following command line:
81 | * *tr -d '\r' < FILE.mgf > FILE_FIXED.mgf*
82 |
83 |
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
1 | # Mistle Example Usage
2 |
3 | A toy example is provided to test the program. A library of 1000 simulated mass spectra (predicted by Prosit (Gessulat et al., 2019)) of yeast peptides (*Saccharomyces cerevisiae*) is used for reference. 3 experimental spectra matching the species are selected from the 9MM FASP dataset (Tanca et al., 2013).
4 |
5 | ## Running the test
6 |
7 | Open terminal or change into this directory (path/to/mistle/example). Create a new directory for the index with *mkdir index*. To construct the fragment index from the example spectral library, run
8 |
9 | mistle-build -i yeast_1000.msp -o index/ -n 4 -t 1
10 |
11 | This should create 4 index partitions (binary format) in the index directory, a precursor index file, and a human-readable config.txt. Note that if the PATH to mistle-build and mistle-search is not exported, the user is required to specify the executables (e.g. */path/to/mistle/build/mistle-build [...]*).
12 |
13 | Next, perform example searches
14 |
15 | mistle-search -s yeast_exp.mgf -i index/ -o example_results.csv -p 10 -b 0.2 -t 1 --hits_per_spectrum 1
16 |
17 | This should produce an *example_results.csv* file in the current directory. Compare the output to the *example_results_control.csv*, which already resides in this directory. If they align, *mistle* is configured correctly and is ready to use. (Note that floating-point inaccuracy may occur when using different hardware or advanced vector extensions. Results and scores may deviate from the control in the last digit(s)).
18 |
19 | Below, the first match of the list is displayed by a mirror plot (experimental spectrum top; matched simulated spectrum bottom) generated using the python *spectrum_utils* package.
20 |
21 |
22 |
--------------------------------------------------------------------------------
/example/example_match.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BAMeScience/Mistle/ca43b8c3ad98163827a49d1ad16b17360cefb8bb/example/example_match.png
--------------------------------------------------------------------------------
/example/example_results_control.csv:
--------------------------------------------------------------------------------
1 | #mistle-search -s yeast_exp.mgf -i index/ -o example_results.csv -p 10 -b 0.2 -t 1 --hits_per_spectrum 1
2 | id spectrum charge hit_rank match peptide isomers similarity bias annotation_similarity annotation_bias dot_product delta_dot delta_similarity delta_sim2 mass_difference peak_count_query peak_count_ref sim2 x_score x_score_dot x_lgamma x_lgamma_dot st_score st_score_dot
3 | 9MM_FASP.42839.42839.2/1 9MM_FASP.42839.42839.2 2 1 997 DVAAQDFINAYASFLQR 0.601264 0.283916 0.965076 0.283916 0.603536 0.56061 0.558355 0.415923 -0.000671387 1000 21 0.430556 8.53642e+18 8.56868e+18 41.8269 41.8307 0.732213 0.733672
4 | 9MM_FASP.40363.40363.2/1 9MM_FASP.40363.40363.2 2 1 998 TAGIQIVADDLTVTNPAR 0.603145 0.291692 0.911218 0.291692 0.580075 0.51632 0.531702 0.393204 -0.00201416 1000 18 0.427212 3.86156e+15 3.71386e+15 32.9995 32.9605 0.714507 0.704081
5 | 9MM_FASP.16147.16147.2/1 9MM_FASP.16147.16147.2 2 1 999 TNEAAGDGTTSATVLGR 0.652331 0.293515 0.935983 0.293515 0.639207 0.639207 0.652331 0.460862 0 1000 20 0.460862 1.58706e+18 1.55513e+18 38.9127 38.8924 0.791398 0.783524
6 |
--------------------------------------------------------------------------------
/example/yeast_exp.mgf:
--------------------------------------------------------------------------------
1 | BEGIN IONS
2 | TITLE=9MM_FASP.42839.42839.2
3 | RTINSECONDS=16250.9127
4 | PEPMASS=964.979370117188 3021333.001953000203
5 | CHARGE=2+
6 | 101.0707016 33087.515625
7 | 110.0712433 4736.6293945313
8 | 112.0866013 6470.267578125
9 | 115.0863647 21151.966796875
10 | 116.0705643 5116.921875
11 | 118.8389282 11158.431640625
12 | 118.9499054 4830.1323242188
13 | 120.080574 47082.66015625
14 | 127.0864334 4406.6918945313
15 | 129.1019897 24503.693359375
16 | 131.4036865 5022.0922851563
17 | 136.0753479 20044.265625
18 | 142.0860596 5587.869140625
19 | 143.0812683 33124.98828125
20 | 155.0811462 5163.4438476563
21 | 158.0922546 13841.7548828125
22 | 160.1084595 8320.451171875
23 | 160.5261078 5044.71875
24 | 169.0962067 5223.8173828125
25 | 171.1124725 9784.7001953125
26 | 175.1188354 78650.90625
27 | 183.0763702 9102.408203125
28 | 183.149292 6998.7641601563
29 | 185.1283875 4452.4750976563
30 | 186.0867462 12476.5283203125
31 | 187.0678558 4350.3198242188
32 | 187.1074982 168854.46875
33 | 188.1114807 9722.388671875
34 | 197.1283569 8607.0908203125
35 | 199.1074982 4636.8251953125
36 | 200.1027679 35163.23828125
37 | 200.1387634 4821.775390625
38 | 201.1234436 9745.8994140625
39 | 207.1132355 5408.6245117188
40 | 211.1442719 4847.7451171875
41 | 215.102478 205652.546875
42 | 216.1076813 15140.7451171875
43 | 226.0823364 11055.17578125
44 | 226.1181183 7858.3627929688
45 | 227.0666809 12251.2333984375
46 | 228.1340485 10020.7353515625
47 | 229.1179047 7222.828125
48 | 233.1644745 14039.5283203125
49 | 235.1073761 26386.353515625
50 | 241.1179199 9526.1181640625
51 | 242.1497345 16059.6181640625
52 | 243.1462708 4499.3676757813
53 | 244.0925293 63368.921875
54 | 245.0948639 6055.5258789063
55 | 254.1498718 4388.76171875
56 | 258.1073608 4900.2783203125
57 | 261.1590881 9499.5302734375
58 | 263.102356 19258.388671875
59 | 268.1455994 5136.5786132813
60 | 269.1246643 9959.6552734375
61 | 270.144928 14568.9599609375
62 | 271.1398315 26787.990234375
63 | 286.138092 149259.984375
64 | 287.1423645 17651.451171875
65 | 288.1345215 7673.8784179688
66 | 298.1027832 17691.439453125
67 | 299.1706543 10069.3173828125
68 | 303.1766357 25486.693359375
69 | 304.1318665 4636.7861328125
70 | 306.1441956 9795.7177734375
71 | 312.1547241 4983.0
72 | 315.129364 60298.390625
73 | 316.1318054 7792.0830078125
74 | 322.1389771 6632.0043945313
75 | 328.1326599 4369.7392578125
76 | 329.1812439 5224.4736328125
77 | 334.1390686 5000.8452148438
78 | 349.1504211 16053.3134765625
79 | 357.1764832 39490.07421875
80 | 358.1777649 5851.5483398438
81 | 369.1389465 6786.1235351563
82 | 373.1526794 5985.4814453125
83 | 374.1323242 7837.0854492188
84 | 374.2080994 4354.1879882813
85 | 375.2001343 9772.779296875
86 | 376.1856689 6498.8842773438
87 | 378.1285706 8469.3818359375
88 | 382.2089539 8304.3955078125
89 | 386.1663208 42721.83984375
90 | 387.1668701 5545.8603515625
91 | 388.1896057 4768.796875
92 | 391.1607666 25515.9765625
93 | 399.2348633 15695.8251953125
94 | 416.2614136 22494.701171875
95 | 417.1773987 6709.1762695313
96 | 419.2272949 4557.974609375
97 | 420.1894226 9260.6181640625
98 | 434.2037659 7486.6000976563
99 | 441.2110901 7759.9013671875
100 | 445.1200256 69070.4296875
101 | 446.2433472 5370.7470703125
102 | 451.1963501 4808.5693359375
103 | 459.2232361 8310.0439453125
104 | 462.193634 9719.5283203125
105 | 485.2359619 5535.1767578125
106 | 487.2197571 5813.396484375
107 | 488.2176514 5587.8901367188
108 | 505.2393188 5599.9204101563
109 | 507.2207947 4686.4897460938
110 | 546.3031006 6769.8666992188
111 | 561.2654419 11778.8291015625
112 | 563.328186 18782.712890625
113 | 600.2608032 10594.6923828125
114 | 601.2592773 4544.8671875
115 | 636.2729492 4648.42578125
116 | 650.361084 45236.2578125
117 | 651.3654785 9676.509765625
118 | 672.2941284 7039.6684570313
119 | 721.3981323 60447.5703125
120 | 722.4011841 17896.1328125
121 | 884.4619141 68769.625
122 | 885.4655151 33357.15234375
123 | 886.46698 5799.1528320313
124 | 955.4975586 63005.56640625
125 | 956.5010376 27270.22265625
126 | 957.5122681 4905.5834960938
127 | 1052.522339 21506.96484375
128 | 1053.520386 9743.3623046875
129 | 1069.541504 163773.625
130 | 1070.543945 88356.46875
131 | 1071.550537 18056.533203125
132 | 1080.509766 11126.55859375
133 | 1081.516235 5718.029296875
134 | 1165.599854 5571.5815429688
135 | 1182.626831 79612.2578125
136 | 1183.624268 50932.83203125
137 | 1184.629395 12699.2744140625
138 | 1193.590332 8337.3564453125
139 | 1312.669067 4801.9868164063
140 | 1329.693115 64695.46875
141 | 1330.695923 45749.37890625
142 | 1331.692749 16048.2177734375
143 | 1427.700439 8999.4677734375
144 | 1428.711182 6104.9599609375
145 | 1444.722534 70359.890625
146 | 1445.723999 57531.28125
147 | 1446.71814 17378.087890625
148 | 1554.77771 4594.904296875
149 | 1555.751587 13335.068359375
150 | 1556.755737 11739.7900390625
151 | 1572.781494 28325.662109375
152 | 1573.781494 22906.359375
153 | 1574.783325 8739.1484375
154 | 1643.809448 14282.5986328125
155 | 1644.825073 10837.4716796875
156 | END IONS
157 | BEGIN IONS
158 | TITLE=9MM_FASP.40363.40363.2
159 | RTINSECONDS=15496.7056
160 | PEPMASS=928.001647949219 6510079.84375
161 | CHARGE=2+
162 | 101.0708542 77888.9140625
163 | 110.0711594 19265.568359375
164 | 112.0866776 8511.8994140625
165 | 120.0806885 17434.61328125
166 | 129.0660095 12940.408203125
167 | 129.1021881 51164.7109375
168 | 130.2890015 15493.1015625
169 | 130.4004669 8281.0634765625
170 | 131.1177673 8266.388671875
171 | 136.0758362 15647.880859375
172 | 141.1023254 8588.7392578125
173 | 141.2971344 18972.61328125
174 | 141.4078522 9020.203125
175 | 143.1179199 126126.296875
176 | 145.097168 64125.0078125
177 | 153.8619843 7908.4111328125
178 | 155.0814362 9177.904296875
179 | 157.1086884 7781.240234375
180 | 158.0924683 11019.87109375
181 | 169.097168 13650.4208984375
182 | 169.1336365 12272.03125
183 | 171.1129761 59545.9921875
184 | 173.0922089 131033.53125
185 | 173.1264801 7183.7651367188
186 | 175.1191406 79029.640625
187 | 183.1134796 11350.1455078125
188 | 185.0920715 9108.482421875
189 | 185.1648865 10192.673828125
190 | 186.1235504 9753.572265625
191 | 188.1394806 9003.4716796875
192 | 196.1446686 18823.2421875
193 | 197.1286163 32228.150390625
194 | 201.1239166 19130.03125
195 | 212.1031647 34801.296875
196 | 213.1598969 12924.912109375
197 | 214.1550903 76166.0234375
198 | 215.1400452 8345.6455078125
199 | 216.0982056 7945.4721679688
200 | 224.1395874 16281.8115234375
201 | 225.122879 9684.4462890625
202 | 229.1320038 8665.00390625
203 | 230.1137543 314596.1875
204 | 231.1168671 27428.09765625
205 | 241.1549072 7073.8876953125
206 | 242.1500092 110686.9375
207 | 243.1539612 7429.3100585938
208 | 254.1505127 11942.2900390625
209 | 259.1763916 8622.40234375
210 | 283.1408997 8339.05078125
211 | 283.1772461 9583.833984375
212 | 286.1399231 10952.8779296875
213 | 296.1983032 9854.3203125
214 | 297.1914673 11624.509765625
215 | 298.175415 16445.005859375
216 | 299.1714478 88843.1015625
217 | 300.175415 9889.9638671875
218 | 302.0975342 15737.2451171875
219 | 308.1587524 7444.9565429688
220 | 315.2029114 131341.859375
221 | 316.206604 14992.513671875
222 | 323.2087402 7304.1611328125
223 | 324.1923523 16106.66796875
224 | 325.1870728 31081.755859375
225 | 326.183075 86579.640625
226 | 327.1889038 7246.5234375
227 | 329.1815491 7393.8139648438
228 | 340.1617432 10642.8330078125
229 | 341.1447144 11160.146484375
230 | 341.2179871 15938.5361328125
231 | 343.2012939 83693.2890625
232 | 344.2012024 13296.505859375
233 | 358.1729126 18888.26171875
234 | 360.224823 15558.7197265625
235 | 367.2328186 7427.3828125
236 | 370.2089539 29496.71484375
237 | 382.2107239 7742.3227539063
238 | 397.2044067 7452.1977539063
239 | 398.2061462 9965.7998046875
240 | 401.1687622 13184.6484375
241 | 411.2567749 10920.513671875
242 | 412.2570801 18743.740234375
243 | 415.1811523 9001.271484375
244 | 425.2499695 10866.1943359375
245 | 426.2323303 12793.5400390625
246 | 427.1837463 7111.9663085938
247 | 436.2202759 12748.3720703125
248 | 440.2252502 42649.328125
249 | 443.2632751 10977.650390625
250 | 445.1200256 57425.515625
251 | 445.9109497 7194.9560546875
252 | 453.2455444 42437.90234375
253 | 454.2271729 19238.376953125
254 | 457.2518005 33785.8515625
255 | 471.2563782 48694.91796875
256 | 472.262207 10564.0478515625
257 | 498.2193298 14104.1591796875
258 | 514.2525024 7893.763671875
259 | 516.2279053 12113.2705078125
260 | 539.3215332 9356.349609375
261 | 556.3502197 11642.4951171875
262 | 558.2995605 52882.86328125
263 | 559.3012085 14281.9990234375
264 | 566.3325806 17119.720703125
265 | 584.3406982 16211.431640625
266 | 597.2859497 19063.87109375
267 | 598.2902832 8193.8447265625
268 | 615.2963867 11352.3564453125
269 | 657.3684082 32348.044921875
270 | 698.336792 7151.2465820313
271 | 758.414978 53055.1796875
272 | 759.4221191 14875.1953125
273 | 818.3222656 13256.541015625
274 | 827.4797974 10391.4892578125
275 | 871.5007935 145674.875
276 | 872.5050049 58906.9921875
277 | 873.5021362 9140.0087890625
278 | 942.4938354 12114.091796875
279 | 969.5022583 10454.162109375
280 | 986.5270386 117214.0234375
281 | 987.5280151 52795.1796875
282 | 988.5264893 8640.2451171875
283 | 1084.534302 10026.9853515625
284 | 1101.552979 81097.75
285 | 1102.554688 52823.890625
286 | 1103.547485 8687.359375
287 | 1128.561157 11452.8505859375
288 | 1154.578247 10374.5087890625
289 | 1155.570435 8706.462890625
290 | 1172.592163 229647.375
291 | 1173.592407 125851.75
292 | 1174.594604 31559.689453125
293 | 1227.639038 14841.5625
294 | 1228.633301 7910.3549804688
295 | 1253.651978 13080.9677734375
296 | 1254.654053 7995.7016601563
297 | 1271.658203 261442.078125
298 | 1272.661621 145139.15625
299 | 1273.659424 35773.10546875
300 | 1340.716064 9193.5107421875
301 | 1384.742798 117004.96875
302 | 1385.744629 78891.1953125
303 | 1386.742676 24188.56640625
304 | 1469.788574 7112.1069335938
305 | 1494.796021 11901.818359375
306 | 1495.778198 39154.66796875
307 | 1496.779297 23381.884765625
308 | 1497.790405 7938.263671875
309 | 1512.80188 53807.27734375
310 | 1513.803589 48195.265625
311 | 1514.80127 16152.77734375
312 | END IONS
313 | BEGIN IONS
314 | TITLE=9MM_FASP.16147.16147.2
315 | RTINSECONDS=6778.9345
316 | PEPMASS=810.894836425781 172010.068847699993
317 | CHARGE=2+
318 | 101.0707703 1217.3948974609
319 | 102.0546494 1171.0164794922
320 | 106.9486847 583.6018066406
321 | 110.0708237 405.9848327637
322 | 111.0064316 301.827331543
323 | 111.4262161 295.3147277832
324 | 112.0869141 497.2803039551
325 | 113.0343399 1190.6411132813
326 | 116.0333099 338.2010192871
327 | 120.0800018 1300.7055664063
328 | 126.0643921 662.5599975586
329 | 126.1747437 415.7754211426
330 | 127.0862961 320.0195922852
331 | 129.0657196 2649.853515625
332 | 129.102005 2786.2429199219
333 | 133.5157471 317.4209289551
334 | 136.0754089 1103.1275634766
335 | 141.8504944 435.1179504395
336 | 143.0809021 958.8742675781
337 | 147.1132355 589.1596069336
338 | 149.8830109 375.5520324707
339 | 155.0826111 599.940246582
340 | 158.0915985 509.8851928711
341 | 169.0982056 368.4065246582
342 | 171.0762177 1581.8051757813
343 | 172.0719452 1258.7937011719
344 | 173.0920105 1573.6694335938
345 | 175.1190796 3150.8952636719
346 | 181.0608063 1131.0991210938
347 | 181.5852051 303.6040649414
348 | 181.5953674 315.2122497559
349 | 183.0770569 508.908416748
350 | 185.0924988 516.541809082
351 | 188.1027527 1764.9710693359
352 | 198.0875244 1088.6629638672
353 | 199.0714264 2894.9458007813
354 | 201.0868835 1389.2589111328
355 | 212.1040955 513.1270751953
356 | 215.1139832 532.7591552734
357 | 216.0977478 15811.8017578125
358 | 216.6444855 345.5026550293
359 | 217.0833435 602.8255615234
360 | 217.1008911 877.2660522461
361 | 220.3491211 316.3650817871
362 | 226.0816956 1178.1096191406
363 | 226.118576 611.6033325195
364 | 232.1404572 2523.6206054688
365 | 233.1648254 1126.3984375
366 | 242.1139069 897.2562866211
367 | 244.0923615 4751.0708007813
368 | 254.1131744 750.9210205078
369 | 258.1089783 1152.3553466797
370 | 272.1256409 552.3010864258
371 | 280.0917664 390.0112915039
372 | 282.1081238 466.878692627
373 | 294.1807861 980.2778320313
374 | 297.1178589 826.6633911133
375 | 297.1839905 313.9739379883
376 | 298.1026611 1294.8156738281
377 | 299.9552612 316.6298217773
378 | 310.1011658 638.9633178711
379 | 311.1316223 479.184753418
380 | 315.129303 2370.4162597656
381 | 317.1424255 894.8659667969
382 | 325.1171875 753.084777832
383 | 327.1298828 1602.0756835938
384 | 328.1131592 3785.6862792969
385 | 328.1950378 747.895324707
386 | 330.846283 368.969543457
387 | 343.1257019 390.5220947266
388 | 343.1565247 385.1192321777
389 | 345.1400757 8732.6298828125
390 | 345.2227783 2003.0583496094
391 | 346.1449585 1156.5886230469
392 | 350.3141174 399.9468688965
393 | 357.1445007 509.2389526367
394 | 371.1553345 2326.8596191406
395 | 381.1363525 609.2717895508
396 | 398.1668091 969.7222290039
397 | 399.150116 2073.6362304688
398 | 405.2097778 697.6733398438
399 | 416.1773682 1853.4498291016
400 | 428.1824646 350.5044555664
401 | 444.2934875 1221.3486328125
402 | 444.7707214 764.6782836914
403 | 445.7382813 395.6271362305
404 | 445.9076843 754.2020263672
405 | 483.0623169 354.065032959
406 | 487.2117615 509.4663391113
407 | 536.208313 544.1396484375
408 | 545.3380737 1747.8859863281
409 | 546.3411865 562.4497680664
410 | 550.7709961 522.1591186523
411 | 616.3764038 1612.9791259766
412 | 617.3740845 382.4893188477
413 | 623.2409668 376.6261291504
414 | 703.4082031 2703.6765136719
415 | 704.4126587 830.151550293
416 | 804.4575806 3665.4167480469
417 | 805.4550171 1032.9884033203
418 | 905.5076904 1447.0906982422
419 | 918.5022583 585.7772216797
420 | 944.5067749 1127.22265625
421 | 962.5265503 9907.2578125
422 | 963.5253296 4811.33203125
423 | 964.5343018 996.1773681641
424 | 1077.550171 2585.9892578125
425 | 1078.551758 1037.4976806641
426 | 1090.558472 704.0826416016
427 | 1116.557373 1197.2247314453
428 | 1117.574219 465.5159301758
429 | 1134.574829 9586.7255859375
430 | 1135.577026 5311.3666992188
431 | 1136.590942 1415.9747314453
432 | 1161.610352 662.1987304688
433 | 1187.596802 1224.1402587891
434 | 1205.609131 7781.3291015625
435 | 1206.610596 4323.6279296875
436 | 1207.613403 1575.6195068359
437 | 1276.649414 3899.642578125
438 | 1277.644653 2981.9809570313
439 | 1278.64978 850.1265258789
440 | 1387.694702 504.8869018555
441 | 1405.680786 840.313293457
442 | 1406.700195 1185.7927246094
443 | 1522.564087 399.8696899414
444 | END IONS
445 |
--------------------------------------------------------------------------------
/include/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014 Jarryd Beck
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/scripts/merge_pin_output.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas as pd
3 | import argparse
4 | import random
5 | from matplotlib import pyplot as plt
6 |
def parse_args(argv=None):
    """Parse the command-line options of the target/decoy .pin merger.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse falls back to sys.argv[1:], which matches the behaviour
            of the original zero-argument call.

    Returns:
        argparse.Namespace with the attributes ``target``, ``decoy``,
        ``output``, ``score``, ``update_delta_scores``, ``main_features``
        and ``drop_redundant_features``.

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            an unknown option is supplied.
    """
    parser = argparse.ArgumentParser()

    # Required input/output paths.
    parser.add_argument("-t", "--target",
                        help="target search results file (.pin) format",
                        type=str, required=True)
    parser.add_argument("-d", "--decoy",
                        help="decoy search results file (.pin) format",
                        type=str, required=True)
    parser.add_argument("-o", "--output",
                        help="output file (.pin) format",
                        type=str, required=True)

    # Column used to decide the target/decoy competition.
    parser.add_argument("--score",
                        help="discriminant score for target decoy competition",
                        type=str, default="avg_bias_adjusted_similarity")

    # Boolean switches; all default to False.
    parser.add_argument("--update_delta_scores",
                        help="Update delta scores when target/decoy are 1st and 2nd ranked hit",
                        action='store_true')
    parser.add_argument("--main_features",
                        help="Track main features only. Otherwise all features will be tracked, which can reduce separation performance.",
                        action='store_true')
    # NOTE(review): the original help text here was a verbatim copy of the
    # --main_features help; confirm the wording against the code consuming it.
    parser.add_argument("--drop_redundant_features",
                        help="Drop redundant features.",
                        action='store_true')

    return parser.parse_args(argv)
36 |
37 |
def update_delta_scores(df1, idx1, df2, idx2):
    """Recompute the delta features of one match against a competing match.

    The row ``idx1`` of ``df1`` is compared with the row ``idx2`` of ``df2``.
    When the stored ``delta_avg`` is larger than the gap between the two
    ``avg_bias_adjusted_similarity`` values, every delta column of the
    ``df1`` row is overwritten (in place) with the difference to the ``df2``
    row. Otherwise nothing is changed. Returns None.
    """
    competitor_gap = (df1.at[idx1, "avg_bias_adjusted_similarity"]
                      - df2.at[idx2, "avg_bias_adjusted_similarity"])
    if not df1.at[idx1, "delta_avg"] > competitor_gap:
        return

    # (score column, delta column that stores the difference of that score)
    score_to_delta = (
        ("similarity", "delta_similarity"),
        ("dot_product", "delta_dot"),
        ("annotation_similarity", "delta_annotation_similarity"),
        ("sim2", "delta_sim2"),
    )
    for score_col, delta_col in score_to_delta:
        df1.at[idx1, delta_col] = df1.at[idx1, score_col] - df2.at[idx2, score_col]
    df1.at[idx1, "delta_avg"] = competitor_gap
46 |
def merge_files(args):
    """Run target-decoy competition on two .pin files and write the merged table.

    Both tab-separated files are read with pandas (lines starting with ``#``
    are treated as comments). Rows containing NaN are dropped. For every
    ``ScanNr`` present in both tables, one of the two matches is removed:

    * the decoy, when both peptides are equal after replacing ``L`` by ``I``;
    * otherwise the match with the lower ``args.score`` value (the target
      loses on a tie).

    The survivors are concatenated (targets first) and written to
    ``args.output`` as a tab-separated file.

    Args:
        args: namespace with ``target``, ``decoy``, ``output``, ``score``,
            ``update_delta_scores``, ``main_features`` and
            ``drop_redundant_features``.

    Raises:
        SystemExit: with code 1 when a ScanNr occurs more than once in the
            decoy table.
    """
    print("+++ Merging target and decoy results (.pin format) +++")
    df = pd.read_csv(args.target, sep='\t', comment='#', low_memory=False)
    df_decoy = pd.read_csv(args.decoy, sep='\t', comment='#', low_memory=False)

    target_nans = df.isnull().any(axis=1).sum()
    decoy_nans = df_decoy.isnull().any(axis=1).sum()
    if target_nans > 0 or decoy_nans > 0:
        print(f"Warning: NaN values detected. Dropping {target_nans} target and {decoy_nans} decoy matches.")
        df.dropna(inplace=True)
        df_decoy.dropna(inplace=True)

    if not all(df["Label"].unique() == 1):
        print("Warning: Not all target labels match expected value of 1.")

    if not all(df_decoy["Label"].unique() == -1):
        print("Warning: Not all decoy labels match expected value of -1.")

    print(f"Detected {df.shape[0]} target and {df_decoy.shape[0]} decoy matches.")
    scans = df["ScanNr"].unique()

    for num in scans:
        decoy_match = df_decoy[df_decoy["ScanNr"] == num]
        if len(decoy_match) == 0:
            continue
        if len(decoy_match) > 1:
            print("Error: multiple occurrence of a ScanNr")
            # SystemExit instead of exit(): exit() only exists when the
            # `site` module has been loaded.
            raise SystemExit(1)
        decoy_idx = decoy_match.index[0]
        decoy_match = decoy_match.iloc[0]

        # Only the first target row of a scan takes part in the competition.
        target_match = df[df["ScanNr"] == num]
        target_idx = target_match.index[0]
        target_match = target_match.iloc[0]

        # Equal peptide -> Drop decoy
        if target_match["Peptide"].replace("L", "I") == decoy_match["Peptide"].replace("L", "I"):
            df_decoy.drop(decoy_idx, inplace=True)
            continue

        # Compare score -> Keep higher scoring match
        if target_match[args.score] > decoy_match[args.score]:
            if args.update_delta_scores:
                update_delta_scores(df, target_idx, df_decoy, decoy_idx)
            df_decoy.drop(decoy_idx, inplace=True)
        else:
            if args.update_delta_scores:
                update_delta_scores(df_decoy, decoy_idx, df, target_idx)
            df.drop(target_idx, inplace=True)

    df = pd.concat([df, df_decoy], ignore_index=True)

    if args.main_features:
        features = ["charge", "similarity", "bias", "delta_similarity", "sim2", "delta_sim2", "annotation_similarity", "annotation_bias", "annotation_sim2", "delta_annotation_similarity", "peak_count_ref", "avg_bias_adjusted_similarity", "delta_avg", "abs_mass_difference", "ppm_difference", "peptide_length", "precursor_mz"]
        col = ["PSMId", "Label", "ScanNr"] + features + ["Peptide", "Proteins"]
        df = df[col]
    if args.drop_redundant_features:
        # errors="ignore": the columns are already gone when --main_features
        # selected its subset; dropping them again used to raise KeyError.
        df = df.drop(columns=["x_score", "x_score_dot"], errors="ignore")
    df.to_csv(args.output, sep="\t", index=False)

    num_targets = int((df["Label"] == 1).sum())
    num_decoys = int((df["Label"] == -1).sum())

    print(f"Files merged successfully! {num_targets} targets and {num_decoys} decoys remaining after competition.")
127 |
128 |
129 |
def main():
    """Script entry point: parse the command line, then merge the result files."""
    merge_files(parse_args())


if __name__ == "__main__":
    main()
136 |
137 |
138 | # bug fix: Remove -inf values
139 | # sed -i 's/-inf/-9999/g' yeast_td.pin
--------------------------------------------------------------------------------
/scripts/test.csv:
--------------------------------------------------------------------------------
1 |
2 | PSMId Label ScanNr charge similarity bias annotation_similarity annotation_bias avg_bias_adjusted_similarity dot_product delta_dot delta_similarity delta_annotation_similarity delta_sim2 delta_avg dot_contrast_angle similarity_contrast_angle annotation_contrast_angle mass_difference abs_mass_difference ppm_difference peptide_length precursor_mz peak_count_query peak_count_ref fragment_standard_deviation fragment_weighted_standard_deviation sim2 annotation_sim2 x_score x_score_dot x_lgamma x_lgamma_dot st_score st_score_dot Peptide Proteins
3 | 64 1 189 2 1.00149 0.220921 0.823551 0.220921 0.710924 0.873902 0.444778 0.342503 0.202489 0.304451 0.248828 0.676836 0.61602 -0.000854492 0.000854492 1.30494 11 654.814 1000 142 0.408765 0.421254 0.780238 0.641611 0.0 0.0 560.171 560.034 0.73769 0.727923 X.YGRPPDSHHSR.X Unknown
4 |
5 |
6 | charge similarity bias annotation_similarity annotation_bias avg_bias_adjusted_similarity dot_product delta_dot delta_similarity delta_annotation_similarity delta_sim2 delta_avg dot_contrast_angle similarity_contrast_angle annotation_contrast_angle mass_difference abs_mass_difference ppm_difference peptide_length precursor_mz peak_count_query peak_count_ref fragment_standard_deviation fragment_weighted_standard_deviation sim2 annotation_sim2 x_score x_score_dot x_lgamma x_lgamma_dot st_score st_score_dot m0
7 | -0.0833 -0.2382 -0.1786 2.4159 -0.1786 -0.0354 1.3627 0.3950 -0.4690 -0.1501 0.1838 0.9446 0.0305 -1.3467 -0.4678 -0.0915 -0.0932 -0.4207 0.0715 0.1952 0.0000 1.7078 0.0120 0.0322 -0.7453 0.3660 0.1577 0.1706 0.0408 0.1718 0.3820 0.4089 -0.9830
8 |
9 |
--------------------------------------------------------------------------------
/src/DefineConstants.h:
--------------------------------------------------------------------------------
#ifndef SIMPLE_EXAMPLE_DEFINECONSTANTS_H
#define SIMPLE_EXAMPLE_DEFINECONSTANTS_H

// m/z window of the fragment bins: the binary index loader sizes its bin
// table from these two values and skips peaks outside of them.
#define BIN_MIN_MZ 0 // SpectraST uses 10 here; the reason is not known (open question)
#define BIN_MAX_MZ 2000

// Precursor m/z window; the indexing manager divides it by the number of
// sub-indices to obtain the width of one sub-index.
#define STANDARD_PARENT_UPPER_MZ 1500
#define STANDARD_PARENT_LOWER_MZ 400

//#define FLOAT_OUTPUT_PRECISION std::numeric_limits::max_digits10


#endif //SIMPLE_EXAMPLE_DEFINECONSTANTS_H
14 |
--------------------------------------------------------------------------------
/src/build_index.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include "indexing_manager.h"
4 |
5 | using namespace std;
6 |
7 |
/*
 * Parse the command line of mistle-build.
 *
 * Fills `input_directories` with the space-separated entries of the "input"
 * option and stores the remaining options in `config` (build command, number
 * of threads, number of indices, output path, minimum peptide length, label).
 * Prints the help text and exits with 0 on --help; exits with 1 when input or
 * output is missing or when cxxopts reports a parsing error.
 *
 * Returns the cxxopts parse result.
 */
cxxopts::ParseResult parseArgs(int argc, const char* argv[], std::vector &input_directories, const std::shared_ptr& config) {
    try {
        // Record the full command line (arguments joined by single spaces)
        for (int i = 0; i < argc; ++i) {
            config->build_command += argv[i];
            config->build_command += " ";
        }
        // Remove the trailing space. NOTE(review): undefined on an empty string, i.e. if argc were 0.
        config->build_command.pop_back();

        cxxopts::Options options("mistle-build", "Build mistle's fragment ion index for spectral matching");

        options.positional_help("[optional args]").show_positional_help();

        options.add_options()
                ("h, help", "Print this help message")
                ("i,input", "list of input files or directories containing mass spectra (.msp format)", cxxopts::value(), "PATH")
                ("o,output", "output directory where indices will be generated", cxxopts::value(), "PATH")
                ("n,num_indices", "number of buckets the fragment ion index will be split in", cxxopts::value()->default_value("64"), "NUM")
                ("min_pep_length", "Minimum peptide length for the reference spectrum to be loaded into the index", cxxopts::value()->default_value("7"), "NUM")
                ("label", "Give the library a label (1: target; -1: decoy)", cxxopts::value()->default_value("1"), "NUM")
                ("t,threads", "number of threads (experimental)\n - 1 thread for reading, other threads for processing. Has increased RAM costs (try using more threads or GLIBC_TUNABLES=glibc.malloc.tcache_count=0 for compensation)", cxxopts::value()->default_value("1"), "NUM");

        // The first two positional arguments are taken as input and output
        options.parse_positional({"input", "output"});

        auto result = options.parse(argc,argv);


        if (result.count("help"))
        {
            std::cout << options.help() << std::endl;
            exit(0);
        }
        if (result.count("threads")) {
            config->num_build_threads = result["threads"].as();
        }
        // NOTE(review): presumably count() is 0 when only the default value applies; in that
        // case the advertised default of 64 is never stored and the configuration class
        // default is used instead - confirm against the cxxopts documentation.
        if (result.count("num_indices")) {
            config->num_indices = result["num_indices"].as();
        }
        if (result.count("input")) {
            // Parse list of input directories (separated by blank space)
            std::string dir_list = result["input"].as();
            std::string::size_type start_pos = 0;
            // NOTE(review): `auto end_pos = 0` deduces int, so the result of find() is narrowed
            // before it is compared with npos; std::string::size_type looks intended.
            // Consecutive spaces produce empty entries.
            for (auto end_pos = 0; (end_pos = dir_list.find(' ', end_pos)) != std::string::npos; ++end_pos)
            {
                input_directories.push_back(dir_list.substr(start_pos, end_pos - start_pos));
                start_pos = end_pos + 1;
            }

            // Remainder after the last space (or the whole string when there is none)
            input_directories.push_back(dir_list.substr(start_pos));
        } else {
            std::cout << "Argument Error: Missing input directory." << std::endl;
            exit(1);
        }
        if (result.count("output")) {
            config->idx_path = result["output"].as();
        } else {
            std::cout << "Argument Error: Missing output directory." << std::endl;
            exit(1);
        }
        // Read unconditionally, so the declared defaults ("7" and "1") apply here
        config->minimum_peptide_length = result["min_pep_length"].as();
        config->label = result["label"].as();


        return result;

    }
    catch (const cxxopts::OptionException& e) {
        std::cout << "Error parsing options: " << e.what() << std::endl;
        exit(1);
    }
}
78 |
79 |
/*
 * Entry point of mistle-build: parses the arguments, builds the indices via
 * indexing_manager and prints the elapsed wall-clock time.
 */
int main(int argc, const char* argv[]) {
    cout << "+++ Mistle Build +++" << endl;

    /*
     * Args
     */
    std::vector input_directories;
    std::shared_ptr config = std::make_shared();

    // Exits the process itself on --help or on invalid arguments
    parseArgs(argc, argv, input_directories, config);

    /*
     * Build indices
     */


    auto start = chrono::high_resolution_clock::now();
    indexing_manager im(input_directories, config);
    im.build_indices();
    auto stop = chrono::high_resolution_clock::now();
    // NOTE(review): the target unit of duration_cast is not visible here; the message
    // below reports seconds, so presumably chrono::seconds - confirm.
    auto duration = duration_cast(stop - start);
    cout << "Total time elapsed: " << duration.count() << " seconds" << endl;

    return 0;

}
--------------------------------------------------------------------------------
/src/configuration.cpp:
--------------------------------------------------------------------------------
1 | #include "configuration.h"
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 |
9 | unsigned int configuration::assign_to_index(float mz) {
10 | for (int i = 0; i < (num_indices - 1); ++i) {
11 | if (mz < sub_idx_limits[i]) {
12 | return i;
13 | }
14 | }
15 | return num_indices - 1;
16 | }
17 |
18 | bool configuration::save_configuration_to_file(const std::string& config_file_path) {
19 |
20 | std::ofstream f(config_file_path, std::ios::out);
21 | std::string delimiter = ";";
22 |
23 | f << "Num indices: " << num_indices << "\n";
24 | f << "Index limits: ";
25 | for (unsigned int lim : sub_idx_limits) {
26 | f << lim << delimiter;
27 | }
28 | f << "\n";
29 | f << "Label: " << label << "\n";
30 | f << "Min peptide length: " << minimum_peptide_length << "\n";
31 | f << "Build command: " << build_command << "\n";
32 | f.close();
33 | return true;
34 | }
35 |
36 | bool configuration::load_configuration_from_file(const std::string& config_file_path) {
37 |
38 | std::ifstream f(config_file_path, std::ios::in);
39 | std::string delimiter = ";";
40 |
41 | //First line
42 | std::string line;
43 | getline(f, line);
44 |
45 | if(line.rfind("Num indices: ", 0) == 0) {
46 | num_indices = std::stoi(line.substr(13, std::string::npos));
47 | } else {
48 | std::cerr << "Wrong config format" << std::endl;
49 | return false;
50 | }
51 |
52 | //Second line
53 | getline(f, line);
54 | if(line.rfind("Index limits: ", 0) == 0) {
55 | std::stringstream ss(line.substr(14, std::string::npos));
56 | std::string str;
57 | while(getline(ss, str, ';')) {
58 | sub_idx_limits.push_back(std::stoi(str));
59 | }
60 | if (sub_idx_limits.size() != num_indices - 1) {
61 | std::cerr << "Num sub idx not matching" << std::endl;
62 | return false;
63 | }
64 | } else {
65 | std::cerr << "Wrong config format" << std::endl;
66 | return false;
67 | }
68 |
69 | //Third line (Label)
70 | getline(f, line);
71 | if(line.rfind("Label: ", 0) == 0) {
72 | label = std::stoi(line.substr(7, std::string::npos));
73 | } else {
74 | label = 1;
75 | }
76 |
77 | f.close();
78 |
79 | idx_path = config_file_path.substr(0,config_file_path.rfind('/') + 1);
80 | precursor_index_path = idx_path + "precursor_idx.bin";
81 | for (int i = 0; i < num_indices; ++i) {
82 | sub_idx_file_names.push_back(idx_path + "frag_idx_" + std::to_string(i) + ".bin");
83 | }
84 |
85 |
86 |
87 | return true;
88 | }
89 |
--------------------------------------------------------------------------------
/src/configuration.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_CONFIGURATION_H
2 | #define SIMPLE_EXAMPLE_CONFIGURATION_H
3 |
4 |
5 | #include
6 | #include
7 |
8 | /*
9 | * Config class
10 | *
11 | * Handles all (meta-)information about index set-up and configuration
12 | * Can be set by arguments or loaded from file.
13 | */
14 |
class configuration {
public:

    // Directory of the index files; set from the "output" argument when building and
    // from the config file's directory (including the trailing '/') when loading.
    std::string idx_path = "";
    // Path of the precursor index; set to idx_path + "precursor_idx.bin" when loading.
    std::string precursor_index_path;
    // Number of sub-indices (buckets) the fragment ion index is split into.
    unsigned int num_indices = 24;
    // Library label stored in the config file; defaults to 1.
    int label = 1;


    // NOTE(review): sub_idx_range and minimum_peptide_length have no default initializer,
    // so they hold indeterminate values until a caller assigns them.
    unsigned int sub_idx_range;
    // Upper m/z limits separating the sub-indices (num_indices - 1 entries in a valid config file).
    std::vector sub_idx_limits;
    // File names of the fragment sub-indices ("frag_idx_<i>.bin" inside idx_path).
    std::vector sub_idx_file_names;
    unsigned int minimum_peptide_length;
    // Command line used to build the index; written to the config file.
    std::string build_command;

    //TODO parse more info and move to file_writer/reader
    // Both return false on failure; see the definitions for the file format.
    bool save_configuration_to_file(const std::string& config_file_path);
    bool load_configuration_from_file(const std::string& config_file_path);


    // Returns the number of the sub-index an m/z value falls into.
    unsigned int assign_to_index(float mz);


    /*
     * Build only
     */

    int num_build_threads = 1;

};
45 |
46 |
47 | #endif //SIMPLE_EXAMPLE_CONFIGURATION_H
48 |
--------------------------------------------------------------------------------
/src/fragment_ion_index.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include "fragment_ion_index.h"
8 | #include "DefineConstants.h"
9 | #include "settings.h"
10 |
11 | using namespace std;
12 |
13 |
14 | fragment_ion_index::fragment_ion_index() {
15 |
16 | }
17 |
18 |
19 | fragment_ion_index::fragment_ion_index(precursor_index *parent_index) {
20 |
21 | /*
22 | * todo
23 |
24 | fragment_bins = vector(BIN_MAX_MZ + 1); //TODO remove/determine actual max #bins
25 | for (int i = 0; i < parent_index->get_size(); ++i) {
26 | spectrum *c_spectrum = parent_index->get_spectrum(i);
27 |
28 |
29 | //Iterate all peaks and save them as fragments in the corresponding ion mz bin
30 | for (int j = 0; j < c_spectrum->binned_peaks.size(); ++j) {
31 | int bin = c_spectrum->binned_peaks[j];
32 | fragment frag(c_spectrum->id, c_spectrum->binned_intensities[j]);
33 | fragment_bins[bin].push_back(frag);
34 | }
35 | }
36 | */
37 | }
38 |
39 |
40 |
41 |
42 |
43 | fragment_ion_index::fragment_ion_index(string path) : file_path(path) {
44 |
45 | //load_index_from_file(file_path);
46 | load_index_from_binary_file(file_path);
47 | prepare_axv_access();
48 |
49 | }
50 |
/*
 * Sort the fragments of every bin by the rank of their parent precursor
 * (ascending), as reported by parent_index->get_rank(). Always returns true.
 */
bool fragment_ion_index::sort_index(std::unique_ptr& parent_index) {

    /*
     * Sort all bins according to parent rankings
     */

    for (fragment_bin &bin : fragment_bins) {
        // NOTE(review): the comparator takes both fragments by value, which copies them
        // (including peak_composition) on every comparison; const references would avoid that.
        sort(bin.begin(), bin.end(), [&](fragment a, fragment b){
            return parent_index->get_rank(a.parent_id) < parent_index->get_rank(b.parent_id);
        });
    }

    return true;
}
65 |
66 | bool fragment_ion_index::load_index_from_file(const std::string& path) {
67 |
68 | /*
69 | * Read index from file
70 | */
71 |
72 | ifstream f(path, ios::in);
73 | string delimiter = ";";
74 |
75 | fragment_bins.clear();
76 | fragment_bins.resize(BIN_MAX_MZ + 1);
77 |
78 | string line;
79 | while (getline(f,line)) {
80 | size_t delim_pos = line.find(delimiter);
81 | size_t delim_right_pos = line.rfind(delimiter);
82 |
83 | unsigned int id = stoi(line.substr(0, delim_pos));
84 | int mz_bin = stoi(line.substr(delim_pos + 1, delim_right_pos - delim_pos - 1));
85 | float intensity = stof(line.substr(delim_right_pos + 1, string::npos));
86 |
87 | fragment_bins[mz_bin].emplace_back(fragment(id, intensity));
88 |
89 | }
90 |
91 |
92 | f.close();
93 | return true;
94 | }
95 |
96 | bool fragment_ion_index::save_index_to_file(const string &path) {
97 |
98 | /*
99 | * TODO: OBSOLETE using binary file reader
100 | */
101 |
102 | ofstream f(path, ios::out);
103 | //f.precision(FLOAT_OUTPUT_PRECISION);
104 |
105 | string delimiter = ";";
106 |
107 | for (int i = 0; i < fragment_bins.size(); ++i) {
108 |
109 | fragment_bin bin = fragment_bins[i];
110 | for (auto & j : bin) {
111 | f << j.parent_id << delimiter << i << delimiter << j.intensity << "\n";
112 | }
113 | }
114 |
115 | f.close();
116 | return true;
117 | }
118 |
119 | bool fragment_ion_index::load_index_from_binary_file(const string &path) {
120 |
121 | /*
122 | * Read index from binary file
123 | */
124 |
125 | ifstream f(path, ios::binary | ios::in);
126 |
127 | fragment_bins.clear();
128 | fragment_bins.resize(int((BIN_MAX_MZ - BIN_MIN_MZ) / settings::bin_size) + 1);
129 |
130 | while (!f.eof()) { //TODO might not actually end the loop correctly
131 | unsigned int id;
132 | float mz;
133 | float intensity;
134 |
135 | f.read((char *) &id, sizeof(unsigned int));
136 | if (f.eof()) { //Double check
137 |
138 | break;
139 | }
140 | f.read((char *) &mz, sizeof(float));
141 | f.read((char *) &intensity, sizeof(float));
142 |
143 | if (settings::turn_off_fragment_intensities) {
144 | intensity = 1.f;
145 | }
146 |
147 | if (mz > BIN_MAX_MZ || mz < BIN_MIN_MZ) {
148 | continue;
149 | }
150 | int mz_bin = spectrum::get_mz_bin(mz);
151 |
152 | //if (BIN_MIN_MZ > 1)
153 | // std::cerr << "NEIJ: " << BIN_MIN_MZ << std::endl;
154 |
155 | // Same parent peaks falling into the same bin
156 | if (!fragment_bins[mz_bin].empty() && fragment_bins[mz_bin].back().parent_id == id) {
157 | fragment &frag = fragment_bins[mz_bin].back();
158 | if (abs(frag.mz - mz) > settings::bin_size){
159 | std::cerr << "NONO " << frag.mz << " " << mz << std::endl;
160 | exit(1);
161 | }
162 | //Track peak composition in fragment
163 | if (frag.peak_composition.empty()) {
164 | frag.peak_composition.emplace_back(frag.mz, frag.intensity);
165 | }
166 | frag.peak_composition.emplace_back(mz, intensity);
167 |
168 | if (frag.peak_composition.size() > 100) {
169 | std::cerr << "NOT LIKE THIS :( " << std::endl;
170 | }
171 | //Update overall intensity
172 | frag.intensity = sqrt(frag.intensity * frag.intensity + intensity * intensity);
173 |
174 | }
175 | else {
176 | fragment_bins[mz_bin].emplace_back(fragment(id, intensity, mz));
177 | }
178 |
179 | }
180 |
181 |
182 | f.close();
183 | return true;
184 | }
185 |
186 | bool fragment_ion_index::save_index_to_binary_file(const string &path) {
187 |
188 | ofstream f(path, ios::binary | ios::out);
189 |
190 |
191 | for (auto &bin : fragment_bins) {
192 |
193 | for (auto & j : bin) {
194 | f.write((char *) &j.parent_id, sizeof(unsigned int)); //TODO
195 | f.write((char *) &j.mz, sizeof(float));
196 | f.write((char *) &j.intensity, sizeof(float));
197 | }
198 | }
199 |
200 | f.close();
201 | return true;
202 | }
203 |
/*
 * prepare_axv_access(): rebuild frag_bins from fragment_bins.
 *
 * For every bin the intensities and parent ids are copied into flat arrays.
 * In addition, completed chunks of 16 (AVX-512) or 8 (AVX2) values are packed
 * into SIMD registers. A chunk is only packed once the element following it
 * is reached (j % N == 0 && j > 0), so the trailing elements of a bin -
 * including a complete final chunk - exist only in the flat arrays.
 * NOTE(review): presumably the search code handles that remainder separately - confirm.
 *
 * Returns true when a SIMD variant was compiled in, false otherwise.
 */
#if USE_AVX_512
bool fragment_ion_index::prepare_axv_access() {
    frag_bins.clear();
    frag_bins.resize(fragment_bins.size());
    for (int i = 0; i < fragment_bins.size(); ++i) {
        __m512 intensity_x16;
        __m512i identity_x16;
        int ranks[16]; // unused: only referenced by the commented-out rank code
        for (int j = 0; j < fragment_bins[i].size(); ++j) {
            frag_bins[i].intensities.push_back(fragment_bins[i][j].intensity);
            frag_bins[i].parent_ids.push_back(fragment_bins[i][j].parent_id);
            if(j % 16 == 0 && j > 0) {
                // Pack the 16 values preceding position j
                intensity_x16 = _mm512_loadu_ps(&frag_bins[i].intensities[j-16]);
                identity_x16 = _mm512_loadu_si512((__m256i*)&frag_bins[i].parent_ids[j-16]);
                frag_bins[i]._intensities.push_back(intensity_x16);
                frag_bins[i]._parent_ids.push_back(identity_x16);
                //frag_bins[i]._parent_ranks.push_back(_mm256_load_si256((__m256i*)& ranks));
            }
            //ranks[j % 16] = (int) precursor_idx->get_rank(fragment_bins[i][j].parent_id);
        }
        // Check that the packed arrays are aligned for their register type
        assert(reinterpret_cast(frag_bins[i]._intensities.data()) % alignof(__m512) == 0);
        assert(reinterpret_cast(frag_bins[i]._parent_ids.data()) % alignof(__m512i) == 0);
        //assert(reinterpret_cast(frag_bins[i]._parent_ranks.data()) % alignof(__m256i) == 0);
    }
    return true;
}
#elif USE_AVX_2
bool fragment_ion_index::prepare_axv_access() {
    frag_bins.clear();
    frag_bins.resize(fragment_bins.size());
    for (int i = 0; i < fragment_bins.size(); ++i) {
        __m256 intensity_x8;
        __m256i identity_x8; // unused: the parent id packing is commented out in this variant
        int ranks[8]; // unused: only referenced by the commented-out rank code
        for (int j = 0; j < fragment_bins[i].size(); ++j) {
            frag_bins[i].intensities.push_back(fragment_bins[i][j].intensity);
            frag_bins[i].parent_ids.push_back(fragment_bins[i][j].parent_id);
            if(j % 8 == 0 && j > 0) {
                // Pack the 8 intensities preceding position j
                intensity_x8 = _mm256_loadu_ps(&frag_bins[i].intensities[j-8]);
                //identity_x8 = _mm256_loadu_si256((__m256i*)&frag_bins[i].parent_ids[j-8]);
                frag_bins[i]._intensities.push_back(intensity_x8);
                //frag_bins[i]._parent_ids.push_back(identity_x8);
                //frag_bins[i]._parent_ranks.push_back(_mm256_load_si256((__m256i*)& ranks));
            }
            //ranks[j % 8] = (int) precursor_idx->get_rank(fragment_bins[i][j].parent_id);
        }
        // Check that the packed array is aligned for its register type
        assert(reinterpret_cast(frag_bins[i]._intensities.data()) % alignof(__m256) == 0);
        //assert(reinterpret_cast(frag_bins[i]._parent_ids.data()) % alignof(__m256i) == 0);
        //assert(reinterpret_cast(frag_bins[i]._parent_ranks.data()) % alignof(__m256i) == 0);
    }
    return true;
}

#else
// No SIMD support compiled in: nothing is prepared
bool fragment_ion_index::prepare_axv_access() {
    return false;
}
#endif
262 |
263 |
264 | bool fragment_ion_index::load_preliminary_index_from_binary_file(const string &path) {
265 | /*
266 | * Read index from binary file
267 | */
268 |
269 | ifstream f(path, ios::binary | ios::in);
270 |
271 | fragment_bins.clear();
272 | fragment_bins.resize(1);
273 |
274 | while (!f.eof()) { //TODO might not actually end the loop correctly
275 | unsigned int id;
276 | float mz;
277 | float intensity;
278 |
279 | f.read((char *) &id, sizeof(unsigned int));
280 | f.read((char *) &mz, sizeof(float));
281 | f.read((char *) &intensity, sizeof(float));
282 |
283 | fragment_bins[0].emplace_back(fragment(id, intensity, mz));
284 |
285 | }
286 |
287 |
288 | f.close();
289 | return true;
290 | }
--------------------------------------------------------------------------------
/src/fragment_ion_index.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_FRAGMENT_ION_INDEX_H
2 | #define SIMPLE_EXAMPLE_FRAGMENT_ION_INDEX_H
3 | #include
4 | #include
5 | #include "precursor_index.h"
6 |
7 |
// One entry of a fragment bin: a peak (or several merged peaks) of a parent spectrum.
struct fragment {
    unsigned int parent_id; // id of the precursor/spectrum the peak belongs to
    float intensity;
    float mz; // NOTE(review): left uninitialized by the two-argument constructor

    // If multiple peaks are contributing to the fragment - keep track of composition
    // (the binary index loader stores one (mz, intensity) entry per contributing peak)
    std::vector> peak_composition = {}; //

    fragment(unsigned int parent_id, float intensity) : parent_id(parent_id), intensity(intensity) {};
    fragment(unsigned int parent_id, float intensity, float mz) : parent_id(parent_id), intensity(intensity), mz(mz) {};
};
19 |
20 |
21 | typedef std::vector fragment_bin;
22 |
23 |
24 |
// Column-wise copy of one fragment bin, filled by prepare_axv_access():
// flat arrays of intensities and parent ids plus, when AVX is enabled,
// the same values packed into SIMD registers.
// NOTE(review): presumably the aligned(32) attributes only affect the vector objects
// themselves, not the heap buffers they manage - confirm whether they are needed.
struct __attribute__ ((aligned (32))) fragment_binn {
    __attribute__ ((aligned (32))) std::vector intensities;
    __attribute__ ((aligned (32))) std::vector parent_ids;
#if USE_AVX_512
    // Packed chunks of 16 values
    __attribute__ ((aligned (32))) std::vector<__m512> _intensities;
    __attribute__ ((aligned (32))) std::vector<__m512i> _parent_ids;

#elif USE_AVX_2
    // Packed chunks of 8 values (prepare_axv_access() currently fills _intensities only)
    __attribute__ ((aligned (32))) std::vector<__m256> _intensities;
    __attribute__ ((aligned (32))) std::vector<__m256i> _parent_ids;
    __attribute__ ((aligned (32))) std::vector<__m256i> _parent_ranks;
#endif
};
38 |
// Fragment ion index: fragments of all indexed spectra, grouped into m/z bins.
class fragment_ion_index {
public:

    std::shared_ptr precursor_idx;
    std::string file_path; // set by the path constructor
    std::vector fragment_bins; // one fragment_bin per m/z bin
    __attribute__ ((aligned (32))) std::vector frag_bins; // column-wise copies built by prepare_axv_access()


    fragment_ion_index();
    // Currently a placeholder that leaves the index empty
    explicit fragment_ion_index(precursor_index *parent_index);
    // Loads the binary index at `path` and prepares the vectorized copies
    explicit fragment_ion_index(std::string path);

    // Sorts every bin by the rank of the fragments' parents
    bool sort_index(std::unique_ptr& parent_index);


    // Returns false when the build has no AVX support
    bool prepare_axv_access();
    // Text format: one "parent_id;mz_bin;intensity" record per line
    bool load_index_from_file(const std::string& path);
    // Binary format: (unsigned int id, float mz, float intensity) records, binned by m/z
    bool load_index_from_binary_file(const std::string& path);
    // Same binary format, but all records are put into a single bin
    bool load_preliminary_index_from_binary_file(const std::string& path);
    bool save_index_to_file(const std::string& path);
    bool save_index_to_binary_file(const std::string& path);

};
63 |
64 |
65 | #endif //SIMPLE_EXAMPLE_FRAGMENT_ION_INDEX_H
66 |
--------------------------------------------------------------------------------
/src/index_file_reader.cpp:
--------------------------------------------------------------------------------
1 | #include "index_file_reader.h"
2 | #include
3 | #include
4 |
5 |
/*
 * Read a text precursor index into `precursor_idx`.
 *
 * Expected format: a header line "Num: <count>" followed by one record per
 * line, encoded as ID;RANK;MZ;CHARGE;PEPTIDE.
 *
 * Returns false when the first line cannot be read or is not a "Num: "
 * header. A mismatch between the announced and the actual number of
 * precursors is only reported on stderr; the function still returns true.
 * NOTE(review): stoi/stof throw on malformed fields; nothing catches that here.
 */
bool index_file_reader::read_file_into_precursor_index(const std::string &file_path,
                                                       const std::shared_ptr& precursor_idx) {


    std::ifstream f(file_path, std::ios::in);
    std::string delimiter = ";";
    std::string line;

    // Also covers a file that could not be opened
    if (!getline(f, line)) {
        return false;
    }

    if (line.rfind("Num: ", 0) != 0) {
        std::cerr << "Incorrect file format" << std::endl;
        return false;
    }

    //Read header
    // "Num: " is 5 characters long, so the count starts at offset 5
    unsigned int size = std::stoi(line.substr(5, std::string::npos)); //TODO check 4 or 5
    precursor_idx->set_size(size);

    // Parse precursors line by line
    //precursor_idx->add_precursor_record(p);
    while (getline(f, line)) {

        // Field 1: id (everything before the first delimiter)
        size_t delim_pos = line.find(delimiter);
        unsigned int id = std::stoi(line.substr(0, delim_pos));

        // `length` is the distance from the current delimiter to the next one,
        // so each field spans length - 1 characters starting at delim_pos + 1
        size_t length = line.find(delimiter, delim_pos + 1) - delim_pos;
        unsigned int rank = std::stoi(line.substr(delim_pos + 1, length - 1));

        delim_pos = delim_pos + length;
        length = line.find(delimiter, delim_pos + 1) - delim_pos;
        float mz = std::stof(line.substr(delim_pos + 1, length - 1));

        delim_pos = delim_pos + length;
        length = line.find(delimiter, delim_pos + 1) - delim_pos;
        int charge = std::stoi(line.substr(delim_pos + 1, length - 1));
        // Last field: the peptide is the remainder of the line
        std::string peptide = line.substr(delim_pos + length + 1, std::string::npos);

        precursor_idx->add_precursor_record(precursor(id, rank, mz, charge, peptide));
    }
    if (precursor_idx->get_size() != size) {
        std::cerr << "Wrong number of precursors" << std::endl;
    }

    return true;
}
54 |
--------------------------------------------------------------------------------
/src/index_file_reader.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_INDEX_FILE_READER_H
2 | #define SIMPLE_EXAMPLE_INDEX_FILE_READER_H
3 |
4 |
5 | #include
6 | #include "precursor_index.h"
7 |
// Static helper for reading index files.
class index_file_reader {
public:
    // Parses the text precursor index at `file_path` ("Num: <count>" header followed by
    // ID;RANK;MZ;CHARGE;PEPTIDE lines) into `precursor_idx`; returns false on a missing or invalid header.
    static bool read_file_into_precursor_index(const std::string &file_path, const std::shared_ptr& precursor_idx);
};
12 |
13 |
14 | #endif //SIMPLE_EXAMPLE_INDEX_FILE_READER_H
15 |
--------------------------------------------------------------------------------
/src/index_file_writer.cpp:
--------------------------------------------------------------------------------
1 | #include "index_file_writer.h"
2 | #include "DefineConstants.h"
3 | #include
4 | #include
5 |
6 |
/*
 * Append the binned peaks of `spec` to the open text stream `f`, one
 * "parent_id;bin;intensity" line per peak. Always returns true.
 */
bool index_file_writer::stream_peaks_to_file(std::fstream &f, unsigned int parent_id, const std::shared_ptr& spec) {

    std::string delimiter = ";";
    // assumes binned_intensities has at least as many entries as binned_peaks - TODO confirm
    for (int i = 0; i < spec->binned_peaks.size(); ++i) {
        int bin = spec->binned_peaks[i];
        float intensity = spec->binned_intensities[i];

        f << parent_id << delimiter << bin << delimiter << intensity << "\n";

    }


    return true;
}
21 |
/*
 * Write the precursors as text to `file_path`: a "Num: <count>" header
 * followed by one ID;RANK;MZ;CHARGE;PEPTIDE line per precursor.
 * Always returns true; the stream state is not checked.
 */
bool index_file_writer::save_precursor_index(const std::string& file_path, std::vector &precursors) {

    std::ofstream f(file_path, std::ofstream::out);
    std::string delimiter = ";";
    //f.precision(FLOAT_OUTPUT_PRECISION);

    //Have num precursors as header (needed for efficient parsing)
    f << "Num: " << precursors.size() << "\n";

    /*
     * ENCODING: ID;RANK;MZ;CHARGE;PEPTIDE
     */
    for (precursor &p : precursors) {
        f << p.id << delimiter << p.rank << delimiter << p.mz << delimiter << p.charge << delimiter << p.peptide << "\n";
    }

    f.close();

    return true;
}
42 |
/*
 * Write the matches as a ';'-separated table to `file_path`.
 *
 * Columns: spectrum;match;peptide;dot-product;mass-difference.
 * Returns false when the file cannot be opened, true otherwise.
 */
bool index_file_writer::save_matches_to_file(const std::string &file_path, std::vector &matches) {
    std::fstream outfile;
    std::string delimiter = ";";


    outfile.open(file_path, std::ios::out);
    if (!outfile.good())
        return false;

    // Add header
    outfile << "spectrum"+delimiter+"match"+delimiter+"peptide"+delimiter+"dot-product"+delimiter+"mass-difference\n";

    // Go through matches and parse relevant information for each
    for (int i = 0; i < matches.size(); ++i) {
        // NOTE(review): copies the match on every iteration; a const reference would do
        match psm = matches[i];
        //TODO
        outfile << psm.query_spectrum->name << delimiter << psm.matched_spectrum->name << delimiter << psm.matched_spectrum->peptide << delimiter << psm.dot_product << delimiter << psm.mass_difference << "\n";
    }

    outfile.close();
    return true;
}
65 |
/*
 * Append the raw peaks of `spec` to the open binary stream `f` as
 * (unsigned int parent id, float m/z, float intensity) records.
 * Always returns true.
 */
bool index_file_writer::stream_peaks_to_binary_file(std::fstream &f, unsigned int parent_id,
                                                    const std::shared_ptr &spec) {
    // assumes peak_positions has at least as many entries as intensities - TODO confirm
    for (int i = 0; i < spec->intensities.size(); ++i) {
        float mz = spec->peak_positions[i];
        float intensity = spec->intensities[i];

        f.write((char *) &parent_id, sizeof(unsigned int));
        f.write((char *) &mz, sizeof(float));
        f.write((char *) &intensity, sizeof(float));

    }


    return true;
}
81 |
/*
 * Write the precursors to a binary file: the number of precursors first,
 * then one record per precursor (see ENCODING below).
 * Always returns true; the stream state is not checked.
 */
bool index_file_writer::save_precursor_index_to_binary_file(const std::string &file_path,
                                                            std::vector &precursors) {

    std::fstream f(file_path, std::ios::binary | std::ofstream::out);


    //Save #of precursors as first element (needed for efficient parsing)
    unsigned int size = precursors.size();
    f.write((char *) &size, sizeof(unsigned int));

    /*
     * ENCODING: ID;RANK;MZ;CHARGE;SIZE_OF_PEPTIDE;PEPTIDE (No semi-colons in binary)
     */
    for (precursor &p : precursors) {
        f.write((char *) &p.id, sizeof(unsigned int));
        f.write((char *) &p.rank, sizeof(unsigned int));
        f.write((char *) &p.mz, sizeof(float));
        f.write((char *) &p.charge, sizeof(int));
        // The peptide length is stored as a size_t, so the record layout depends on
        // the width of size_t on the writing platform
        size_t pep_size = p.peptide.size();
        f.write((char *) &pep_size, sizeof(pep_size));
        // Peptide characters without a terminating null byte
        f.write(p.peptide.c_str(), pep_size);
    }

    f.close();

    return true;
}
109 |
--------------------------------------------------------------------------------
/src/index_file_writer.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_INDEX_FILE_WRITER_H
2 | #define SIMPLE_EXAMPLE_INDEX_FILE_WRITER_H
3 |
4 |
5 | #include
6 | #include "spectrum.h"
7 | #include "precursor_index.h"
8 | #include "match.h"
9 |
10 | class index_file_writer {
11 | public:
12 |
13 | //TODO static std::string delimiter;
14 |
15 | static bool stream_peaks_to_file(std::fstream &f, unsigned int parent_id, const std::shared_ptr& spec);
16 | static bool stream_peaks_to_binary_file(std::fstream &f, unsigned int parent_id, const std::shared_ptr& spec);
17 | static bool save_precursor_index(const std::string& file_path, std::vector &precursors);
18 | static bool save_precursor_index_to_binary_file(const std::string& file_path, std::vector &precursors);
19 | static bool save_matches_to_file(const std::string& file_path, std::vector &matches);
20 | };
21 |
22 |
23 | #endif //SIMPLE_EXAMPLE_INDEX_FILE_WRITER_H
24 |
--------------------------------------------------------------------------------
/src/indexing_manager.cpp:
--------------------------------------------------------------------------------
1 | #include "indexing_manager.h"
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include "msp_reader.h"
7 | #include "mgf_reader.h"
8 | #include "index_file_writer.h"
9 | #include "DefineConstants.h"
10 | #include "fragment_ion_index.h"
11 |
12 | using namespace std;
13 |
14 | indexing_manager::indexing_manager() {
15 | cout << "Empty Constructor not in use" << endl;
16 | exit(1);
17 | }
18 |
19 | indexing_manager::indexing_manager(string path) {
20 | cout << "! Using IndexingManager without config parameter is deprecated" << endl;
21 | exit(1);
22 | /*
23 | * Init
24 | * TODO delete (eventually)
25 | */
26 |
27 | precursorIndex = make_unique();
28 |
29 | config->sub_idx_range = (STANDARD_PARENT_UPPER_MZ - STANDARD_PARENT_LOWER_MZ) / config->num_indices;
30 | for (int i = 1; i < config->num_indices; ++i) { //Starting from 1
31 | cout << "LIMIT: " << STANDARD_PARENT_LOWER_MZ + config->sub_idx_range * i << endl;
32 | config->sub_idx_limits.push_back(STANDARD_PARENT_LOWER_MZ + config->sub_idx_range * i);
33 | }
34 |
35 |
36 | for (const auto & entry : std::filesystem::directory_iterator(path)) {
37 | if (entry.path().extension() == ".msp") {
38 | lib_files.push_back(entry);
39 | }
40 | }
41 | }
42 |
43 |
44 | indexing_manager::indexing_manager(std::vector &input_paths, std::shared_ptr config) : input_paths(input_paths), config(config) {
45 |
46 | /*
47 | * Init
48 | */
49 |
50 | precursorIndex = make_unique();
51 | pool = std::make_shared(config->num_build_threads - 1);
52 |
53 | config->sub_idx_range = (STANDARD_PARENT_UPPER_MZ - STANDARD_PARENT_LOWER_MZ) / config->num_indices;
54 | for (int i = 1; i < config->num_indices; ++i) { //Starting from 1
55 | //cout << "LIMIT: " << STANDARD_PARENT_LOWER_MZ + config->sub_idx_range * i << endl;
56 | config->sub_idx_limits.push_back(STANDARD_PARENT_LOWER_MZ + config->sub_idx_range * i);
57 | }
58 |
59 |
60 | for (std::string &path : input_paths) {
61 |
62 | if (!std::filesystem::exists(path)) {
63 | std::cerr << "Bad file" << std::endl;
64 | std::cerr << path << " is broken or does not exist!" << std::endl;
65 | exit(1);
66 | }
67 |
68 | if (std::filesystem::is_directory(path)) {
69 | for (const auto & entry : std::filesystem::directory_iterator(path)) {
70 | if (entry.path().extension() == ".msp") {
71 | lib_files.push_back(entry.path());
72 | file_format = MSP;
73 | } else if (entry.path().extension() == ".mgf") {
74 | lib_files.push_back(entry.path());
75 | file_format = MGF;
76 | }
77 | }
78 | } else {
79 | std::filesystem::path file_path = path;
80 | if (file_path.extension() == ".msp") {
81 | file_format = MSP;
82 | lib_files.push_back(file_path);
83 | } else if (file_path.extension() == ".mgf") {
84 | file_format = MGF;
85 | lib_files.push_back(file_path);
86 | } else {
87 | std::cerr << "Unsupported file extension" << std::endl;
88 | std::cerr << file_path << " file is not supported" << std::endl;
89 | exit(1);
90 | }
91 | }
92 |
93 | }
94 | if (!std::filesystem::exists(config->idx_path) || !std::filesystem::is_directory(config->idx_path)) {
95 | std:cerr << "Bad output directory" << std::endl;
96 | std::cerr << config->idx_path << " is not a directory or does not exist!" << std::endl;
97 | exit(1);
98 | }
99 | if (!config->idx_path.ends_with('/')) {
100 | config->idx_path += "/";
101 | }
102 |
103 | }
104 |
105 |
106 | bool indexing_manager::build_indices() {
107 |
108 | /*
109 | * Prepare in/output
110 | */
111 | set_up_output_streams();
112 |
113 |
114 | /*
115 | * Parsing files and creating preliminary indices
116 | */
117 |
118 | cout << "Parsing "<< lib_files.size() <<" library files ..." << endl;
119 | auto start = chrono::high_resolution_clock::now();
120 | for (int i = 0; i < lib_files.size(); ++i) {
121 | //cout << "Parsing library file no. " << i << " (" << lib_files[i].path().filename() << ")" << endl;
122 | parse_file(i); //TODO has multi-threading (experimental)
123 | }
124 | if (pool->get_size() > 0) {
125 | std::cout << "Waiting for threads to finish processing" << std::endl;
126 | pool->add_thread(); //Have "main" thread help out with the computation
127 | pool->wait_for_all_threads();
128 | }
129 | auto stop = chrono::high_resolution_clock::now();
130 | auto duration = duration_cast(stop - start);
131 | cout << "Loading Time: " << duration.count() << " seconds" << endl;
132 |
133 | /*
134 | * Storing and rebuilding indices
135 | */
136 |
137 | //Precursor index
138 | cout << "Sorting precursors index" << endl;
139 | precursorIndex->sort_index();
140 | cout << "Saving ..." << endl;
141 | //precursorIndex->save_index_to_file(config->idx_path + "precursor_idx.csv");
142 | precursorIndex->save_index_to_binary_file(config->idx_path + "precursor_idx.bin");
143 |
144 | config->save_configuration_to_file(config->idx_path + "config.txt");
145 |
146 | //Closing output streams and reopening them as input streams
147 | for (int i = 0; i < output_streams.size(); ++i) {
148 | output_streams[i].close();
149 | }
150 |
151 | cout << "Sorting fragment ion indices" << endl;
152 | for (int i = 0; i < config->sub_idx_file_names.size(); ++i) {
153 | string file_name = config->idx_path + config->sub_idx_file_names[i];
154 |
155 | fragment_ion_index frag_index;
156 | frag_index.load_preliminary_index_from_binary_file(file_name);
157 | frag_index.sort_index(precursorIndex);
158 | frag_index.save_index_to_binary_file(file_name);
159 | }
160 | cout << "Done" << endl;
161 |
162 | return true;
163 | }
164 |
165 | bool indexing_manager::set_up_output_streams() {
166 |
167 | for (int i = 0; i < config->num_indices; ++i) {
168 | string file_name = config->idx_path + "frag_idx_" + to_string(i) + ".bin";
169 | //cout << file_name << endl;
170 | config->sub_idx_file_names.push_back("frag_idx_" + to_string(i) + ".bin");
171 | output_streams.emplace_back(fstream(file_name, std::ios::binary | std::ofstream::out));
172 | }
173 |
174 |
175 | return true;
176 | }
177 |
178 | bool indexing_manager::parse_file(unsigned int file_num) {
179 | string file_path = lib_files[file_num].string();
180 |
181 | ifstream f(file_path, ios::in);
182 | //f.precision(FLOAT_OUTPUT_PRECISION);
183 |
184 | string buffer;
185 |
186 | /*
187 | * Main loop reading library and creating preliminary indices on the fly
188 | */
189 |
190 | while (!f.eof()) {
191 |
192 | //Read spectrum from file and pre-processing
193 | bool read_successfully = false;
194 | if (file_format == MSP) {
195 | read_successfully = msp_reader::read_next_entry_into_buffer(f, buffer);
196 | } else if (file_format == MGF) {
197 | read_successfully = mgf_reader::read_next_entry_into_buffer(f, buffer);
198 | }
199 | if (!read_successfully)
200 | continue;
201 |
202 | auto read_and_stream = [this, buffer]() {
203 |
204 | shared_ptr tmp_spectrum;
205 | if (file_format == MSP) {
206 | tmp_spectrum = msp_reader::read_spectrum_from_buffer(buffer);
207 | } else if (file_format == MGF) {
208 | tmp_spectrum = mgf_reader::read_spectrum_from_buffer(buffer);
209 | }
210 |
211 | if (tmp_spectrum->peptide.length() < config->minimum_peptide_length) {
212 | return;
213 | }
214 |
215 |
216 | //Lock for recording and streaming
217 | std::lock_guard guard(pool->mtx);
218 |
219 | //Save bookmark in precursor index
220 | precursor &bookmark = precursorIndex->record_new_precursor(tmp_spectrum);
221 |
222 | //Stream (binned) peaks into corresponding sub-index file
223 |
224 | unsigned int idx_num = config->assign_to_index(bookmark.mz);
225 | index_file_writer::stream_peaks_to_binary_file(output_streams[idx_num], bookmark.id, tmp_spectrum);
226 | //std::cout << bookmark.mz << std::endl;
227 |
228 | };
229 |
230 |
231 | if (pool->get_size() > 0) {
232 | pool->enqueue(read_and_stream);
233 | } else {
234 | read_and_stream();
235 | }
236 | }
237 |
238 | return true;
239 | }
240 |
241 | bool indexing_manager::parse_file_buffered(unsigned int file_num) {
242 | string file_path = lib_files[file_num].string();
243 |
244 | ifstream f(file_path, ios::in);
245 | //f.precision(FLOAT_OUTPUT_PRECISION); //TODO
246 |
247 | unsigned int buffer_size = 40960;//1048576; //Byte //TODO fix if too small
248 | unsigned int carryover_pos = 0;
249 | string buffer;
250 | buffer.resize(buffer_size);
251 |
252 | /*
253 | * Main loop reading library and creating preliminary indices on the fly
254 | */
255 | while (!f.eof()) {
256 |
257 | //Read large char buffer
258 | f.read(&buffer[carryover_pos], buffer_size - carryover_pos);
259 | unsigned int last_pos = buffer.rfind("Name:");
260 | unsigned int current_pos = buffer.find("Name:");
261 |
262 | // Parse spectra within the buffer
263 | while (current_pos != last_pos) {
264 | unsigned int next_pos = buffer.find("Name:", current_pos + 1);
265 | shared_ptr tmp_spectrum = msp_reader::read_spectrum_from_buffer(buffer.substr(current_pos, next_pos - current_pos));
266 |
267 | //Save bookmark in precursor index
268 | precursor &bookmark = precursorIndex->record_new_precursor(tmp_spectrum);
269 |
270 | //Stream (binned) peaks into corresponding sub-index file
271 | unsigned int idx_num = config->assign_to_index(bookmark.mz);
272 | index_file_writer::stream_peaks_to_binary_file(output_streams[idx_num], bookmark.id, tmp_spectrum);
273 | current_pos = next_pos;
274 | }
275 |
276 | carryover_pos = buffer_size - current_pos;
277 | buffer.replace(0, carryover_pos, buffer.substr(current_pos, std::string::npos)); //keep ms2 carryover in the buffer
278 | current_pos = 0; //TODO why is this 0 in the first place
279 | //buffer = buffer.substr(current_pos, std::string::npos));
280 | //buffer.resize(buffer_size);
281 | }
282 | return true;
283 | }
284 |
--------------------------------------------------------------------------------
/src/indexing_manager.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_INDEXING_MANAGER_H
2 | #define SIMPLE_EXAMPLE_INDEXING_MANAGER_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include "configuration.h"
9 | #include "precursor_index.h"
10 | #include "thread_pool.h"
11 |
// Library file formats the indexing manager distinguishes by file extension
enum FILE_FORMAT {
    MSP, // ".msp" files
    MGF  // ".mgf" files
};
15 |
16 | class indexing_manager {
17 |
18 | std::vector input_paths;
19 | std::vector lib_files;
20 | FILE_FORMAT file_format;
21 |
22 |
23 | //Precursor Index
24 | std::unique_ptr precursorIndex;
25 |
26 | /*
27 | * (Sub-) Indices
28 | */
29 |
30 | std::shared_ptr config = std::make_shared();
31 | std::vector output_streams;
32 |
33 |
34 | /*
35 | * Threading
36 | */
37 |
38 | std::shared_ptr pool;
39 |
40 | public:
41 | indexing_manager();
42 | explicit indexing_manager(std::string path);
43 | indexing_manager(std::vector &input_paths, std::shared_ptr config);
44 |
45 |
46 | bool build_indices();
47 | bool set_up_output_streams();
48 | bool parse_file(unsigned int file_num);
49 | bool parse_file_buffered(unsigned int file_num);
50 |
51 |
52 |
53 |
54 | };
55 |
56 |
57 | #endif //SIMPLE_EXAMPLE_INDEXING_MANAGER_H
58 |
--------------------------------------------------------------------------------
/src/library.cpp:
--------------------------------------------------------------------------------
1 | #include "library.h"
2 |
3 | #include
4 | #include
5 | #include
6 | #include "msp_reader.h"
7 | #include "mgf_reader.h"
8 | #include "settings.h"
9 |
10 | using namespace std;
11 |
12 | library::library() {
13 |
14 | }
15 |
16 | library::~library() {
17 | /*for (int i = 0; i < spectrum_list.size(); ++i) {
18 | delete spectrum_list[i];
19 | }*/
20 | spectrum_list.clear();
21 | }
22 |
23 |
24 | library::library(string &path) {
25 | if (path[path.length() - 1] == '/' || path[path.length() - 1] == '\\') {
26 | cout << "Loading library from directory:" << endl;
27 | load_library_from_directory(path);
28 | }
29 | cout << "Loading library from single file:" << endl;
30 | load_spectra_from_file(path);
31 | }
32 |
33 | bool library::construct(string &path) {
34 | if (path[path.length() - 1] == '/' || path[path.length() - 1] == '\\') {
35 | cout << "Loading library from directory:" << endl;
36 | load_library_from_directory(path);
37 | }
38 | cout << "Loading library from single file:" << endl;
39 | load_spectra_from_file(path);
40 |
41 | return true;
42 | }
43 |
44 | bool library::load_library_from_directory(string &path) {
45 |
46 | if (settings::load_batches) {
47 | cerr << "Warning: Batch search for multiple files not fully implemented" << endl;
48 | exit(1);
49 | }
50 | for (const auto & entry : std::filesystem::directory_iterator(path)) {
51 | load_spectra_from_file(entry.path().string());
52 | //cout << spectrum_list.size() << endl;
53 | }
54 | return true;
55 | }
56 |
57 | bool library::load_spectra_from_file(string path) {
58 |
59 | string extension = path.substr(path.rfind('.') + 1, string::npos);
60 | cout << "Loading spectra from file: " << path << endl;
61 | if (extension == "msp") {
62 | if (!msp_reader::read_file(path, spectrum_list)) {
63 | cout << "Error reading file: " << path << endl;
64 | return false;
65 | }
66 | }
67 | else if (extension == "mgf") {
68 | if (settings::load_batches) {
69 | file_stream.open(path, ios::in);
70 | last_batch = mgf_reader::read_file_batch(file_stream, spectrum_list, settings::batch_size);
71 | }
72 | else if (!mgf_reader::read_file(path, spectrum_list)) {
73 | cout << "Error reading file: " << path << endl;
74 | return false;
75 | }
76 | }
77 | else {
78 | cout << "Unknown file extension" << endl;
79 | return false;
80 | }
81 | std::cout << "\t" << spectrum_list.size() << " scans loaded" << std::endl;
82 |
83 | return true;
84 | }
85 |
86 |
87 |
88 |
89 | bool library::build_library_index() {
90 | cout << "Building precursor index" << endl;
91 | precursor_idx = new precursor_index();
92 | //TODO refactoring get up to date
93 | cout << "Building fragment ion index" << endl;
94 | fragment_ion_idx = new class fragment_ion_index(precursor_idx);
95 | is_indexed = true;
96 | return false;
97 | }
98 |
99 | library::library(vector> &spectra) {
100 | spectrum_list = spectra;
101 | }
102 |
103 | bool library::load_next_batch() {
104 | last_batch = mgf_reader::read_file_batch(file_stream, spectrum_list, settings::batch_size);
105 | return last_batch;
106 | }
107 |
108 |
--------------------------------------------------------------------------------
/src/library.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_LIBRARY_H
2 | #define SIMPLE_EXAMPLE_LIBRARY_H
3 | #include
4 | #include
5 | #include "spectrum.h"
6 | #include "precursor_index.h"
7 | #include "fragment_ion_index.h"
8 |
9 |
10 | class library {
11 | public:
12 | library();
13 | library(std::string &path);
14 | library(std::vector> &spectra);
15 | bool construct(std::string &path);
16 | ~library();
17 |
18 | bool load_spectra_from_file(std::string path);
19 | bool load_library_from_directory(std::string &path);
20 | bool load_next_batch();
21 |
22 | bool build_library_index();
23 |
24 | std::vector> spectrum_list;
25 | precursor_index* precursor_idx;
26 | fragment_ion_index* fragment_ion_idx;
27 | bool is_indexed = false;
28 | bool last_batch = true;
29 | private:
30 | std::fstream file_stream;
31 |
32 | };
33 |
34 |
35 | #endif //SIMPLE_EXAMPLE_LIBRARY_H
36 |
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "spectrum.h"
6 | #include "msp_reader.h"
7 | #include "scores.h"
8 | #include "spectral_search.h"
9 | #include "library.h"
10 | //#include
11 | //#include
12 |
13 | //using namespace OpenMS;
14 | //using namespace OpenMSExternal;
15 |
16 | using namespace std;
17 |
18 | int main() {
19 | cout << "Welcome, welcome" << endl;
20 | //FeatureMap fm;
21 | //Feature feature;
22 | //PeakSpectrum p;
23 |
24 | //fm.push_back(feature);
25 |
26 |
27 | //string msp_file = "/home/ynowatzk/data/pyro_fur/PyroFur_reproduced.msp";
28 | string msp_file = "/home/ynowatzk/data/9MM/msp/";
29 | string mgf_file = "/home/ynowatzk/data/9MM/mgf/9MM_FASP.mgf"; //
30 |
31 |
32 | //library *search_lib = new library(mgf_file);
33 |
34 | auto start = chrono::high_resolution_clock::now();
35 | library *lib = new library(msp_file);
36 | auto stop = chrono::high_resolution_clock::now();
37 | auto duration = duration_cast(stop - start);
38 | cout << "Loading Time: " << duration.count() << " seconds" << endl;
39 |
40 | lib->build_library_index();
41 |
42 |
43 | /*
44 | * SEARCH
45 | */
46 |
47 | //TODO recosntruct
48 |
49 | //spectral_search search(search_lib, lib);
50 |
51 | /*
52 | //Rescoring of spectrast results
53 | cout << "Reading in" << endl;
54 | search.read_results_from_file(R"(C:\Users\ynowatzk\Desktop\data\pyrococcus_furiosus\results\reproduced\sp5_neighbors_matches.tsv)");
55 | cout << "Rescoring" << endl;
56 | search.rescore_matches();
57 | cout << "Saving" << endl;
58 | search.save_results_to_file(R"(C:\Users\ynowatzk\Desktop\data\pyrococcus_furiosus\results\reproduced\sp5_neighbors_rescored.tsv)");
59 | */
60 |
61 |
62 | //cout << "Searching fragment ion index" << endl;
63 | //start = chrono::high_resolution_clock::now();
64 | //search.search_target_library();
65 | //stop = chrono::high_resolution_clock::now();
66 | //duration = duration_cast(stop - start);
67 |
68 | //cout << "Search Time: " << duration.count() << " seconds" << endl;
69 | //search.save_results_to_file("FIIndex.csv");
70 | //exit(12);
71 |
72 | /*
73 | auto start = chrono::high_resolution_clock::now();
74 | search.search_target_library();
75 | auto stop = chrono::high_resolution_clock::now();
76 | auto duration = duration_cast(stop - start);
77 |
78 | cout << "Search Time: " << duration.count() << " seconds" << endl;
79 |
80 |
81 | vector matches = search.get_results();
82 |
83 | for (int i = 0; i < 10; ++i) {
84 | cout << matches[i].query_spectrum->name << " " << matches[i].matched_spectrum->peptide << " " << matches[i].dot_product << endl;
85 | }
86 |
87 | search.save_results_to_file("./out.csv");
88 |
89 | */
90 | return 0;
91 | }
92 |
--------------------------------------------------------------------------------
/src/match.cpp:
--------------------------------------------------------------------------------
1 | #include "match.h"
2 | #include
3 | #include
4 |
5 | using namespace std;
6 |
7 | match::match() {
8 |
9 | }
10 |
11 | match::match(unsigned int query_id, unsigned int target_id) : query_id(query_id), target_id(target_id) {
12 |
13 | }
14 |
15 | /*
16 | * Deprecated constructors
17 | */
18 |
19 | match::match(std::shared_ptr search_spectrum, std::shared_ptr matched_spectrum) : query_spectrum(search_spectrum), matched_spectrum(matched_spectrum) {
20 | mass_difference = matched_spectrum->precursor_mass - search_spectrum->precursor_mass;
21 | }
22 |
23 | match::match(std::shared_ptr search_spectrum, std::shared_ptr matched_spectrum, float dot_product, int hit_rank) : query_spectrum(search_spectrum), matched_spectrum(matched_spectrum), dot_product(dot_product), hit_rank(hit_rank) {
24 | mass_difference = matched_spectrum->precursor_mass - search_spectrum->precursor_mass;
25 | }
26 |
27 | match::match(unsigned int query_id, unsigned int target_id, float dot_product, float mass_difference, int hit_rank) : query_id(query_id), target_id(target_id), dot_product(dot_product), mass_difference(mass_difference), hit_rank(hit_rank) {
28 |
29 | }
30 |
31 | match::match(unsigned int query_id, unsigned int target_id, float similarity_score, float dot_product,
32 | float mass_difference, int hit_rank) : query_id(query_id), target_id(target_id), similarity(similarity_score), dot_product(dot_product), mass_difference(mass_difference), hit_rank(hit_rank) {
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/src/match.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_MATCH_H
2 | #define SIMPLE_EXAMPLE_MATCH_H
3 |
4 | #include "spectrum.h"
5 | #include
6 |
7 | class match {
8 | public:
9 | std::shared_ptr query_spectrum;
10 | std::shared_ptr matched_spectrum;
11 |
12 | unsigned int query_id;
13 | unsigned int target_id;
14 |
15 | float mass_difference;
16 | float abs_mass_difference;
17 | float ppm_difference;
18 | unsigned int charge;
19 | std::vector isomers; //tracking homologous peptides
20 |
21 | // Default scores
22 | float similarity;
23 | float bias;
24 | float dot_product;
25 |
26 |
27 | //Annotation scores
28 | float annotation_similarity;
29 | float annotation_bias;
30 | float annotation_sim2;
31 |
32 | //Contrast angles
33 | float dot_contrast_angle;
34 | float similarity_contrast_angle;
35 | float annotation_contrast_angle;
36 |
37 | //Additional score
38 | float delta_dot;
39 | float delta_similarity;
40 | float delta_annotation_sim;
41 | float delta_sim2;
42 | int peak_count_query;
43 | int peak_count_target;
44 | float peak_mz_standard_deviation;
45 | float peak_mz_weighted_standard_deviation;
46 |
47 | //Advanced scores
48 | float avg_bias_adj_similarity;
49 | float delta_avg;
50 | float sim2;
51 | float spectraST_score;
52 | float spectraST_score_dot;
53 | double x_hunter_score; //all peaks, not just top 20
54 | double x_lgamma;
55 | double x_hunter_score_dot; //all peaks, not just top 20
56 | double x_lgamma_dot;
57 |
58 | int hit_rank;
59 | int query_index; //todo implement
60 | int num_matched_peaks;
61 |
62 | match();
63 | match(unsigned int query_id, unsigned int target_id);
64 |
65 | /*
66 | * Deprecated constructors
67 | */
68 | match(std::shared_ptr search_spectrum, std::shared_ptr matched_spectrum);
69 | match(std::shared_ptr search_spectrum, std::shared_ptr matched_spectrum, float dot_product, int hit_rank);
70 | match(unsigned int query_id, unsigned int target_id, float dot_product, float mass_difference, int hit_rank);
71 | match(unsigned int query_id, unsigned int target_id, float similarity_score, float dot_product, float mass_difference, int hit_rank);
72 |
73 | };
74 |
75 |
76 | #endif //SIMPLE_EXAMPLE_MATCH_H
77 |
--------------------------------------------------------------------------------
/src/mgf_reader.cpp:
--------------------------------------------------------------------------------
1 | #include "mgf_reader.h"
2 | #include
3 | #include
4 | #include
5 | #include "settings.h"
6 |
7 | using namespace std;
8 |
9 | mgf_reader::mgf_reader() {
10 |
11 | }
12 |
13 | bool mgf_reader::read_file(string path, vector> &output_spectra) {
14 |
15 | fstream infile;
16 |
17 | infile.open(path, ios::in);
18 | if (!infile) {
19 | return false;
20 | }
21 |
22 | std::shared_ptr c_spectrum = std::make_shared();
23 | while (!infile.eof()) {
24 | string line;
25 | getline(infile, line);
26 |
27 | if (line == "END IONS") {
28 | // Post-process and save the current spectrum
29 | //c_spectrum->intensity_bin_spanning_factor = -1.f; //TODO figure out if neighbor_spanning here
30 | //c_spectrum->bin_peaks(true,true); //TODO comment out
31 | if (settings::apply_topX_in_window_denoising)
32 | c_spectrum->denoise_mz_window(settings::peaks_per_window, settings::window_size); //TODO this exists only for .mgf search file spectra
33 | c_spectrum->bin_peaks_sparse(true, true);
34 | c_spectrum->root_scale_intensities();
35 | c_spectrum->normalize_intensities(); //TODO put into one somehow
36 | output_spectra.push_back(c_spectrum);
37 | continue;
38 | }
39 |
40 | if (line == "BEGIN IONS") {
41 | c_spectrum = std::make_shared();
42 | continue;
43 | }
44 |
45 | // split up line to identify comment tags
46 | string tag, value;
47 | size_t separator_pos = line.find('=');
48 |
49 | if (separator_pos != string::npos) {
50 | tag = line.substr(0, separator_pos);
51 | value = line.substr(separator_pos + 1, string::npos);
52 |
53 | //parse information
54 | if (tag == "TITLE") {
55 | c_spectrum->name = value;
56 | } else if (tag == "PEPMASS") {
57 | c_spectrum->precursor_mass = stof(value); //todo check if that's actually true
58 | } else if (tag == "RTINSECONDS") {
59 |
60 | } else if (tag == "CHARGE") {
61 | c_spectrum->charge = stoi(value);
62 | }
63 | }
64 | else { // TODO make sure this works with long numbers. Otherwise implement like in msp reader
65 | // No separator: Assume peak information is noted down in the line
66 |
67 |
68 | if (line.empty())
69 | continue;
70 |
71 | std::size_t space_pos = line.find(' ');
72 | if (space_pos == string::npos)
73 | continue;
74 |
75 |
76 | float pos = stof(line.substr(0, space_pos));
77 | c_spectrum->peak_positions.push_back(pos);
78 |
79 |
80 | float intensity = stof(line.substr(space_pos, string::npos));
81 | c_spectrum->intensities.push_back(intensity);
82 |
83 |
84 | /*
85 | * alternatively run (but it is slower)
86 | istringstream ss(line);
87 | float pos, intensity;
88 | ss >> pos >> intensity;
89 | */
90 | }
91 | }
92 | infile.close();
93 | return true;
94 | }
95 |
96 | bool mgf_reader::read_next_entry_into_buffer(ifstream &f, string &buffer) { //TODO
97 | /*
98 | * Requires open filestream and reads until end of a new entry
99 | */
100 |
101 | buffer.clear();
102 |
103 | string line;
104 |
105 | if(!getline(f, line)) {
106 | return false;
107 | }
108 |
109 | if (line == "BEGIN IOONS") {
110 | cout << line << endl;
111 | cerr << "entry does not start with BEGIN IONS" << endl;
112 | return false;
113 | }
114 | buffer.append(line + "\n");
115 |
116 | while (getline(f, line)) {
117 |
118 | buffer.append(line + "\n");
119 | if (line.rfind("END IONS", 0) == 0) {
120 | return true;
121 | }
122 | }
123 |
124 |
125 | return true;
126 | }
127 |
128 |
129 | shared_ptr mgf_reader::read_spectrum_from_buffer(const string& buffer) { //TODO
130 |
131 | std::string line, tag, value;
132 | shared_ptr c_spectrum = std::make_shared();
133 |
134 | stringstream ss(buffer);
135 | while (getline(ss, line)) { // what if no colon -> colon_pos == string::npos
136 | if (ss.eof())
137 | return nullptr;
138 |
139 | // parse information
140 | if (line == "END IONS") {
141 | //c_spectrum->bin_peaks_sparse(true, true); // TODO YES NO?
142 | c_spectrum->root_scale_intensities();
143 | c_spectrum->normalize_intensities();
144 | return c_spectrum;
145 | }
146 | if (line == "BEGIN IONS") {
147 | continue;
148 | }
149 |
150 | // split up line to identify comment tags
151 | size_t separator_pos = line.find('=');
152 |
153 | if (separator_pos != string::npos) {
154 | tag = line.substr(0, separator_pos);
155 | value = line.substr(separator_pos + 1, string::npos);
156 |
157 | //parse information
158 | if (tag == "TITLE") {
159 | c_spectrum->name = value;
160 | c_spectrum->peptide = value.substr(0, value.find('/')); //TODO Does this work?
161 | } else if (tag == "PEPMASS") {
162 | c_spectrum->precursor_mass = stof(value); //todo check if that's actually true
163 | } else if (tag == "RTINSECONDS") {
164 |
165 | } else if (tag == "CHARGE") {
166 | c_spectrum->charge = stoi(value);
167 | }
168 | }
169 | else { // TODO make sure this works with long numbers. Otherwise implement like in msp reader
170 | // No separator: Assume peak information is noted down in the line
171 |
172 |
173 | if (line.empty())
174 | continue;
175 |
176 | std::size_t space_pos = line.find_first_of(" \t"); //Finds space or tab
177 | if (space_pos == string::npos)
178 | continue;
179 |
180 |
181 | float pos = stof(line.substr(0, space_pos));
182 | c_spectrum->peak_positions.push_back(pos);
183 |
184 |
185 | float intensity = stof(line.substr(space_pos, string::npos));
186 | c_spectrum->intensities.push_back(intensity);
187 |
188 | }
189 | }
190 | return nullptr;
191 | }
192 |
193 |
194 |
195 | bool mgf_reader::read_file_batch(fstream &infile, vector> &output_spectra, int batch_size) {
196 |
197 | if (!infile) {
198 | cerr << "Error reading file" << endl;
199 | exit(1);
200 | }
201 |
202 | int count = 0;
203 | std::shared_ptr c_spectrum = std::make_shared();
204 | while (!infile.eof()) {
205 | string line;
206 | getline(infile, line);
207 |
208 | if (line == "END IONS") {
209 | // Post-process and save the current spectrum
210 | //c_spectrum->intensity_bin_spanning_factor = -1.f; //TODO figure out if neighbor_spanning here
211 | //c_spectrum->bin_peaks(true,true); //TODO comment out
212 | c_spectrum->bin_peaks_sparse(true, true);
213 | c_spectrum->root_scale_intensities();
214 | c_spectrum->normalize_intensities(); //TODO put into one somehow
215 | output_spectra.push_back(c_spectrum);
216 | ++count;
217 | if (count==batch_size) {
218 | return false; //Indicating more batches to come
219 | }
220 | continue;
221 | }
222 |
223 | if (line == "BEGIN IONS") {
224 | c_spectrum = std::make_shared();
225 | continue;
226 | }
227 |
228 | // split up line to identify comment tags
229 | string tag, value;
230 | size_t separator_pos = line.find('=');
231 |
232 | if (separator_pos != string::npos) {
233 | tag = line.substr(0, separator_pos);
234 | value = line.substr(separator_pos + 1, string::npos);
235 |
236 | //parse information
237 | if (tag == "TITLE") {
238 | c_spectrum->name = value;
239 | } else if (tag == "PEPMASS") {
240 | c_spectrum->precursor_mass = stof(value); //todo check if that's actually true
241 | } else if (tag == "RTINSECONDS") {
242 |
243 | } else if (tag == "CHARGE") {
244 | c_spectrum->charge = stoi(value);
245 | }
246 | }
247 | else { // TODO make sure this works with long numbers. Otherwise implement like in msp reader
248 | // No separator: Assume peak information is noted down in the line
249 |
250 |
251 | if (line.empty())
252 | continue;
253 |
254 | std::size_t space_pos = line.find(' ');
255 | if (space_pos == string::npos)
256 | continue;
257 |
258 |
259 | float pos = stof(line.substr(0, space_pos));
260 | c_spectrum->peak_positions.push_back(pos);
261 |
262 |
263 | float intensity = stof(line.substr(space_pos, string::npos));
264 | c_spectrum->intensities.push_back(intensity);
265 |
266 |
267 | /*
268 | * alternatively run (but it is slower)
269 | istringstream ss(line);
270 | float pos, intensity;
271 | ss >> pos >> intensity;
272 | */
273 | }
274 | }
275 | infile.close();
276 | return true; //Indicating last batch was reached
277 |
278 | }
279 |
--------------------------------------------------------------------------------
/src/mgf_reader.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_MGF_READER_H
2 | #define SIMPLE_EXAMPLE_MGF_READER_H
3 |
4 | #include
5 | #include "spectrum.h"
6 |
7 |
8 |
9 | class mgf_reader {
10 | public:
11 | mgf_reader();
12 |
13 | static bool read_file(std::string path, std::vector> &output_spectra);
14 | static bool read_file_batch(std::fstream &infile, std::vector> &output_spectra, int batch_size);
15 | static bool read_next_entry_into_buffer(std::ifstream &f, std::string &buffer);
16 | static std::shared_ptr read_spectrum_from_buffer(const std::string& buffer);
17 |
18 | };
19 |
20 |
21 | #endif //SIMPLE_EXAMPLE_MGF_READER_H
22 |
--------------------------------------------------------------------------------
/src/msp_reader.cpp:
--------------------------------------------------------------------------------
1 | #include "msp_reader.h"
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | using namespace std;
10 |
11 | std::fstream msp_reader::infile;
12 |
13 |
14 | bool msp_reader::read_file(string &path, vector> &output_spectra) {
15 |
16 | infile.open(path, ios::in);
17 | if (!infile) {
18 | cerr << "Could not open file at " << path << endl;
19 | return false;
20 | }
21 |
22 | string line;
23 | while (!infile.eof()) {
24 | string tag, value;
25 | std::shared_ptr c_spectrum = std::make_shared();
26 | while (tag != "Num peaks") { // what if no colon -> colon_pos == string::npos
27 | if (infile.eof())
28 | return false;
29 | getline(infile, line);
30 |
31 | // split up line to identify comment tags
32 | size_t colon_pos = line.find(':');
33 |
34 | tag = line.substr(0, colon_pos);
35 | value = line.substr(colon_pos + 2, string::npos);
36 |
37 | // parse information
38 | if (tag == "Name") {
39 | c_spectrum = std::make_shared();
40 | c_spectrum->name = value;
41 | c_spectrum->peptide = value.substr(0, value.find('/'));
42 | c_spectrum->charge = stoi(value.substr(value.rfind('/') + 1, string::npos));
43 | } else if (tag == "MW") {
44 | c_spectrum->precursor_mass = stof(value);
45 | } else if (tag == "Comment") {
46 |
47 | }
48 | }
49 | //else case: tag = Num peak_positions
50 | int num_peaks = stoi(value);
51 | for (int i = 0; i < num_peaks; ++i) {
52 | getline(infile, value, '\t');
53 | float peak = stof(value);
54 | getline(infile, value, '\t');
55 | float intensity = stof(value);
56 |
57 | getline(infile, value);
58 |
59 |
60 |
61 |
62 | // parse peak_positions and intensities
63 |
64 | /*std::size_t tab_pos = line.find('\t');
65 | float peak = stof(line.substr(0, tab_pos));
66 | float intensity = stof(line.substr(tab_pos + 1, line.find('\t')));
67 | */
68 |
69 |
70 | c_spectrum->peak_positions.push_back(peak);
71 | c_spectrum->intensities.push_back(intensity);
72 | }
73 | //c_spectrum->intensity_bin_spanning_factor = -1.f; //TODO figure out if neighbor_spanning here
74 | c_spectrum->bin_peaks(true,true);
75 | //c_spectrum->bin_peaks_sparse(true, true); TODO uncomment!!!
76 | output_spectra.push_back(c_spectrum);
77 |
78 | }
79 |
80 |
81 | infile.close();
82 | return true;
83 | }
84 |
85 | bool msp_reader::read_spectra_from_positions(string &path, vector &precursor_list, vector &output_spectra) {
86 |
87 | output_spectra.reserve(precursor_list.size());
88 | infile.open(path);
89 | for (int i = 0; i < precursor_list.size(); ++i) {
90 |
91 | unsigned long start = precursor_list[i]->offset_begin;
92 | unsigned long end = precursor_list[i]->offset_end;
93 |
94 | infile.seekg(start);
95 | std::string s;
96 | //cout << precursor_list[i]->name << endl;
97 | //cout << start << " " << end << " " << end - start << endl;
98 | //if (end > 1844674407)
99 | // continue;
100 | s.resize(end - start);
101 | infile.read(&s[0], end - start);
102 |
103 | output_spectra.push_back(read_spectrum_from_buffer(s).get()); //TODO .get() is messing with pointers again!!! REFACTOR
104 |
105 | delete output_spectra.back();
106 | output_spectra.pop_back();
107 | }
108 |
109 | return false;
110 | }
111 |
112 | shared_ptr msp_reader::read_spectrum_from_buffer(const string& buffer) {
113 |
114 | string line, tag, value;
115 | shared_ptr c_spectrum = std::make_shared();
116 |
117 | stringstream ss(buffer);
118 | while (tag != "Num peaks") { // what if no colon -> colon_pos == string::npos
119 | if (ss.eof())
120 | return nullptr;
121 | getline(ss, line);
122 | // split up line to identify comment tags
123 | size_t colon_pos = line.find(':');
124 |
125 | tag = line.substr(0, colon_pos);
126 | value = line.substr(colon_pos + 2, string::npos);
127 |
128 | // parse information
129 | if (tag == "Name") {
130 | c_spectrum = std::make_shared();
131 | c_spectrum->name = value;
132 | c_spectrum->peptide = value.substr(0, value.find('/'));
133 | c_spectrum->charge = stoi(value.substr(value.rfind('/') + 1, string::npos));
134 | } else if (tag == "MW") {
135 | c_spectrum->precursor_mass = stof(value);
136 | } else if (tag == "Comment") {
137 |
138 | }
139 | }
140 | //else case: tag = Num peak_positions
141 | int num_peaks = stoi(value);
142 | for (int i = 0; i < num_peaks; ++i) {
143 | getline(ss, value, '\t');
144 | //std::cout << "X" << value << "X" << std::endl;
145 |
146 | float peak = stof(value);
147 | getline(ss, value, '\n');
148 |
149 | //std::cout << "X" << value << "X" << std::endl;
150 | float intensity = stof(value);
151 | //std::cout << intensity << std::endl;
152 |
153 | //getline(ss, value);
154 |
155 |
156 |
157 |
158 | // parse peak_positions and intensities
159 |
160 | /*std::size_t tab_pos = line.find('\t');
161 | float peak = stof(line.substr(0, tab_pos));
162 | float intensity = stof(line.substr(tab_pos + 1, line.find('\t')));
163 | */
164 |
165 |
166 | c_spectrum->peak_positions.push_back(peak);
167 | c_spectrum->intensities.push_back(intensity);
168 | }
169 | //c_spectrum->intensity_bin_spanning_factor = -1.f; //TODO figure out if neighbor_spanning here
170 | //c_spectrum->bin_peaks(true,true);
171 | //c_spectrum->bin_peaks_sparse(true, true);
172 | c_spectrum->root_scale_intensities();
173 | c_spectrum->normalize_intensities();
174 | return c_spectrum;
175 | }
176 |
177 |
178 | bool msp_reader::read_file_precursors(string &path, vector &precursor_list) {
179 |
180 | infile.open(path, ios::in);
181 | if (!infile) {
182 | cerr << "Could not open file at " << path << endl;
183 | return false;
184 | }
185 |
186 | string line;
187 | while (!infile.eof()) {
188 | string tag, value;
189 | precursor *parent;
190 | while (tag != "Num peaks") { // what if no colon -> colon_pos == string::npos
191 | if (infile.eof())
192 | return false;
193 | getline(infile, line);
194 |
195 | // split up line to identify comment tags
196 | size_t colon_pos = line.find(':');
197 |
198 | tag = line.substr(0, colon_pos);
199 | value = line.substr(colon_pos + 2, string::npos);
200 |
201 | // parse information
202 | if (tag == "Name") {
203 | parent = new precursor();
204 | parent->offset_begin = infile.tellg();
205 | parent->offset_begin -= (line.length() + 1);
206 | parent->name = value;
207 | //parent->peptide = value.substr(0, value.find('/'));
208 | parent->charge = stoi(value.substr(value.rfind('/') + 1, string::npos));
209 | } else if (tag == "MW") {
210 | parent->mz = stof(value);
211 | } else if (tag == "Comment") {
212 |
213 | }
214 | }
215 | //else case: tag = Num peak_positions
216 | int num_peaks = stoi(value);
217 | for (int i = 0; i < num_peaks; ++i) {
218 | //Skip lines
219 | getline(infile, value);
220 | }
221 | parent->offset_end = infile.tellg();
222 | precursor_list.push_back(parent);
223 | }
224 | infile.clear();
225 | infile.seekg(0, ios::end);
226 | precursor_list.back()->offset_end = infile.tellg();
227 |
228 |
229 | infile.close();
230 | return true;
231 | }
232 |
233 | bool msp_reader::read_file_precursors_efficient(string &path, vector &precursor_list) {
234 |
235 |
236 |
237 | infile.open(path, ios::in);
238 | if (!infile) {
239 | cerr << "Could not open file at " << path << endl;
240 | return false;
241 | }
242 |
243 |
244 | precursor *parent = new precursor();
245 | string line;
246 | unsigned int id = 0;
247 | while (!infile.eof()) {
248 | getline(infile, line);
249 |
250 | if (line.rfind("Name:", 0) == 0) { // rightmost match, but starting at pos 0 (or earlier), i.e. prefix
251 |
252 | unsigned long start = infile.tellg();
253 | start -= (line.length() + 1);
254 |
255 | //Push back previous element
256 | parent->offset_end = start - 1;
257 | precursor_list.push_back(parent);
258 |
259 | //Init new element
260 | parent = new precursor();
261 | parent->id = id;
262 | parent->offset_begin = start;
263 | parent->charge = stoi(line.substr(line.rfind('/') + 1, string::npos));
264 | ++id;
265 | }
266 |
267 | if (line.rfind("MW:", 0) == 0) {
268 | parent->mz = stof(line.substr(4, string::npos));
269 | }
270 |
271 | }
272 |
273 | // Adding last element
274 | infile.clear();
275 | infile.seekg(0, ios::end);
276 | parent->offset_end = infile.tellg();
277 | precursor_list.push_back(parent);
278 |
279 | // Deleting first element (dummy)
280 | precursor_list.erase(precursor_list.begin()); //inefficient (but called once)
281 |
282 | infile.close();
283 |
284 | return true;
285 | }
286 |
287 | bool msp_reader::read_next_entry_into_buffer(ifstream &f, string &buffer) {
288 | /*
289 | * Requires open filestream and reads until end a new entry
290 | */
291 |
292 | buffer.clear();
293 |
294 | string line;
295 |
296 | if(!getline(f, line)) {
297 | return false;
298 | }
299 |
300 | if (line.rfind("Name:", 0) != 0) {
301 | cout << line << endl;
302 | cerr << "entry does not start with Name:" << endl;
303 | return false;
304 | }
305 | buffer.append(line + "\n");
306 |
307 | while (getline(f, line)) {
308 | if (line.rfind("Name:", 0) == 0) { // rightmost match, but starting at pos 0 (or earlier), i.e. prefix
309 |
310 | //Jump back to line beginning and return buffer
311 | f.seekg(-(line.length() + 1), ios::cur);
312 | return true;
313 |
314 | }
315 |
316 | buffer.append(line + "\n");
317 |
318 | }
319 |
320 |
321 | return true;
322 | }
323 |
324 | bool msp_reader::continue_read_file(vector> &output_spectra) {
325 |
326 |
327 | return false;
328 | }
329 |
330 |
--------------------------------------------------------------------------------
/src/msp_reader.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_MSP_READER_H
2 | #define SIMPLE_EXAMPLE_MSP_READER_H
3 |
4 | #include "spectrum.h"
5 | #include "precursor_index.h"
6 | #include
7 |
8 |
9 | class msp_reader {
10 |
11 | static std::fstream infile;
12 |
13 |
14 | public:
15 |
16 | static bool read_file(std::string &path, std::vector> &output_spectra);
17 | static bool continue_read_file(std::vector> &output_spectra);
18 | static bool read_file_precursors(std::string &path, std::vector &precursor_list);
19 | static bool read_file_precursors_efficient(std::string &path, std::vector &precursor_list);
20 |
21 | static bool read_spectra_from_positions(std::string &path, std::vector &precursor_list, std::vector &output_spectra);
22 | static bool read_next_entry_into_buffer(std::ifstream &f, std::string &buffer);
23 | static std::shared_ptr read_spectrum_from_buffer(const std::string& buffer);
24 |
25 |
26 | };
27 |
28 |
29 | #endif //SIMPLE_EXAMPLE_MSP_READER_H
30 |
--------------------------------------------------------------------------------
/src/naive_search.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include "spectrum.h"
4 | #include "msp_reader.h"
5 | #include "scores.h"
6 | #include "spectral_search.h"
7 | #include "library.h"
8 |
9 |
10 | using namespace std;
11 |
12 | int main() {
13 | cout << "This is a naive search" << endl;
14 | //FeatureMap fm;
15 | //Feature feature;
16 | //PeakSpectrum p;
17 |
18 | //fm.push_back(feature);
19 |
20 |
21 | //string msp_file = "/home/ynowatzk/data/pyro_fur/PyroFur_reproduced.msp";
22 | string msp_file = "/home/ynowatzk/data/9MM/msp/";
23 | string mgf_file = "/home/ynowatzk/data/9MM/mgf/9MM_FASP.mgf"; //
24 |
25 |
26 | shared_ptr search_lib = make_shared(mgf_file);
27 |
28 | auto start = chrono::high_resolution_clock::now();
29 | shared_ptr lib = make_shared(msp_file);
30 | auto stop = chrono::high_resolution_clock::now();
31 | auto duration = duration_cast(stop - start);
32 | cout << "Loading Time: " << duration.count() << " seconds" << endl;
33 |
34 |
35 |
36 |
37 | /*
38 | * Search
39 | */
40 |
41 | start = chrono::high_resolution_clock::now();
42 | spectral_search search(search_lib, lib);
43 | search.search_target_library();
44 | stop = chrono::high_resolution_clock::now();
45 | duration = duration_cast(stop - start);
46 | cout << "Loading Time: " << duration.count() << " seconds" << endl;
47 |
48 | search.save_results_to_file("./naive_search_results.csv");
49 | //lib->build_library_index();
50 |
51 |
52 | /*
53 | * SEARCH
54 | */
55 |
56 | /*
57 | //Rescoring of spectrast results
58 | cout << "Reading in" << endl;
59 | search.read_results_from_file(R"(C:\Users\ynowatzk\Desktop\data\pyrococcus_furiosus\results\reproduced\sp5_neighbors_matches.tsv)");
60 | cout << "Rescoring" << endl;
61 | search.rescore_matches();
62 | cout << "Saving" << endl;
63 | search.save_results_to_file(R"(C:\Users\ynowatzk\Desktop\data\pyrococcus_furiosus\results\reproduced\sp5_neighbors_rescored.tsv)");
64 | */
65 |
66 |
67 | //cout << "Searching fragment ion index" << endl;
68 | //start = chrono::high_resolution_clock::now();
69 | //search.search_target_library();
70 | //stop = chrono::high_resolution_clock::now();
71 | //duration = duration_cast(stop - start);
72 |
73 | //cout << "Search Time: " << duration.count() << " seconds" << endl;
74 | //search.save_results_to_file("FIIndex.csv");
75 | //exit(12);
76 |
77 | /*
78 | auto start = chrono::high_resolution_clock::now();
79 | search.search_target_library();
80 | auto stop = chrono::high_resolution_clock::now();
81 | auto duration = duration_cast(stop - start);
82 |
83 | cout << "Search Time: " << duration.count() << " seconds" << endl;
84 |
85 |
86 | vector matches = search.get_results();
87 |
88 | for (int i = 0; i < 10; ++i) {
89 | cout << matches[i].query_spectrum->name << " " << matches[i].matched_spectrum->peptide << " " << matches[i].dot_product << endl;
90 | }
91 |
92 | search.save_results_to_file("./out.csv");
93 |
94 | */
95 | return 0;
96 | }
97 |
98 |
--------------------------------------------------------------------------------
/src/precursor_index.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "precursor_index.h"
6 | #include "index_file_writer.h"
7 | #include "index_file_reader.h"
8 |
9 |
10 | using namespace std;
11 |
12 | int precursor_index::get_size() {
13 | return precursors.size();
14 | }
15 |
16 | /*spectrum *precursor_index::get_spectrum(int i) {
17 | return spectra[i];
18 | }*/
19 |
20 | float precursor_index::get_max_precursor_mass() {
21 | return precursors[ranking.back()].mz;
22 | }
23 | /*
24 | //TODO add self chosen bounds
25 | int precursor_index::get_lower_bound(int charge, float min_mass) {
26 |
27 | int lb = std::lower_bound(spectra.begin(), spectra.end(), make_pair(charge, min_mass), [](spectrum *s, pair charge_mass_tuple) {
28 | return *s < charge_mass_tuple;
29 | }) - spectra.begin();
30 |
31 | return lb;
32 | }
33 |
34 |
35 | int precursor_index::get_upper_bound(int charge, float max_mass) {
36 |
37 | int ub = std::upper_bound(spectra.begin(), spectra.end(), make_pair(charge, max_mass), [](pair charge_mass_tuple, spectrum *s) {
38 | return !(*s <= charge_mass_tuple);
39 | }) - spectra.begin();
40 |
41 | return ub - 1;
42 | }
43 | */
44 |
45 | precursor_index::precursor_index() {
46 |
47 | }
48 |
49 | int precursor_index::get_lower_bound(int charge, float min_mass) {
50 | int lb = std::lower_bound(ranking.begin(), ranking.end(), make_pair(charge, min_mass), [&](unsigned int rank, pair charge_mass_tuple) {
51 | return precursors[rank] < charge_mass_tuple;
52 | }) - ranking.begin();
53 |
54 | return lb;
55 | }
56 |
57 |
58 | int precursor_index::get_upper_bound(int charge, float max_mass) {
59 | int ub = std::upper_bound(ranking.begin(), ranking.end(), make_pair(charge, max_mass), [&](pair charge_mass_tuple, unsigned int rank) {
60 | return !(precursors[rank] <= charge_mass_tuple);
61 | }) - ranking.begin();
62 |
63 | return ub - 1;
64 | }
65 |
66 | bool precursor_index::sort_index() {
67 |
68 | if (!(id_counter == precursors.size() && id_counter == ranking.size())) {
69 | cerr << "Number of recorded precursors does not match precursor ids :: Required to warrant correct mapping of id to precursors" << endl;
70 | return false;
71 | }
72 |
73 |
74 | sort(ranking.begin(), ranking.end(), [&](unsigned int a, unsigned int b) {
75 | return precursors[a] < precursors[b];
76 | });
77 |
78 | for (int i = 0; i < ranking.size(); ++i) {
79 | unsigned int id = ranking[i];
80 | precursors[id].rank = i;
81 | }
82 |
83 | return true;
84 | }
85 |
86 | precursor &precursor_index::get_precursor(unsigned int id) {
87 | return precursors[id];
88 | }
89 |
90 | precursor &precursor_index::get_precursor_by_rank(unsigned int id) {
91 | return precursors[ranking[id]];
92 | }
93 |
94 | precursor &precursor_index::record_new_precursor(const shared_ptr& spec) {
95 | precursors.emplace_back(precursor(id_counter, spec->precursor_mass, spec->charge, spec->peptide));
96 | ranking.push_back(id_counter);
97 | ++id_counter;
98 | return precursors.back();
99 | }
100 |
101 | precursor &precursor_index::record_new_precursor(float mz, int charge, std::string peptide) {
102 | precursors.emplace_back(precursor(id_counter, mz, charge, peptide));
103 | ranking.push_back(id_counter);
104 | ++id_counter;
105 | return precursors.back();
106 | }
107 |
108 | unsigned int precursor_index::get_rank(unsigned int id) {
109 | return precursors[id].rank;
110 | }
111 |
112 | bool precursor_index::save_index_to_file(const string &file_path) {
113 |
114 | //Saving spectrum bookmarks (precursor info)
115 | index_file_writer::save_precursor_index(file_path, precursors);
116 |
117 |
118 | return true;
119 | }
120 |
121 | bool precursor_index::load_index_from_file(const string &file_path) {
122 | //index_file_reader::read_file_into_precursor_index(file_path, make_shared(*this));
123 |
124 | std::ifstream f(file_path, std::ios::in);
125 | std::string delimiter = ";";
126 | std::string line;
127 |
128 | if (!getline(f, line)) {
129 | return false;
130 | }
131 |
132 | if (line.rfind("Num: ", 0) != 0) {
133 | std::cerr << "Incorrect file format" << std::endl;
134 | return false;
135 | }
136 |
137 | //Read header
138 | unsigned int size = std::stoi(line.substr(5, std::string::npos)); //TODO check 4 or 5
139 | set_size(size);
140 |
141 | // Parse precursors line by line
142 | //precursor_idx->add_precursor_record(p);
143 | while (getline(f, line)) {
144 |
145 | size_t delim_pos = line.find(delimiter);
146 | unsigned int id = std::stoi(line.substr(0, delim_pos));
147 |
148 | size_t length = line.find(delimiter, delim_pos + 1) - delim_pos;
149 | unsigned int rank = std::stoi(line.substr(delim_pos + 1, length - 1));
150 |
151 | delim_pos = delim_pos + length;
152 | length = line.find(delimiter, delim_pos + 1) - delim_pos;
153 | float mz = std::stof(line.substr(delim_pos + 1, length - 1));
154 |
155 | delim_pos = delim_pos + length;
156 | length = line.find(delimiter, delim_pos + 1) - delim_pos;
157 | int charge = std::stoi(line.substr(delim_pos + 1, length - 1));
158 | std::string peptide = line.substr(delim_pos + length + 1, std::string::npos);
159 |
160 | add_precursor_record(precursor(id, rank, mz, charge, peptide));
161 | }
162 |
163 | f.close();
164 |
165 | if (get_size() != size) {
166 | std::cerr << "Wrong number of precursors" << std::endl;
167 | }
168 |
169 | //TODO delete this if not used anymore
170 | for (auto & precursor : precursors) {
171 | to_rank.push_back(precursor.rank);
172 | }
173 |
174 | return true;
175 | }
176 |
177 | bool precursor_index::add_precursor_record(const precursor& p) {
178 | precursors.emplace_back(p); //TODO test this
179 | ranking[p.rank] = p.id;
180 | return true;
181 | }
182 |
183 | bool precursor_index::set_size(unsigned int size) {
184 | precursors.reserve(size);
185 | ranking.resize(size);
186 |
187 | return true;
188 | }
189 |
190 | bool precursor_index::save_index_to_binary_file(const string &file_path) {
191 |
192 | //Saving spectrum bookmarks (precursor info)
193 | index_file_writer::save_precursor_index_to_binary_file(file_path, precursors);
194 |
195 | return true;
196 |
197 | }
198 |
199 | bool precursor_index::load_index_from_binary_file(const string &file_path) {
200 | ifstream f(file_path, ios::binary | ios::in);
201 |
202 | //Read header
203 | unsigned int size;
204 | f.read((char *) &size, sizeof(unsigned int));
205 | set_size(size);
206 |
207 | unsigned int id, rank;
208 | float mz;
209 | int charge;
210 | size_t pep_size;
211 | std::string peptide;
212 | while (f.read((char *) &id, sizeof(unsigned int))) {
213 | f.read((char *) &rank, sizeof(unsigned int));
214 | f.read((char *) &mz, sizeof(float));
215 | f.read((char *) &charge, sizeof(int));
216 | f.read((char *) &pep_size, sizeof(size_t));
217 | peptide.resize(pep_size);
218 | f.read((char *) &peptide[0], pep_size);
219 |
220 | add_precursor_record(precursor(id, rank, mz, charge, peptide));
221 | }
222 |
223 | f.close();
224 |
225 | if (get_size() != size) {
226 | std::cerr << "Wrong number of precursors" << std::endl;
227 | std::cout << get_size() << " " << size << endl;
228 | }
229 |
230 | //TODO delete this if not used anymore
231 | for (auto & precursor : precursors) {
232 | to_rank.push_back(precursor.rank);
233 | }
234 |
235 | return true;
236 |
237 | }
238 |
239 |
240 |
--------------------------------------------------------------------------------
/src/precursor_index.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_PRECURSOR_INDEX_H
2 | #define SIMPLE_EXAMPLE_PRECURSOR_INDEX_H
3 | #include
4 | #include
5 | #include "spectrum.h"
6 |
7 |
8 |
// Bookmark of a single spectrum: its key values plus, for file scans, the location of the
// entry in the source file. Precursors are ordered by charge first, then by mz.
// All numeric members start at zero so that partially filled records hold defined values.
struct precursor {
    /*
     * Key values
     */
    unsigned int id = 0;
    unsigned int rank = 0; //unsure if needed here
    float mz = 0.f;
    int charge = 0;
    std::string peptide;

    /*
     * special cases
     */

    unsigned long offset_begin = 0;
    unsigned long offset_end = 0;
    std::string name;

    precursor() = default;
    // Record without a rank (rank stays 0 until it is assigned)
    precursor(unsigned int id, float mass, int charge, std::string peptide="") : id(id), mz(mass), charge(charge), peptide(peptide) {}
    // Fully specified record
    precursor(unsigned int id, unsigned int rank, float mass, int charge, std::string peptide="") : id(id), rank(rank), mz(mass), charge(charge), peptide(peptide) {}

    // Strict ordering by (charge, mz)
    bool operator<(const precursor &other) const {
        return charge < other.charge || (charge == other.charge && mz < other.mz);
    }

    // Same ordering against a plain (charge, mz) tuple
    bool operator<(std::pair<int, float> charge_mass_tuple) const {
        return charge < charge_mass_tuple.first || (charge == charge_mass_tuple.first && mz < charge_mass_tuple.second);
    }

    // Like operator<, but an equal mz within the same charge also compares true
    bool operator<=(std::pair<int, float> charge_mass_tuple) const {
        return charge < charge_mass_tuple.first || (charge == charge_mass_tuple.first && mz <= charge_mass_tuple.second);
    }


};
45 |
46 |
47 | class precursor_index {
48 |
49 | // Contains all spectrum bookmarks (precursors), sorted first by charge, then by precursor mz
50 | std::vector precursors;
51 | std::vector ranking;
52 | unsigned int id_counter = 0;
53 |
54 |
55 | public:
56 | std::vector to_rank;
57 | precursor_index(); //Init empty index
58 | precursor& record_new_precursor(const std::shared_ptr& spec);
59 | precursor& record_new_precursor(float mz, int charge, std::string peptide);
60 | bool add_precursor_record(const precursor& p);
61 |
62 | bool sort_index();
63 | bool save_index_to_file(const std::string &file_path);
64 | bool save_index_to_binary_file(const std::string &file_path);
65 | bool load_index_from_file(const std::string &file_path);
66 | bool load_index_from_binary_file(const std::string &file_path);
67 |
68 |
69 | int get_size();
70 | bool set_size(unsigned int size);
71 | int get_lower_bound(int charge, float min_mass);
72 | int get_upper_bound(int charge, float max_mass);
73 | float get_max_precursor_mass();
74 |
75 | precursor& get_precursor(unsigned int id);
76 | precursor& get_precursor_by_rank(unsigned int id);
77 | unsigned int get_rank(unsigned int id);
78 |
79 | };
80 |
81 |
82 | #endif //SIMPLE_EXAMPLE_PRECURSOR_INDEX_H
83 |
--------------------------------------------------------------------------------
/src/quick_scan.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include "scanner.h"
4 | #include "msp_reader.h"
5 | #include "library.h"
6 | #include "spectral_search.h"
7 |
8 | using namespace std;
9 |
10 | int main() {
11 |
12 | cout << "Hello Quick Scan" << endl;
13 |
14 | string directory = "/home/ynowatzk/data/9MM/msp/Brevibacillus+laterosporus.msp";
15 | scanner *sc = new scanner();
16 |
17 | cout << "Scanning input:" << endl;
18 | auto start = chrono::high_resolution_clock::now();
19 | sc->scan_file(directory);
20 | auto stop = chrono::high_resolution_clock::now();
21 | auto duration = duration_cast(stop - start);
22 | cout << "Scan Time: " << duration.count() << " seconds" << endl;
23 |
24 | sc->analyze();
25 | sc->print_scan_results();
26 | sc->save_precursor_distribution_to_file("./precursors.txt");
27 |
28 |
29 | cout << "Loading spectra from saved positions" << endl;
30 | start = chrono::high_resolution_clock::now();
31 | msp_reader::read_spectra_from_positions(directory, sc->parents, sc->specs);
32 | stop = chrono::high_resolution_clock::now();
33 | duration = duration_cast(stop - start);
34 | cout << "Loading Time: " << duration.count() << " seconds" << endl;
35 |
36 |
37 | // COMPARE SEARCH RESULTS to make sure reading worked
38 | string mgf_file = "/home/ynowatzk/data/9MM/mgf/9MM_FASP.mgf";
39 | library *search_lib = new library(mgf_file);
40 | library *lib = new library(sc->specs);
41 | lib->build_library_index();
42 |
43 | spectral_search search(search_lib, lib);
44 | cout << "Searching fragment ion index" << endl;
45 | start = chrono::high_resolution_clock::now();
46 | search.search_target_library();
47 | stop = chrono::high_resolution_clock::now();
48 | duration = duration_cast(stop - start);
49 |
50 | cout << "Search Time: " << duration.count() << " seconds" << endl;
51 | search.save_results_to_file("FIIndex2.csv");
52 |
53 |
54 | return 0;
55 | }
--------------------------------------------------------------------------------
/src/scanner.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "scanner.h"
5 | #include "msp_reader.h"
6 | #include "mgf_reader.h"
7 |
8 | using namespace std;
9 |
10 | scanner::scanner() {
11 |
12 | }
13 |
14 | bool scanner::scan_directory(string path) {
15 | cout << "Scanning directory: " << path << endl;
16 | for (const auto & entry : std::filesystem::directory_iterator(path)) {
17 | scan_file(entry.path().string());
18 | }
19 | return true;
20 | }
21 |
22 | bool scanner::scan_file(string path) {
23 | cout << "Scanning: " << path << endl;
24 |
25 |
26 | string extension = path.substr(path.rfind('.') + 1, string::npos);
27 |
28 | if (extension == "msp") {
29 | if (!msp_reader::read_file_precursors_efficient(path, parents)) {
30 | cout << "Error reading file: " << path << endl;
31 | return false;
32 | }
33 |
34 | ++no_files_read;
35 | int size = std::filesystem::file_size(path) / 1024;
36 | cout << "File size " << size << " KB" << endl;
37 | kb_lib_size += size;
38 | }
39 | else if (extension == "mgf") {
40 | /*if (!mgf_reader::read_file(path, soondeleted())) {
41 | cout << "Error reading file: " << path << endl;
42 | return false;
43 | }*/
44 | cout << ".mgf quick scan not implemented yet" << endl;
45 | return false;
46 | }
47 | else {
48 | cout << "Unknown file extension" << endl;
49 | return false;
50 | }
51 |
52 | return true;
53 |
54 |
55 | }
56 |
57 | bool scanner::analyze() {
58 |
59 | sort(parents.begin(), parents.end(), [](const precursor *a, const precursor *b) {
60 | return *a < *b;
61 | });
62 |
63 | return true;
64 | }
65 |
66 | bool scanner::save_precursor_distribution_to_file(string path, string delimiter) {
67 |
68 | fstream outfile;
69 | outfile.open(path, ios::out);
70 |
71 | if (!outfile.good())
72 | return false;
73 |
74 | // Add header
75 | outfile << "mz"+delimiter+"charge" << endl;
76 |
77 | // Go through matches and parse relevant information for each
78 | for (int i = 0; i < parents.size(); ++i) {
79 | precursor *p = parents[i];
80 | outfile << p->mz << delimiter << p->charge << endl;
81 | }
82 |
83 | outfile.close();
84 | return true;
85 |
86 | return false;
87 | }
88 |
89 | bool scanner::print_scan_results() {
90 |
91 | cout << "Readable files detected: " << no_files_read << endl;
92 | cout << "Total size: " << kb_lib_size / 1024 << " MB (" << float(kb_lib_size) / float(1024*1024) << " GB)" << endl;
93 |
94 | return false;
95 | }
96 |
--------------------------------------------------------------------------------
/src/scanner.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_SCANNER_H
2 | #define SIMPLE_EXAMPLE_SCANNER_H
3 |
4 | #include
5 | #include "spectrum.h"
6 |
7 | class scanner {
8 | private:
9 |
10 | int no_files_read = 0;
11 | int kb_lib_size = 0;
12 |
13 |
14 | public:
15 | scanner();
16 | bool scan_directory(std::string path);
17 | bool scan_file(std::string path);
18 |
19 | bool analyze();
20 | bool save_precursor_distribution_to_file(std::string path, std::string delimiter="\t");
21 |
22 | bool print_scan_results();
23 |
24 | std::vector parents;
25 | std::vector specs;
26 | };
27 |
28 |
29 | #endif //SIMPLE_EXAMPLE_SCANNER_H
30 |
--------------------------------------------------------------------------------
/src/scores.cpp:
--------------------------------------------------------------------------------
1 | #include "scores.h"
2 | #include
3 | #include
4 |
5 | using namespace std;
6 |
7 | float scores::dot_product(vector &target_bins, vector &other_bins) {
8 | float dot = 0.f;
9 | int num_bins = 0;
10 |
11 | for (int i = 0; i < target_bins.size(); ++i) {
12 | dot += target_bins[i] * other_bins[i];
13 | /*if (target_bins[i] * other_bins[i] > 0) {
14 | //cout << target_bins[i] << " * " << other_bins[i] << " = " << target_bins[i] * other_bins[i] << endl;
15 | ++num_bins;
16 | }*/
17 | } //TODO try and compare runtime for iterator
18 |
19 | /*float m1 = 0;
20 | for (float f:target_bins) {
21 | m1 += f*f;
22 | }
23 | m1 = sqrt(m1);
24 |
25 | float m2 = 0;
26 | for (float f: other_bins) {
27 | m2 += f*f;
28 | }
29 | m2 = sqrt(m2);
30 |
31 | cout << m1 << " " << m2 << " " << dot << " " << dot / (m1 * m2) << endl;*/
32 | //cout << " no. " << num_bins << " ";
33 | return dot;
34 | }
35 |
--------------------------------------------------------------------------------
/src/scores.h:
--------------------------------------------------------------------------------
1 | #ifndef SIMPLE_EXAMPLE_SCORES_H
2 | #define SIMPLE_EXAMPLE_SCORES_H
3 | #include
4 |
// Collection of similarity scores for binned spectra.
class scores {
public:
    // Unnormalized dot product of two binned intensity vectors
    static float dot_product(std::vector<float> &target_bins, std::vector<float> &other_bins);
};
9 |
10 |
11 | #endif //SIMPLE_EXAMPLE_SCORES_H
12 |
--------------------------------------------------------------------------------
/src/search_index.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "settings.h"
6 | #include "search_manager.h"
7 | #include "thread_pool.h"
8 |
9 | using namespace std;
10 |
11 | cxxopts::ParseResult parseArgs(int argc, const char* argv[]) {
12 | try {
13 | for (int i = 0; i < argc; ++i) {
14 | settings::search_command += argv[i];
15 | settings::search_command += " ";
16 | }
17 | settings::search_command.pop_back();
18 |
19 | cxxopts::Options options("mistle-search", "Search experimental mass spectra in mistle fragment ion index");
20 |
21 | options.positional_help("[optional args]").show_positional_help();
22 |
23 | options.add_options()
24 | ("h, help", "Print this help message")
25 | ("s,search", "search file or directory ", cxxopts::value(), "PATH")
26 | ("i,index", "index directory (must contain config.txt and binary index files)", cxxopts::value(), "PATH")
27 | ("o,output", "output path", cxxopts::value()->default_value("./results.csv"), "NAME")
28 | ("t,threads", "number of threads", cxxopts::value()->default_value("1"), "NUM")
29 | ("p,ppm_tolerance", "precursor mz tolerance given in ppm", cxxopts::value()->default_value("10"), "NUM")
30 | ("m,mz_tolerance", "precursor mz tolerance (absolut value in Da)", cxxopts::value(), "NUM")
31 | ("b,bin_size", "bin size for fragment ion binning (in Da)", cxxopts::value()->default_value("1"), "NUM")
32 | ("hits_per_spectrum", "number of output matches per input spectrum", cxxopts::value()->default_value("1"), "NUM")
33 | ("reduce_noise_in_window", "Apply noise reduction with the top X peaks in window w approach (default: off)", cxxopts::value()->default_value("false"))
34 | ("peaks_per_window", "number of peaks per window", cxxopts::value()->default_value("5"), "NUM")
35 | ("window_size", "window size", cxxopts::value()->default_value("100.0"), "NUM");
36 | //("neighbors", "number of neighboring bins intensity is carried over (on search spectrum peaks)", cxxopts::value()->default_value("0"), "NUM")
37 | //("neighbors_intensity_factor", "fraction [0, 1] of intensity carried over to neighboring bin(s)", cxxopts::value()->default_value("0.5"), "NUM")
38 | //("B,batch_size", "number of mass spectra loaded in a batch", cxxopts::value(), "NUM");
39 |
40 |
41 | options.parse_positional({"search", "index"});
42 |
43 | auto result = options.parse(argc,argv);
44 |
45 |
46 | if (result.count("help")) {
47 | std::cout << options.help() << std::endl;
48 | exit(0);
49 | }
50 | if (result.count("search")) {
51 | settings::search_path = result["search"].as();
52 | } else {
53 | std::cerr << "Missing input: -s/--search" << std::endl;
54 | exit(1);
55 | }
56 | if (result.count("index")) {
57 | settings::index_path = result["index"].as();
58 | if (!settings::index_path.ends_with('/')) {
59 | settings::index_path += "/";
60 | }
61 | } else {
62 | std::cerr << "Missing input: -i/--index" << std::endl;
63 | exit(1);
64 | }
65 | settings::output_path = result["output"].as();
66 |
67 | if (result.count("threads")) {
68 | settings::num_threads = result["threads"].as();
69 | settings::parallel = (settings::num_threads > 1);
70 | }
71 | if (result.count("hits_per_spectrum")) {
72 | settings::num_hit_ranks = result["hits_per_spectrum"].as();
73 | }
74 | if (result.count("mz_tolerance")) {
75 | settings::mz_tolerance = result["mz_tolerance"].as();
76 | settings::use_ppm_tolerance = false;
77 | }
78 | if (result.count("ppm_tolerance")) {
79 | settings::use_ppm_tolerance = true;
80 | settings::ppm_tolerance = result["ppm_tolerance"].as();
81 | settings::ppm_factor = settings::ppm_tolerance / 1000000.0f;
82 | if (result.count("mz_tolerance")) {
83 | cerr << "Precursor mz tolerance given in ppm and Dalton. Please choose one." << endl;
84 | exit(1);
85 | }
86 | }
87 | if (result.count("bin_size")) {
88 | settings::bin_size = result["bin_size"].as();
89 | }
90 | if (result.count("bin_size")) {
91 | settings::bin_size = result["bin_size"].as();
92 | }
93 | if (result.count("reduce_noise_in_window")) {
94 | settings::apply_topX_in_window_denoising = true;
95 | settings::peaks_per_window = result["peaks_per_window"].as();
96 | settings::window_size = result["window_size"].as();
97 | }
98 |
99 | /*if (result.count("neighbors")) {
100 | settings::neighbors = result["neighbors"].as();
101 | settings::neighbors_intensity_factor = result["neighbors_intensity_factor"].as();
102 | }
103 | if (result.count("batch_size")) {
104 | settings::batch_size = result["batch_size"].as