├── .gitmodules ├── CMakeLists.txt ├── CMakeModules └── AppendCompilerFlags.cmake ├── LICENSE ├── README.md ├── build └── .gitignore ├── collections ├── README.md ├── cluewebB │ └── index │ │ ├── space_usage_IDX_D.html │ │ ├── space_usage_IDX_D1R1.html │ │ └── space_usage_IDX_DR.html ├── gov2 │ └── index │ │ ├── space_usage_IDX_D.html │ │ ├── space_usage_IDX_D1R1.html │ │ └── space_usage_IDX_DR.html ├── speeches │ ├── .gitignore │ ├── dict.txt │ ├── doc_names.txt │ └── text_int_SURF.sdsl └── wikishort │ ├── .gitignore │ ├── dict.txt │ ├── doc_names.txt │ ├── text_int_SURF.sdsl │ └── wikishort-src.tar.gz ├── config ├── IDX-D-BM25.config ├── IDX-D-LMDS.config ├── IDX-D-SANSLEN.config ├── IDX-D-TFIDF.config ├── IDX-D.config ├── IDX-D1R1-BM25.config ├── IDX-D1R1-LMDS.config ├── IDX-D1R1-TFIDF.config ├── IDX-D1R1.config ├── IDX-D1R1MTF.config ├── IDX-DR-BM25.config ├── IDX-DR-LMDS.config ├── IDX-DR-SANSLEN.config ├── IDX-DR-TFIDF.config ├── IDX-DR.config ├── INVIDX-E-BM25.config ├── INVIDX-E-LMDS.config ├── INVIDX-E-TFIDF.config ├── INVIDX-E.config └── INVIDX-W.config ├── experiments ├── check_equivalence.sh ├── doclen-clueweb.csv ├── doclen-gov2.csv ├── eval.R ├── eval_3.R ├── mem_info.csv ├── mem_used.sh ├── nodes_evaluated.csv ├── nodes_evaluated.sh ├── nodes_evaluated_2005.csv ├── nodes_evaluated_2006.csv ├── nodes_evaluated_and_2005.csv ├── nodes_evaluated_and_2006.csv ├── phrase_time_2005.csv ├── phrase_time_2006.csv ├── phrase_time_and_2005.csv ├── phrase_time_and_2006.csv ├── phrases_time.sh ├── rank_times.sh ├── ranker_times_2005.csv ├── ranker_times_2006.csv ├── run.sh ├── sbatch_mem_used.sh ├── sbatch_nodes_evaluated.sh ├── time_per_wtnode.R ├── trec-2005-and-profile-IDX_SAWIT2.csv ├── trec-2005-and-time-IDX_SAWIT2.csv ├── trec-2005-or-profile-IDX_SAWIT2.csv ├── trec-2005-or-time-IDX_SAWIT2.csv ├── trec-2005-time-ex-and-wt.csv ├── trec-2005-time-ex-or-wt.csv ├── trec-2005.csv ├── trec-2005.dr.csv ├── trec-2006-and-profile-IDX_SAWIT2.csv ├── 
trec-2006-and-time-IDX_SAWIT2.csv ├── trec-2006-or-profile-IDX_SAWIT2.csv ├── trec-2006-or-time-IDX_SAWIT2.csv ├── trec-2006-time-ex-and-wt.csv ├── trec-2006-time-ex-or-wt.csv ├── trec-2006.csv ├── trec-2006.dr.csv └── wikishort.qry ├── extras ├── clueweb-collection.indricfg ├── gov2-collection.indricfg ├── speeches-collection.indricfg ├── trec8-collection.indricfg ├── wikishort-collection.indricfg └── wt10g-collection.indricfg ├── include └── surf │ ├── .gitignore │ ├── block_postings_list.hpp │ ├── comm.hpp │ ├── config.hpp │ ├── construct_DUP2.hpp │ ├── construct_U.hpp │ ├── construct_col_len.hpp │ ├── construct_darray.hpp │ ├── construct_doc_border.hpp │ ├── construct_doc_cnt.hpp │ ├── construct_doc_lengths.hpp │ ├── construct_doc_perm.hpp │ ├── construct_invidx.hpp │ ├── df_sada.hpp │ ├── doc_perm.hpp │ ├── idx_d.hpp │ ├── idx_d1r1.hpp │ ├── idx_d1r1mtf.hpp │ ├── idx_dr.hpp │ ├── idx_invfile.hpp │ ├── indexes.hpp │ ├── phrase_parser.hpp │ ├── query.hpp │ ├── query_parser.hpp │ ├── rank_functions.hpp │ └── util.hpp ├── queries ├── trec0406-adhoc.qry ├── trec2005-efficiency-10.qry ├── trec2005-efficiency-100.qry ├── trec2005-efficiency-1000.qry ├── trec2005-efficiency.qry ├── trec2006-efficiency-10.qry ├── trec2006-efficiency-100.qry ├── trec2006-efficiency-1000.qry └── trec2006-efficiency.qry ├── results ├── trec8_wtdup_stat.R ├── trec8_wtdup_stat.pdf └── trec8_wtdup_stat.txt ├── src ├── .gitignore ├── doc_lengths.cpp ├── surf_daemon.cpp ├── surf_index.cpp ├── surf_query.cpp ├── surf_search.cpp ├── surf_trec.cpp ├── test.cpp └── test_postings_list.cpp ├── tools ├── Makefile ├── convert_results_to_trec.cpp ├── create_surf_collection.cpp ├── extract_document.cpp ├── extract_documents.cpp ├── indri_stem_krovetz.cpp ├── indri_to_surf.cpp ├── select_random_queries.cpp └── surf_collection_info.cpp └── update-sdsl.sh /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/fastpfor"] 2 | path = 
external/fastpfor 3 | url = https://github.com/lemire/FastPFor.git 4 | [submodule "external/sdsl-lite"] 5 | path = external/sdsl-lite 6 | url = https://github.com/simongog/sdsl-lite.git 7 | [submodule "external/zeromq"] 8 | path = external/zeromq 9 | url = https://github.com/zeromq/libzmq.git 10 | [submodule "external/cppzmq"] 11 | path = external/cppzmq 12 | url = https://github.com/zeromq/cppzmq.git 13 | -------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
# Top-level build description for SURF.
#
# Builds the bundled third-party code (ZeroMQ, sdsl-lite, FastPFor) and then,
# for every index configuration found in config/*.config, one set of
# surf_index / surf_search / surf_daemon executables that is specialised
# through the compile definitions read from that configuration file.
cmake_minimum_required(VERSION 2.8)
cmake_policy(SET CMP0015 NEW)
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
include(AppendCompilerFlags)
include(ExternalProject)

project(SURF CXX C)

# Header search paths shared by every target below: our own headers, the
# headers of the git submodules, and the headers sdsl-lite places in the
# build tree.
include_directories(${CMAKE_HOME_DIRECTORY}/include
                    ${CMAKE_HOME_DIRECTORY}/external/fastpfor/headers/
                    ${CMAKE_HOME_DIRECTORY}/external/cppzmq/
                    ${CMAKE_HOME_DIRECTORY}/external/zeromq/include/
                    ${CMAKE_BINARY_DIR}/external/sdsl-lite/include
                    ${CMAKE_BINARY_DIR}/external/sdsl-lite/external/libdivsufsort-2.0.1/include
                    )

link_directories(${CMAKE_BINARY_DIR}/external/sdsl-lite/lib
                 ${CMAKE_BINARY_DIR}/external/zeromq/lib
                 )

# Each call appends its flag string to CMAKE_CXX_FLAGS only when the active
# compiler accepts it. The second argument is not a compiler test; the macro
# only uses it to build the name of the cached check result.
append_cxx_compiler_flags("-msse4.2 -std=c++11 -Wall -DNDEBUG" "GCC" CMAKE_CXX_FLAGS)
append_cxx_compiler_flags("-O3 -ffast-math -funroll-loops" "GCC" CMAKE_CXX_FLAGS)
append_cxx_compiler_flags("-msse4.2 -std=c++11 -g -funroll-loops -DNDEBUG -stdlib=libc++" "CLANG" CMAKE_CXX_FLAGS)

# Do not build ZeroMQ's own tests. The switch must exist as a cache entry
# *before* the subdirectory is processed. The previous code set a directory
# property named ZMQ_BUILD_TESTS after add_subdirectory(), which nothing reads.
# No FORCE: a value chosen by the user on the command line still wins.
# NOTE(review): assumes the pinned libzmq revision exposes ZMQ_BUILD_TESTS as
# an option() - confirm against external/zeromq/CMakeLists.txt.
set(ZMQ_BUILD_TESTS OFF CACHE BOOL "Build the ZeroMQ tests")
add_subdirectory(external/zeromq)

add_subdirectory(external/sdsl-lite)

add_library(fastpfor_lib STATIC external/fastpfor/src/bitpacking.cpp
                                external/fastpfor/src/bitpackingaligned.cpp
                                external/fastpfor/src/bitpackingunaligned.cpp
                                external/fastpfor/src/simdunalignedbitpacking.cpp
                                external/fastpfor/src/simdbitpacking.cpp)

# Read the index configs. Every config/*.config file consists of KEY=VALUE
# lines. Each pair is turned into a compile definition, and the NAME entry
# provides the suffix of the executables generated for that configuration.
file(GLOB index_config_files RELATIVE ${CMAKE_HOME_DIRECTORY}/config/ "${CMAKE_HOME_DIRECTORY}/config/*.config")
foreach(f ${index_config_files})
    file(STRINGS ${CMAKE_HOME_DIRECTORY}/config/${f} config_contents)
    set(compile_defs "")
    # Forget the NAME of the previously processed config so that a file
    # without a NAME line is reported instead of silently reusing it.
    unset(NAME)
    foreach(keyvalue ${config_contents})
        # Drop leading blanks, then skip lines that turn out to be empty.
        string(REGEX REPLACE "^[ ]+" "" keyvalue "${keyvalue}")
        string(LENGTH "${keyvalue}" keyvalue_length)
        if(keyvalue_length GREATER 0)
            # The key is everything in front of the first '='.
            string(REGEX MATCH "^[^=]+" key "${keyvalue}")
            string(LENGTH "${key}" key_length)
            if(key_length LESS keyvalue_length)
                # Take the value by position so that only the leading "KEY="
                # is removed, even if the same text occurs again in the value.
                math(EXPR value_start "${key_length} + 1")
                math(EXPR value_length "${keyvalue_length} - ${value_start}")
                string(SUBSTRING "${keyvalue}" ${value_start} ${value_length} value)
            else()
                # No '=' on this line: keep the historical behaviour of using
                # the whole line as both key and value.
                set(value "${keyvalue}")
            endif()
            set(${key} "${value}")
            list(APPEND compile_defs "${key}=${value}")
        endif()
    endforeach()

    if(NOT DEFINED NAME)
        message(FATAL_ERROR "config/${f} does not define NAME; cannot name the executables for this index configuration")
    endif()

    # Index construction tool for this configuration.
    add_executable(surf_index-${NAME} src/surf_index.cpp)
    target_link_libraries(surf_index-${NAME} sdsl divsufsort divsufsort64 pthread fastpfor_lib)
    set_property(TARGET surf_index-${NAME} PROPERTY COMPILE_DEFINITIONS IDXNAME="${NAME}" ${compile_defs})

    # Stand-alone search tool for this configuration.
    add_executable(surf_search-${NAME} src/surf_search.cpp)
    target_link_libraries(surf_search-${NAME} sdsl divsufsort divsufsort64 pthread fastpfor_lib)
    set_property(TARGET surf_search-${NAME} PROPERTY COMPILE_DEFINITIONS IDXNAME="${NAME}" ${compile_defs})

    # Search daemon for this configuration; additionally needs ZeroMQ.
    add_executable(surf_daemon-${NAME} src/surf_daemon.cpp)
    target_link_libraries(surf_daemon-${NAME} sdsl divsufsort divsufsort64 pthread fastpfor_lib libzmq)
    set_property(TARGET surf_daemon-${NAME} PROPERTY COMPILE_DEFINITIONS IDXNAME="${NAME}" ${compile_defs})

endforeach()

# Executables that do not depend on an index configuration.
add_executable(doc_lengths src/doc_lengths.cpp)
target_link_libraries(doc_lengths sdsl)

add_executable(surf_query src/surf_query.cpp)
target_link_libraries(surf_query libzmq sdsl)

# NOTE(review): "test" is a target name CMake may reserve (policy CMP0037).
# Renaming it would change the name of the produced binary, so it is kept.
add_executable(test src/test.cpp)
target_link_libraries(test sdsl divsufsort divsufsort64 pthread)

add_executable(select_random_queries tools/select_random_queries.cpp)
target_link_libraries(select_random_queries sdsl divsufsort divsufsort64 pthread)

add_executable(test_postings_list src/test_postings_list.cpp)
target_link_libraries(test_postings_list sdsl divsufsort divsufsort64 pthread fastpfor_lib)

add_executable(create_surf_collection tools/create_surf_collection.cpp)
target_link_libraries(create_surf_collection sdsl divsufsort divsufsort64 pthread fastpfor_lib)

add_executable(convert_results_to_trec tools/convert_results_to_trec.cpp)
target_link_libraries(convert_results_to_trec sdsl divsufsort divsufsort64 pthread fastpfor_lib)

add_executable(extract_documents tools/extract_documents.cpp)
target_link_libraries(extract_documents sdsl divsufsort divsufsort64 pthread fastpfor_lib)

add_executable(extract_document tools/extract_document.cpp)
target_link_libraries(extract_document sdsl divsufsort divsufsort64 pthread fastpfor_lib)

add_executable(surf_collection_info tools/surf_collection_info.cpp)
target_link_libraries(surf_collection_info sdsl divsufsort divsufsort64 pthread fastpfor_lib)

-------------------------------------------------------------------------------- /CMakeModules/AppendCompilerFlags.cmake: --------------------------------------------------------------------------------
include(CheckCSourceCompiles)
include(CheckCXXSourceCompiles)

# append_c_compiler_flags(<flags> <name> <result-var>)
#
# For every item of <flags>, checks whether a trivial C program compiles with
# that item as CMAKE_REQUIRED_FLAGS and, if so, appends the item (preceded by a
# blank) to the variable named by <result-var>.
#
#   <flags>       flags to probe; the foreach() below iterates over the items
#                 of this argument as CMake splits them.
#                 NOTE(review): a blank-separated string passed as one quoted
#                 argument appears to be treated as a single item, i.e. it is
#                 checked and appended as a whole - confirm if per-flag
#                 checking was intended.
#   <name>        label that only goes into the name of the check result,
#                 HAVE_<NAME>_<FLAGS>; it does not select a compiler.
#   <result-var>  name of the variable that receives the accepted flags.
#
# This is a macro, so the helper variables it sets (cname, flagname, have_flag,
# flag, SAFE_CMAKE_REQUIRED_FLAGS) are set in the caller's scope.
# CMAKE_REQUIRED_FLAGS is restored to its previous value at the end.
macro(append_c_compiler_flags _flags _name _result)
  set(SAFE_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
  string(REGEX REPLACE "[-+/ ]" "_" cname "${_name}")
  string(TOUPPER "${cname}" cname)
  foreach(flag ${_flags})
    # Derive an identifier-like name from the flag text for the result variable.
    string(REGEX REPLACE "^[-+/ ]+(.*)[-+/ ]*$" "\\1" flagname "${flag}")
    string(REGEX REPLACE "[-+/ ]" "_" flagname "${flagname}")
    string(TOUPPER "${flagname}" flagname)
    set(have_flag "HAVE_${cname}_${flagname}")
    set(CMAKE_REQUIRED_FLAGS "${flag}")
    check_c_source_compiles("int main() { return 0; }" ${have_flag})
    if(${have_flag})
      set(${_result} "${${_result}} ${flag}")
    endif(${have_flag})
  endforeach(flag)
  set(CMAKE_REQUIRED_FLAGS ${SAFE_CMAKE_REQUIRED_FLAGS})
endmacro(append_c_compiler_flags)

# append_cxx_compiler_flags(<flags> <name> <result-var>)
#
# C++ counterpart of append_c_compiler_flags(): identical logic, but the probe
# program is compiled with check_cxx_source_compiles(). See the description of
# the arguments there.
macro(append_cxx_compiler_flags _flags _name _result)
  set(SAFE_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
  string(REGEX REPLACE "[-+/ ]" "_" cname "${_name}")
  string(TOUPPER "${cname}" cname)
  foreach(flag ${_flags})
    # Derive an identifier-like name from the flag text for the result variable.
    string(REGEX REPLACE "^[-+/ ]+(.*)[-+/ ]*$" "\\1" flagname "${flag}")
    string(REGEX REPLACE "[-+/ ]" "_" flagname "${flagname}")
    string(TOUPPER "${flagname}" flagname)
    set(have_flag "HAVE_${cname}_${flagname}")
    set(CMAKE_REQUIRED_FLAGS "${flag}")
    check_cxx_source_compiles("int main() { return 0; }" ${have_flag})
    if(${have_flag})
      set(${_result} "${${_result}} ${flag}")
    endif(${have_flag})
  endforeach(flag)
  set(CMAKE_REQUIRED_FLAGS ${SAFE_CMAKE_REQUIRED_FLAGS})
endmacro(append_cxx_compiler_flags)

-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | surf 2 | ==== 3 | 4 | the SUccinct Retrival Framework. 5 | 6 | ## requirements 7 | 8 | * gcc 4.7 or clang 4.3 9 | * [indri](http://www.lemurproject.org/indri/) to convert indri indexes to surf input format 10 | 11 | ## installation 12 | 13 | ``` 14 | cd surf 15 | git submodule init 16 | git submodule update 17 | cd build 18 | cmake .. 
19 | make 20 | ``` 21 | 22 | ## building an index 23 | 24 | ``` 25 | cd surf/build 26 | ./surf_index-IDX_D -c ../collections/wikishort/ 27 | ``` 28 | 29 | ## querying an index 30 | 31 | ``` 32 | cd surf/build 33 | ./surf_search -c ../collections/wikishort/ -q -k 10 34 | ``` 35 | 36 | ## creating an indri index and converting it into surf format 37 | 38 | ### create the indri index 39 | 40 | ``` 41 | cd ./indri-5.6/ 42 | ./configure 43 | make 44 | cd buildindex 45 | # change indri config to correct storage locations 46 | ./IndriBuildIndex ./surf/extras/gov2.indricfg 47 | ``` 48 | 49 | ### convert the index into surf format 50 | 51 | ``` 52 | cd surf/tools 53 | # change path of indri source code in Makefile 54 | make 55 | # ./indri_to_surf 56 | ./indri_to_surf ../collections/gov2indi ../collections/gov2/ 57 | ``` 58 | 59 | ## starting a daemon 60 | 61 | an index daemon can be started in the background and listen on a specific port for search requests 62 | 63 | ``` 64 | cd build 65 | ./surf_daemon-IDX_D -c ../collections/wikishort/ -p 12345 66 | ``` 67 | 68 | ## querying the search daemon 69 | 70 | the daemon can be queried via the network or localhost 71 | 72 | ``` 73 | cd build 74 | ./surf_query -q -h 127.0.0.1:12345 -k 10 75 | ``` 76 | 77 | ## shutting down the daemon 78 | 79 | the daemon can be terminated via the query client. 80 | 81 | ``` 82 | cd build 83 | ./surf_query -q -h 127.0.0.1:12345 -k 1 -s 84 | ``` 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /build/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /collections/README.md: -------------------------------------------------------------------------------- 1 | Each collection is located in its own subdirectory. 
2 | The subdirectory's name is the identifier of the collection. 3 | Each collection consists of three files: 4 | 5 | * dict.txt: Contains all words of the collection, 6 | one word per line. Line format: word id. 7 | * doc_names.txt: Contains the title of each document. 8 | * text_int_SURF.sdsl: A concatenation of the documents. 9 | -------------------------------------------------------------------------------- /collections/gov2/index/space_usage_IDX_D.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | sdsl data structure visualization 5 | 6 | 14 | 15 | 16 | 17 |
18 | 478 | 479 | 480 | -------------------------------------------------------------------------------- /collections/speeches/.gitignore: -------------------------------------------------------------------------------- 1 | index 2 | -------------------------------------------------------------------------------- /collections/speeches/dict.txt: -------------------------------------------------------------------------------- 1 | 35 3 2 | a 4 3 | able 5 4 | about 6 5 | abundance 7 6 | accept 8 7 | again 9 8 | ago 10 9 | alabama 11 10 | all 12 11 | am 13 12 | america 14 13 | an 15 14 | and 16 15 | any 17 16 | appomattox 18 17 | are 19 18 | as 20 19 | ask 21 20 | assault 22 21 | at 23 22 | atlantic 24 23 | back 25 24 | bare 26 25 | basic 27 26 | be 28 27 | beautiful 29 28 | because 30 29 | been 31 30 | believe 32 31 | beloved 33 32 | berlin 34 33 | berliner 35 34 | best 36 35 | between 37 36 | bin 38 37 | black 39 38 | boast 40 39 | both 41 40 | boys 42 41 | brother 43 42 | brotherhood 44 43 | brutal 45 44 | but 46 45 | by 47 46 | carlyle’s 48 47 | cause 49 48 | century 50 49 | challenge 51 50 | chancellor 52 51 | children 53 52 | choose 54 53 | city 55 54 | civi 56 55 | clay 57 56 | climb 58 57 | colors 59 58 | come 60 59 | committed 61 60 | common 62 61 | company 63 62 | concord 64 63 | conflict 65 64 | conquest 66 65 | conviction 67 66 | convocation 68 67 | cooperation 69 68 | country 70 69 | cries 71 70 | crisis 72 71 | crooked 73 72 | day 74 73 | debate 75 74 | decade 76 75 | democracy 77 76 | democratic 78 77 | denial 79 78 | depression 80 79 | deserve 81 80 | despair 82 81 | destiny 83 82 | die 84 83 | difference 85 84 | dignity 86 85 | discord 87 86 | distinguished 88 87 | do 89 88 | down 90 89 | dream 91 90 | dripping 92 91 | during 93 92 | earth 94 93 | easy 95 94 | ein 96 95 | energy 97 96 | engaged 98 97 | envisage 99 98 | equal 100 99 | establish 101 100 | ever 102 101 | every 103 102 | everybody 104 103 | exalted 105 104 | fail 106 105 | faith 107 106 | 
fate 108 107 | father’s 109 108 | federal 110 109 | fellow 111 110 | fight 112 111 | flesh 113 112 | fly 114 113 | for 115 114 | forget 116 115 | fought 117 116 | france 118 117 | free 119 118 | freedom 120 119 | from 121 120 | general 122 121 | genuine 123 122 | germany 124 123 | girl 125 124 | glory 126 125 | go 127 126 | goal 128 127 | god 129 128 | god’s 130 129 | good 131 130 | government 132 131 | governor 133 132 | great 134 133 | greatest 135 134 | growth 136 135 | guest 137 136 | hand 138 137 | happen 139 138 | happening 140 139 | hard 141 140 | has 142 141 | have 143 142 | hazard 144 143 | heart 145 144 | here 146 145 | hew 147 146 | highest 148 147 | hill 149 148 | hindu 150 149 | his 151 150 | history 152 151 | hope 153 152 | hostile 154 153 | hymn 155 154 | i 156 155 | ich 157 156 | ideal 158 157 | if 159 158 | in 160 159 | inasmuch 161 160 | independence 162 161 | indian 163 162 | intend 164 163 | interposition 165 164 | into 166 165 | invite 167 166 | is 168 167 | issue 169 168 | it 170 169 | its 171 170 | itself 172 171 | jail 173 172 | jangle 174 173 | jawaharlal 175 174 | join 176 175 | justice 177 176 | kill 178 177 | knowing 179 178 | land 180 179 | last 181 180 | lay 182 181 | let 183 182 | lexington 184 183 | liberty 185 184 | lip 186 185 | little 187 186 | live 188 187 | lives 189 188 | long 190 189 | lord 191 190 | low 192 191 | made 193 192 | majesty 194 193 | man 195 194 | mankind 196 195 | many 197 196 | marked 198 197 | master 199 198 | may 200 199 | mayor 201 200 | me 202 201 | meaning 203 202 | measure 204 203 | meet 205 204 | member 206 205 | men 207 206 | met 208 207 | million 209 208 | mission 210 209 | moments 211 210 | moon 212 211 | more 213 212 | most 214 213 | mountain 215 214 | mountainside 216 215 | muslim 217 216 | my 218 217 | nation 219 218 | national 220 219 | need 221 220 | never 222 221 | new 223 222 | no 224 223 | non 225 224 | not 226 225 | nullify 227 226 | of 228 227 | oldest 229 228 | on 230 229 | once 231 230 | 
one 232 231 | only 233 232 | opportunity 234 233 | oppress 235 234 | or 236 235 | organize 237 236 | other 238 237 | our 239 238 | ours 240 239 | out 241 240 | outer 242 241 | own 243 242 | pain 244 243 | pandit 245 244 | party 246 245 | peace 247 246 | peaceful 248 247 | people 249 248 | pilgrim’s 250 249 | place 251 250 | plain 252 251 | play 253 252 | point 254 253 | postpone 255 254 | pray 256 255 | prejudice 257 256 | pride 258 257 | prison 259 258 | progress 260 259 | prosperity 261 260 | protest 262 261 | proud 263 262 | proudest 264 263 | purpose 265 264 | racist 266 265 | rarely 267 266 | rather 268 267 | read 269 268 | realize 270 269 | religion 271 270 | republic 272 271 | resolution 273 272 | reveal 274 273 | revolution 275 274 | rice 276 275 | right 277 276 | rights 278 277 | ring 279 278 | romanu 280 279 | rough 281 280 | russia 282 281 | satisfaction 283 282 | say 284 283 | search 285 284 | secret 286 285 | section 287 286 | security 288 287 | see 289 288 | self 290 289 | selma 291 290 | serve 292 291 | shall 293 292 | shape 294 293 | sing 295 294 | single 296 295 | sister 297 296 | skill 298 297 | so 299 298 | some 300 299 | something 301 300 | south 302 301 | space 303 302 | speak 304 303 | spirit 305 304 | stand 306 305 | stone 307 306 | straight 308 307 | strife 309 308 | struggle 310 309 | such 311 310 | suffering 312 311 | sum 313 312 | summon 314 313 | sweet 315 314 | symbolize 316 315 | symphony 317 316 | texas 318 317 | than 319 318 | that 320 319 | the 321 320 | thee 322 321 | their 323 322 | there 324 323 | these 325 324 | they 326 325 | things 327 326 | think 328 327 | this 329 328 | thousand 330 329 | throughout 331 330 | time 332 331 | times 333 332 | to 334 333 | today 335 334 | together 336 335 | told 337 336 | tonight 338 337 | too 339 338 | transform 340 339 | turning 341 340 | two 342 341 | unend 343 342 | unwill 344 343 | up 345 344 | urge 346 345 | us 347 346 | valley 348 347 | value 349 348 | vicious 350 349 | violence 351 350 | 
visit 352 351 | war 353 352 | was 354 353 | we 355 354 | weapon 356 355 | week 357 356 | welfare 358 357 | well 359 358 | were 360 359 | west 361 360 | what 362 361 | when 363 362 | where 364 363 | which 365 364 | while 366 365 | white 367 366 | who 368 367 | why 369 368 | will 370 369 | willing 371 370 | win 372 371 | with 373 372 | women 374 373 | words 375 374 | work 376 375 | world 377 376 | wrong 378 377 | years 379 378 | yet 380 379 | you 381 380 | your 382 381 | yourself 383 382 | – 384 383 | ’tis 385 384 | “my 386 385 | -------------------------------------------------------------------------------- /collections/speeches/doc_names.txt: -------------------------------------------------------------------------------- 1 | /devhome6/mpetri/collections/speeches/lyndon_b_johnson-we_shall_overcome.txt 2 | /devhome6/mpetri/collections/speeches/john_f_kennedy_-the_decision_to_go_to_the_moon.txt 3 | /devhome6/mpetri/collections/speeches/mahatma_gandhi-quit_india.txt 4 | /devhome6/mpetri/collections/speeches/martin_luther_king_jr-i_have_a_dream.txt 5 | /devhome6/mpetri/collections/speeches/john_f_kennedy_-ich_bin_ein_berliner.txt 6 | -------------------------------------------------------------------------------- /collections/speeches/text_int_SURF.sdsl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/collections/speeches/text_int_SURF.sdsl -------------------------------------------------------------------------------- /collections/wikishort/.gitignore: -------------------------------------------------------------------------------- 1 | index 2 | -------------------------------------------------------------------------------- /collections/wikishort/text_int_SURF.sdsl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/collections/wikishort/text_int_SURF.sdsl -------------------------------------------------------------------------------- /collections/wikishort/wikishort-src.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/collections/wikishort/wikishort-src.tar.gz -------------------------------------------------------------------------------- /config/IDX-D-BM25.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_D_BM25 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 4 | DF_TYPE=surf::df_sada> 5 | RANK_TYPE=surf::rank_bm25<> 6 | INDEX_TYPE=surf::idx_d 7 | PHRASE_SUPPORT=1 8 | -------------------------------------------------------------------------------- /config/IDX-D-LMDS.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_D_LMDS 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 4 | DF_TYPE=surf::df_sada> 5 | RANK_TYPE=surf::rank_lmds<> 6 | INDEX_TYPE=surf::idx_d 7 | PHRASE_SUPPORT=1 8 | -------------------------------------------------------------------------------- /config/IDX-D-SANSLEN.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_D_SANSLEN 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 4 | DF_TYPE=surf::df_sada> 5 | RANK_TYPE=surf::rank_bm25_simple_est<120,75> 6 | INDEX_TYPE=surf::idx_d 7 | PHRASE_SUPPORT=1 8 | -------------------------------------------------------------------------------- /config/IDX-D-TFIDF.config: 
-------------------------------------------------------------------------------- 1 | NAME=IDX_D_TFIDF 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 4 | DF_TYPE=surf::df_sada> 5 | RANK_TYPE=surf::rank_tfidf 6 | INDEX_TYPE=surf::idx_d 7 | PHRASE_SUPPORT=1 8 | -------------------------------------------------------------------------------- /config/IDX-D.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_D 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 4 | DF_TYPE=surf::df_sada> 5 | RANK_TYPE=surf::rank_bm25<> 6 | INDEX_TYPE=surf::idx_d 7 | PHRASE_SUPPORT=1 8 | -------------------------------------------------------------------------------- /config/IDX-D1R1-BM25.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_D1R1_BM25 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | DF_TYPE=surf::df_sada> 4 | WTP_TYPE=sdsl::wt_int> 5 | WTU_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 6 | RANK_TYPE=surf::rank_bm25<> 7 | INDEX_TYPE=surf::idx_d1r1 8 | -------------------------------------------------------------------------------- /config/IDX-D1R1-LMDS.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_D1R1_LMDS 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | DF_TYPE=surf::df_sada> 4 | WTP_TYPE=sdsl::wt_int> 5 | WTU_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 6 | RANK_TYPE=surf::rank_lmds<> 7 | INDEX_TYPE=surf::idx_d1r1 8 | -------------------------------------------------------------------------------- /config/IDX-D1R1-TFIDF.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_D1R1_TFIDF 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | 
DF_TYPE=surf::df_sada> 4 | WTP_TYPE=sdsl::wt_int> 5 | WTU_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 6 | RANK_TYPE=surf::rank_tfidf 7 | INDEX_TYPE=surf::idx_d1r1 8 | -------------------------------------------------------------------------------- /config/IDX-D1R1.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_D1R1 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | DF_TYPE=surf::df_sada> 4 | WTP_TYPE=sdsl::wt_int> 5 | WTU_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 6 | RANK_TYPE=surf::rank_bm25<> 7 | INDEX_TYPE=surf::idx_d1r1 8 | -------------------------------------------------------------------------------- /config/IDX-D1R1MTF.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_D1R1MTF 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | DF_TYPE=surf::df_sada> 4 | WTP_TYPE=sdsl::wt_int> 5 | WTU_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 6 | INDEX_TYPE=surf::idx_d1r1mtf 7 | -------------------------------------------------------------------------------- /config/IDX-DR-BM25.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_DR_BM25 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | DF_TYPE=surf::df_sada> 4 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 5 | WTR_TYPE=sdsl::wt_int> 6 | RANK_TYPE=surf::rank_bm25<> 7 | INDEX_TYPE=surf::idx_dr 8 | PHRASE_SUPPORT=1 9 | -------------------------------------------------------------------------------- /config/IDX-DR-LMDS.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_DR_LMDS 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | DF_TYPE=surf::df_sada> 4 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 5 | WTR_TYPE=sdsl::wt_int> 6 | RANK_TYPE=surf::rank_lmds<> 
7 | INDEX_TYPE=surf::idx_dr 8 | PHRASE_SUPPORT=1 9 | -------------------------------------------------------------------------------- /config/IDX-DR-SANSLEN.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_DR_SANSLEN 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | DF_TYPE=surf::df_sada> 4 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 5 | WTR_TYPE=sdsl::wt_int> 6 | RANK_TYPE=surf::rank_bm25_simple_est<120,75> 7 | INDEX_TYPE=surf::idx_dr 8 | PHRASE_SUPPORT=1 9 | -------------------------------------------------------------------------------- /config/IDX-DR-TFIDF.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_DR_TFIDF 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | DF_TYPE=surf::df_sada> 4 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 5 | WTR_TYPE=sdsl::wt_int> 6 | RANK_TYPE=surf::rank_tfidf 7 | INDEX_TYPE=surf::idx_dr 8 | PHRASE_SUPPORT=1 9 | -------------------------------------------------------------------------------- /config/IDX-DR.config: -------------------------------------------------------------------------------- 1 | NAME=IDX_DR 2 | CSA_TYPE=sdsl::csa_wt>,1000000,1000000> 3 | DF_TYPE=surf::df_sada> 4 | WTD_TYPE=sdsl::wt_int, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>> 5 | WTR_TYPE=sdsl::wt_int> 6 | RANK_TYPE=surf::rank_bm25<> 7 | INDEX_TYPE=surf::idx_dr 8 | PHRASE_SUPPORT=1 9 | -------------------------------------------------------------------------------- /config/INVIDX-E-BM25.config: -------------------------------------------------------------------------------- 1 | NAME=INVIDX_E_BM25 2 | PLIST_TYPE=surf::block_postings_list<128> 3 | RANK_TYPE=surf::rank_bm25<> 4 | INDEX_TYPE=surf::idx_invfile 5 | -------------------------------------------------------------------------------- /config/INVIDX-E-LMDS.config: 
-------------------------------------------------------------------------------- 1 | NAME=INVIDX_E_LMDS 2 | PLIST_TYPE=surf::block_postings_list<128> 3 | RANK_TYPE=surf::rank_lmds<> 4 | INDEX_TYPE=surf::idx_invfile 5 | -------------------------------------------------------------------------------- /config/INVIDX-E-TFIDF.config: -------------------------------------------------------------------------------- 1 | NAME=INVIDX_E_TFIDF 2 | PLIST_TYPE=surf::block_postings_list<128> 3 | RANK_TYPE=surf::rank_tfidf 4 | INDEX_TYPE=surf::idx_invfile 5 | -------------------------------------------------------------------------------- /config/INVIDX-E.config: -------------------------------------------------------------------------------- 1 | NAME=INVIDX_E 2 | PLIST_TYPE=surf::block_postings_list<128> 3 | RANK_TYPE=surf::rank_bm25<> 4 | INDEX_TYPE=surf::idx_invfile 5 | -------------------------------------------------------------------------------- /config/INVIDX-W.config: -------------------------------------------------------------------------------- 1 | NAME=INVIDX_W 2 | PLIST_TYPE=surf::block_postings_list<128> 3 | RANK_TYPE=surf::rank_bm25<> 4 | INDEX_TYPE=surf::idx_invfile 5 | -------------------------------------------------------------------------------- /experiments/check_equivalence.sh: --------------------------------------------------------------------------------
#!/bin/bash
#
# Checks that all index types return the same results for the same ranker.
#
# For every collection and every ranker, the matching search daemon of each
# index type is started, the query file is run once in the default mode and
# once with -a, the daemon is shut down again, and finally the result files of
# all index types are compared with each other. Result files are written to
# the current directory as OUT_OR_<index>_<ranker> / OUT_AND_<index>_<ranker>.

COLLECTION="../collections/wikishort/"

RANKERS="BM25 LMDS TFIDF"

INDEXES="IDX_D IDX_DR IDX_D1R1 INVIDX_E"

SURF_PATH="../"
QRYBIN=$SURF_PATH/build/surf_query
QRYFILE="wikishort.qry"

# all_outputs_equal FILE...
#
# Returns 0 if every given file is identical to the first one, 1 otherwise.
# diff only accepts two operands, so the files are compared pairwise against
# the first file. Any non-zero diff status counts as "not equal": 1 means the
# contents differ, 2 means trouble such as a missing result file.
all_outputs_equal() {
    local reference="$1"
    shift
    local candidate
    for candidate in "$@"
    do
        if ! diff "$reference" "$candidate" > /dev/null 2>&1
        then
            return 1
        fi
    done
    return 0
}

for col in $COLLECTION
do
    for rank in $RANKERS
    do
        OUTPUT_FILES_OR=""
        OUTPUT_FILES_AND=""
        for idx in $INDEXES
        do
            DAEMONBIN="$SURF_PATH/build/surf_daemon-${idx}_$rank"
            # NOTE(review): the daemon is started in the background and queried
            # right away; this presumably relies on the query client waiting
            # until the daemon accepts requests - confirm.
            "$DAEMONBIN" -c "$col" > /dev/null 2>&1 &
            # Default (OR) run, then the -a (AND) run; only stdout is kept.
            "$QRYBIN" -q "$QRYFILE" -k 10 -r 1 -R > "OUT_OR_${idx}_$rank" 2>/dev/null
            "$QRYBIN" -q "$QRYFILE" -k 10 -r 1 -R -a > "OUT_AND_${idx}_$rank" 2>/dev/null
            OUTPUT_FILES_OR="$OUTPUT_FILES_OR OUT_OR_${idx}_$rank"
            OUTPUT_FILES_AND="$OUTPUT_FILES_AND OUT_AND_${idx}_$rank"
            # -s asks the daemon to terminate (see "shutting down the daemon"
            # in the README).
            "$QRYBIN" -q "$QRYFILE" -s > /dev/null 2>&1
        done

        # Compare the results of all index types. The file lists are left
        # unquoted on purpose so that they are split into separate arguments.
        if all_outputs_equal $OUTPUT_FILES_OR
        then
            echo "all good for OR and $rank"
        else
            echo "output for OR NOT EQUAL for $rank $col !!!"
        fi
        if all_outputs_equal $OUTPUT_FILES_AND
        then
            echo "all good for AND and $rank"
        else
            echo "output for AND NOT EQUAL for $rank $col !!!"
        fi
    done
done


# cleanup
#rm -f OUT_*

-------------------------------------------------------------------------------- /experiments/eval.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | d <- read.csv(file="trec-2005-and-profile-IDX_SAWIT.csv",sep=";") 4 | d <- cbind(d,qry_and="RANKED-AND") 5 | f <- read.csv(file="trec-2005-profile-IDX_SAWIT.csv",sep=";") 6 | f <- cbind(f,qry_and="RANKED-OR") 7 | 8 | i <- read.csv(file="trec-2006-and-profile-IDX_SAWIT.csv",sep=";") 9 | i <- cbind(i,qry_and="RANKED-AND") 10 | j <- read.csv(file="trec-2006-profile-IDX_SAWIT.csv",sep=";") 11 | j <- cbind(j,qry_and="RANKED-OR") 12 | 13 | d2 <- read.csv(file="trec-2005-and-profile-IDX_SAWIT2.csv",sep=";") 14 | d2 <- cbind(d2,qry_and="RANKED-AND") 15 | f2 <- read.csv(file="trec-2005-profile-IDX_SAWIT2.csv",sep=";") 16 | f2 <- cbind(f2,qry_and="RANKED-OR") 17 | 18 | i2 <- read.csv(file="trec-2006-and-profile-IDX_SAWIT2.csv",sep=";") 19 | i2 <- cbind(i2,qry_and="RANKED-AND") 20 | j2 <- read.csv(file="trec-2006-profile-IDX_SAWIT2.csv",sep=";") 21 | j2 <- cbind(j2,qry_and="RANKED-OR") 22 | 23 | g <- rbind(d,f) 24 | g <- cbind(g,qryfile="trec2005") 25 | h <- rbind(i,j) 26 | h <- cbind(h,qryfile="trec2006") 27 | l <- rbind(g,h) 28 | 29 | g2 <- rbind(d2,f2) 30 | g2 <- 
cbind(g2,qryfile="trec2005") 31 | h2 <- rbind(i2,j2) 32 | h2 <- cbind(h2,qryfile="trec2006") 33 | l2 <- rbind(g2,h2) 34 | 35 | q2 <- rbind(l,l2) 36 | 37 | 38 | p <- ggplot(q2,aes(factor(k),qry_time/1000,fill=index)) 39 | p <- p + geom_boxplot() 40 | p <- p + facet_grid(qryfile ~ qry_and) 41 | p <- p + scale_y_log10(limits=c(0.1, 10000),breaks=c(1,10,100,1000,10000)) 42 | p <- p + annotation_logticks(sides = "lr") 43 | print(p) -------------------------------------------------------------------------------- /experiments/eval_3.R: -------------------------------------------------------------------------------- 1 | eval_3.R -------------------------------------------------------------------------------- /experiments/mem_info.csv: -------------------------------------------------------------------------------- 1 | IDX_D; gov2; 22332391964; 77923692196;2485266971; 0;157532402 2 | IDX_DR; gov2; 22332391964; 77923692196;2485266971;32641953354;157532402 3 | IDX_D1R1;gov2; 22332391964; 10218453986;2485266971;16665796154;157532402 4 | IDX_D; cluewebB; 49302242460;140127439444;3305861323; 0;326432770 5 | IDX_DR; cluewebB; 49302242460;140127439444;3305861323;53089093770;326432770 6 | IDX_D1R1;cluewebB; 49302242460; 30655284746;3305861323;31069499370;326432770 7 | -------------------------------------------------------------------------------- /experiments/mem_used.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUR_DIR=`pwd` 3 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in 4 | cd "${MY_DIR}" 5 | MY_DIR=`pwd` 6 | SURF_PATH=/scratch/VR0052/ESA2014/surf 7 | 8 | COLLECTIONS="$SURF_PATH/collections/gov2 $SURF_PATH/collections/cluewebB" 9 | EXP_DIR="$SURF_PATH/experiments" 10 | PORT=12345 11 | 12 | INDEXES="IDX_D IDX_DR IDX_D1R1" 13 | 14 | for col in $COLLECTIONS 15 | do 16 | echo $col 17 | for idx in $INDEXES 18 | do 19 | echo $idx 20 | $SURF_PATH/build_turpin/surf_index-$idx 
-c $col 21 | done 22 | done 23 | 24 | cd "${CUR_DIR}" 25 | -------------------------------------------------------------------------------- /experiments/nodes_evaluated.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUR_DIR=`pwd` 3 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in 4 | cd "${MY_DIR}" 5 | MY_DIR=`pwd` 6 | #SURF_PATH=$MY_DIR/.. 7 | SURF_PATH=/scratch/VR0052/ESA2014/surf 8 | 9 | COLLECTIONS="$SURF_PATH/collections/gov2 $SURF_PATH/collections/cluewebB" 10 | EXP_DIR="$SURF_PATH/experiments" 11 | PORT=12345 12 | 13 | INDEXES="IDX_DR IDX_D IDX_D_SANSLEN" 14 | 15 | HEADER="qryid;collection;index;qrymode;k;qrylen;res_size;qry_time;search_time;nodes_evaluated;nodes_total;postings_evaluated;postings_total;client_time" 16 | # put the CSV header into the files that receive the results below (only while they are still empty, so earlier runs are kept) 17 | for year in 2005 2006 18 | do 19 | [ -s $EXP_DIR/nodes_evaluated_$year.csv ] || echo "$HEADER" > $EXP_DIR/nodes_evaluated_$year.csv 20 | done 21 | 22 | for col in $COLLECTIONS 23 | do 24 | for idx in $INDEXES 25 | do 26 | $SURF_PATH/build_turpin/surf_daemon-$idx -c $col -p $PORT & 27 | for k in 10 100 1000 28 | do 29 | $SURF_PATH/build_turpin/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2005-efficiency-1000.qry -k $k -r 1 -p >> $EXP_DIR/nodes_evaluated_2005.csv 30 | $SURF_PATH/build_turpin/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-1000.qry -k $k -r 1 -p >> $EXP_DIR/nodes_evaluated_2006.csv 31 | done 32 | # shut down daemon 33 | $SURF_PATH/build_turpin/surf_query -h localhost:$PORT -q $SURF_PATH/queries/wiki.q -k 1 -s > /dev/null 34 | done 35 | done 36 | 37 | cd "${CUR_DIR}" 38 | -------------------------------------------------------------------------------- /experiments/phrases_time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUR_DIR=`pwd` 3 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in 4 | cd "${MY_DIR}" 5 | MY_DIR=`pwd` 6 | SURF_PATH=$MY_DIR/.. 
7 | #SURF_PATH=/scratch/VR0052/ESA2014/surf 8 | 9 | #COLLECTIONS="$SURF_PATH/collections/gov2 $SURF_PATH/collections/cluewebB" 10 | COLLECTIONS="/devhome3/sgog/ESA2014/surf/collections/gov2" 11 | EXP_DIR="$SURF_PATH/experiments" 12 | PORT=12345 13 | 14 | INDEXES="IDX_D" 15 | 16 | # the header goes into the file the queries below append to; it is only written while that file is still empty 17 | [ -s $EXP_DIR/phrase_time_2006-2.csv ] || echo "qryid;collection;index;qrymode;k;qrylen;res_size;qry_time;search_time;nodes_evaluated;nodes_total;postings_evaluated;postings_total;client_time" > $EXP_DIR/phrase_time_2006-2.csv 18 | 19 | for col in $COLLECTIONS 20 | do 21 | for idx in $INDEXES 22 | do 23 | $SURF_PATH/build/surf_daemon-$idx -c $col -p $PORT & 24 | for k in 10 100 25 | do 26 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-1000.qry -k $k -r 1 -P 10 -p >> $EXP_DIR/phrase_time_2006-2.csv 27 | done 28 | # shut down daemon (using the client from the same build directory the daemon was started from) 29 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/wiki.q -k 1 -s > /dev/null 30 | done 31 | done 32 | 33 | cd "${CUR_DIR}" 34 | -------------------------------------------------------------------------------- /experiments/rank_times.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUR_DIR=`pwd` 3 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in 4 | cd "${MY_DIR}" 5 | MY_DIR=`pwd` 6 | #SURF_PATH=$MY_DIR/.. 
7 | SURF_PATH=/scratch/VR0052/ESA2014/surf 8 | 9 | COLLECTIONS="$SURF_PATH/collections/gov2" 10 | EXP_DIR="$SURF_PATH/experiments" 11 | PORT=12345 12 | 13 | INDEXES="IDX_DR IDX_D IDX_D1R1 INVIDX_E" 14 | RANKERS="BM25 TFIDF LMDS" 15 | 16 | echo "qryid;collection;ranker;index;qrymode;k;qrylen;res_size;qry_time;search_time;nodes_evaluated;nodes_total;postings_evaluated;postings_total;client_time" > $EXP_DIR/nodes_evaluated.csv 17 | 18 | for col in $COLLECTIONS 19 | do 20 | for idx in $INDEXES 21 | do 22 | for rank in $RANKERS 23 | do 24 | $SURF_PATH/build/surf_daemon-${idx}_$rank -c $col -p $PORT & 25 | for k in 100 26 | do 27 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2005-efficiency-1000.qry -k $k -r 1 -a -p >> $EXP_DIR/ranker_times_2005.csv 28 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-1000.qry -k $k -r 1 -a -p >> $EXP_DIR/ranker_times_2006.csv 29 | done 30 | # shut down daemon 31 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/wiki.q -k 1 -s > /dev/null 32 | done 33 | done 34 | done 35 | 36 | cd "${CUR_DIR}" 37 | -------------------------------------------------------------------------------- /experiments/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CUR_DIR=`pwd` 3 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in 4 | cd "${MY_DIR}" 5 | MY_DIR=`pwd` 6 | SURF_PATH="$MY_DIR/.." 
7 | 8 | COLLECTIONS="$SURF_PATH/collections/gov2/" 9 | PORT=12345 10 | 11 | for col in $COLLECTIONS 12 | do 13 | for idx in $SURF_PATH/build/surf_daemon-IDX_SAWIT2 14 | do 15 | IDXNAME=$(echo $idx | sed 's/.*-\(.*\)/\1/g') 16 | $idx -c $col -p $PORT & 17 | for k in 10 100 1000 18 | do 19 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2005-efficiency-100.qry -k $k -r 1 -p >> trec-2005.csv 20 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-100.qry -k $k -r 1 -p >> trec-2006.csv 21 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2005-efficiency-100.qry -k $k -r 1 -p -a >> trec-2005.csv 22 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-100.qry -k $k -r 1 -p -a >> trec-2006.csv 23 | done 24 | # shut down daemon 25 | $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/wiki.q -k 1 -s > /dev/null 26 | done 27 | done 28 | 29 | cd "${CUR_DIR}" 30 | -------------------------------------------------------------------------------- /experiments/sbatch_mem_used.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # use sbatch to launch script 4 | 5 | # NOTE: sbatch stops reading #SBATCH directives at the first executable command, so they must come before BASE_DIR is set 6 | #SBATCH -p turpin 7 | #SBATCH --job-name=ex_wtd 8 | #SBATCH --account="VR0280" 9 | #SBATCH --nodes=1 10 | #SBATCH --ntasks=1 11 | #SBATCH --time=10-12:00:00 12 | #SBATCH --mem=512GB 13 | #SBATCH --mail-user simon.gog@unimelb.edu.au 14 | #SBATCH --mail-type=BEGIN 15 | #SBATCH --mail-type=END 16 | #SBATCH --mail-type=FAIL 17 | 18 | BASE_DIR=/scratch/VR0052/ESA2014/surf/experiments 19 | 20 | module load gcc 21 | module load cmake 22 | 23 | $BASE_DIR/mem_used.sh 24 | 25 | ## --exclusive 26 | -------------------------------------------------------------------------------- /experiments/sbatch_nodes_evaluated.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # use sbatch to 
launch script 4 | 5 | # NOTE: sbatch stops reading #SBATCH directives at the first executable command, so they must come before BASE_DIR is set 6 | 7 | #SBATCH -p turpin 8 | #SBATCH --job-name=nodes_evaluated.sh 9 | #SBATCH --account="VR0280" 10 | #SBATCH --nodes=1 11 | #SBATCH --ntasks=1 12 | #SBATCH --time=10-12:00:00 13 | #SBATCH --mem=300GB 14 | #SBATCH --mail-user simon.gog@unimelb.edu.au 15 | #SBATCH --mail-type=BEGIN 16 | #SBATCH --mail-type=END 17 | #SBATCH --mail-type=FAIL 18 | 19 | BASE_DIR=/scratch/VR0052/ESA2014/surf/experiments 20 | 21 | module load gcc 22 | module load cmake 23 | 24 | $BASE_DIR/nodes_evaluated.sh 25 | 26 | -------------------------------------------------------------------------------- /experiments/time_per_wtnode.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | d <- read.csv(file="trec-2005-and-profile-IDX_SAWIT.csv",sep=";") 4 | d <- cbind(d,qry_and="RANKED-AND") 5 | f <- read.csv(file="trec-2005-profile-IDX_SAWIT.csv",sep=";") 6 | f <- cbind(f,qry_and="RANKED-OR") 7 | 8 | i <- read.csv(file="trec-2006-and-profile-IDX_SAWIT.csv",sep=";") 9 | i <- cbind(i,qry_and="RANKED-AND") 10 | j <- read.csv(file="trec-2006-profile-IDX_SAWIT.csv",sep=";") 11 | j <- cbind(j,qry_and="RANKED-OR") 12 | 13 | d2 <- read.csv(file="trec-2005-and-profile-IDX_SAWIT2.csv",sep=";") 14 | d2 <- cbind(d2,qry_and="RANKED-AND") 15 | f2 <- read.csv(file="trec-2005-profile-IDX_SAWIT2.csv",sep=";") 16 | f2 <- cbind(f2,qry_and="RANKED-OR") 17 | 18 | i2 <- read.csv(file="trec-2006-and-profile-IDX_SAWIT2.csv",sep=";") 19 | i2 <- cbind(i2,qry_and="RANKED-AND") 20 | j2 <- read.csv(file="trec-2006-profile-IDX_SAWIT2.csv",sep=";") 21 | j2 <- cbind(j2,qry_and="RANKED-OR") 22 | 23 | g <- rbind(d,f) 24 | g <- cbind(g,qryfile="trec2005") 25 | h <- rbind(i,j) 26 | h <- cbind(h,qryfile="trec2006") 27 | l <- rbind(g,h) 28 | 29 | g2 <- rbind(d2,f2) 30 | g2 <- cbind(g2,qryfile="trec2005") 31 | h2 <- rbind(i2,j2) 32 | h2 <- cbind(h2,qryfile="trec2006") 33 | l2 <- rbind(g2,h2) 34 | 35 | q2 <- rbind(l,l2) 36 | 37 | p <- 
ggplot(q2,aes(wt_searched,qry_time,colour=index)) 38 | p <- p + geom_point() 39 | p <- p + facet_grid(qryfile ~ qry_and) 40 | #p <- p + scale_y_log10(limits=c(0.1, 10000)) 41 | print(p) -------------------------------------------------------------------------------- /experiments/wikishort.qry: -------------------------------------------------------------------------------- 1 | 1;of air 2 | 2;until death 3 | 3;history the first 4 | 4;was one born 5 | -------------------------------------------------------------------------------- /extras/clueweb-collection.indricfg: -------------------------------------------------------------------------------- 1 | 2 | 100G 3 | true 4 | cluewebB 5 | 6 | /collections/clueweb/CLUEWEB09_1/ClueWeb09_English_1 7 | warc 8 | 9 | krovetz 10 | 11 | -------------------------------------------------------------------------------- /extras/gov2-collection.indricfg: -------------------------------------------------------------------------------- 1 | 2 | 100G 3 | true 4 | gov2 5 | 6 | /collections/TREC/datasets/gov2/gov2-corpus 7 | trecweb 8 | 9 | krovetz 10 | 11 | -------------------------------------------------------------------------------- /extras/speeches-collection.indricfg: -------------------------------------------------------------------------------- 1 | 2 | 100G 3 | true 4 | speeches 5 | 6 | /devhome6/mpetri/collections/speeches/ 7 | txt 8 | 9 | krovetz 10 | 11 | -------------------------------------------------------------------------------- /extras/trec8-collection.indricfg: -------------------------------------------------------------------------------- 1 | 2 | 100G 3 | true 4 | trec8 5 | 6 | /devhome6/mpetri/collections/trec8/ 7 | trectext 8 | 9 | krovetz 10 | 11 | -------------------------------------------------------------------------------- /extras/wikishort-collection.indricfg: -------------------------------------------------------------------------------- 1 | 2 | 100G 3 | true 4 | wikishort 5 | 6 | 
/devhome6/mpetri/collections/wikishort/ 7 | txt 8 | 9 | krovetz 10 | 11 | -------------------------------------------------------------------------------- /extras/wt10g-collection.indricfg: -------------------------------------------------------------------------------- 1 | 2 | 100G 3 | true 4 | wt10g 5 | 6 | /collections/TREC/datasets/wt10g/ 7 | trecweb 8 | 9 | krovetz 10 | 11 | -------------------------------------------------------------------------------- /include/surf/.gitignore: -------------------------------------------------------------------------------- 1 | !*.hpp 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /include/surf/comm.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_COMM_H 2 | #define SURF_COMM_H 3 | 4 | #define REQ_PARSE_ERROR 0 5 | #define REQ_RESPONE_OK 1 6 | 7 | #define REQ_TYPE_QRY_OR 0 8 | #define REQ_TYPE_QRY_AND 1 9 | #define REQ_TYPE_QUIT 2 10 | 11 | #define REQ_MODE_PROFILE 0 12 | #define REQ_MODE_TIME 1 13 | 14 | #define MAX_QRY_LEN 1024 15 | 16 | struct surf_time_resp { 17 | uint8_t status; 18 | uint64_t req_id; 19 | uint64_t qry_id; 20 | uint64_t qry_len; 21 | uint64_t k; 22 | uint64_t result_size; 23 | uint64_t qry_time; 24 | uint64_t search_time; 25 | uint64_t wt_search_space; 26 | uint64_t wt_nodes; 27 | uint64_t postings_evaluated; 28 | uint64_t postings_total; 29 | char index[256]; 30 | char collection[256]; 31 | char ranker[256]; 32 | }; 33 | 34 | struct surf_qry_request { 35 | uint8_t type; 36 | uint8_t mode; 37 | uint8_t phrases; 38 | double phrase_threshold; 39 | uint64_t id; 40 | uint64_t k; 41 | uint8_t output_results; 42 | uint8_t int_qry; 43 | char qry_str[MAX_QRY_LEN] = {0}; 44 | }; 45 | 46 | struct surf_results { 47 | uint64_t size; 48 | double data[0]; 49 | }; 50 | 51 | 52 | #endif -------------------------------------------------------------------------------- /include/surf/config.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef SURF_CONFIG_HPP 2 | #define SURF_CONFIG_HPP 3 | 4 | #include "sdsl/config.hpp" 5 | #include 6 | #include 7 | 8 | namespace surf{ 9 | 10 | const std::string TEXT_FILENAME = "text_int_SURF.sdsl"; 11 | const std::string DICT_FILENAME = "dict.txt"; 12 | const std::string URL2ID_FILENAME = "url2id.txt"; 13 | const std::string DOCNAMES_FILENAME = "doc_names.txt"; 14 | const std::string SPACEUSAGE_FILENAME = "space_usage"; 15 | 16 | const std::string KEY_DOCWEIGHT = "docweights"; 17 | const std::string KEY_DARRAY = "darray"; 18 | const std::string KEY_U = "U"; 19 | const std::string KEY_WTU = "wtu"; 20 | const std::string KEY_UMARK = "Umark"; 21 | const std::string KEY_URANK = "Urank"; 22 | const std::string KEY_DOCPERM = "docperm"; 23 | const std::string KEY_SADADF = "sadadf"; 24 | const std::string KEY_WTD = "wtd"; 25 | const std::string KEY_C = "C"; 26 | const std::string KEY_WTC = "wtc"; 27 | const std::string KEY_TMPCST = "tempcst"; 28 | const std::string KEY_TMPDUP = "tmpdup"; 29 | const std::string KEY_WTDUP = "wtdup"; 30 | const std::string KEY_WTDUP2 = "wtdup2"; 31 | const std::string KEY_WTR = "wtr"; 32 | const std::string KEY_WTDP = "wtdp"; 33 | const std::string KEY_DUP = "dup"; 34 | const std::string KEY_R = "R"; // =R1 in the paper 35 | const std::string KEY_DUPMARK = "DUPmark"; 36 | const std::string KEY_DUPRANK = "DUPrank"; 37 | const std::string KEY_DUP2 = "dup2"; // =R in the paper 38 | const std::string KEY_DOCCNT = "doccnt"; 39 | const std::string KEY_COLLEN = "collen"; 40 | const std::string KEY_DOCBORDER = "docborder"; 41 | const std::string KEY_DOC_LENGTHS = "doclengths"; 42 | const std::string KEY_INVFILE_TERM_RANGES = "invfile_term_ranges"; 43 | const std::string KEY_INVFILE_PLISTS = "invfile_postings_lists"; 44 | const std::string KEY_INVFILE_DOCPERM = "invfile_docperm"; 45 | const std::string KEY_INVFILE_IDOCPERM = "invfile_inv_docperm"; 46 | const 
std::string KEY_F_T = "Ft"; 47 | const std::string KEY_H = "H"; 48 | const std::string KEY_CSA = "csa"; 49 | const std::string KEY_MAXTF = "maxtf"; 50 | 51 | std::vector storage_keys = {KEY_DOCCNT, 52 | KEY_DARRAY, 53 | KEY_DOCPERM, 54 | KEY_SADADF, 55 | KEY_WTD, 56 | KEY_C, 57 | KEY_WTC, 58 | KEY_TMPCST, 59 | KEY_TMPDUP, 60 | KEY_DUP, 61 | KEY_DUP2, 62 | KEY_R, 63 | KEY_WTDUP, 64 | KEY_WTDUP2, 65 | KEY_WTR, 66 | KEY_MAXTF, 67 | KEY_DOCCNT, 68 | KEY_DOC_LENGTHS, 69 | KEY_COLLEN, 70 | KEY_INVFILE_TERM_RANGES, 71 | KEY_INVFILE_PLISTS, 72 | KEY_H, 73 | KEY_U, 74 | KEY_WTU, 75 | KEY_UMARK, 76 | KEY_URANK, 77 | KEY_CSA, 78 | sdsl::conf::KEY_TEXT, 79 | sdsl::conf::KEY_TEXT_INT, 80 | sdsl::conf::KEY_SA, 81 | sdsl::conf::KEY_LCP, 82 | sdsl::conf::KEY_BWT, 83 | sdsl::conf::KEY_BWT_INT, 84 | sdsl::conf::KEY_PSI 85 | }; 86 | 87 | } // end namespace 88 | #endif 89 | -------------------------------------------------------------------------------- /include/surf/construct_DUP2.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_CONSTRUCT_DUP2_HPP 2 | #define SURF_CONSTRUCT_DUP2_HPP 3 | 4 | #include 5 | 6 | namespace surf{ 7 | 8 | // generate the DUP2 array (= R in the paper) and 9 | // the KEY_DUPMARK bitvector 10 | template 11 | void construct_dup2(sdsl::cache_config& cc) 12 | { 13 | using namespace sdsl; 14 | using namespace std; 15 | 16 | string dup2_file = cache_file_name(surf::KEY_DUP2,cc); 17 | if (!cache_file_exists(surf::KEY_DUP2,cc)){ 18 | cout<<"......dup2 does not exist. 
Generate it..."< dup(cache_file_name(surf::KEY_DUP, cc)); 24 | cout<<".........dup.size()="<(surf::KEY_TMPCST, cc)); 40 | cout<<".........cst.size()="< buf; 51 | for (uint64_t i = std::get<1>(df_info); i <= std::get<2>(df_info); ++i) { 52 | buf.push_back(dup[i]); 53 | } 54 | for (uint64_t i = next_idx; i < std::get<1>(df_info); ++i){ 55 | dup_mark[i] = 0; 56 | } 57 | next_idx = std::get<2>(df_info)+1; 58 | for (size_t i=0; i < buf.size(); ++i){ 59 | dup2.push_back(buf[i]); 60 | } 61 | } 62 | for (uint64_t i = next_idx; i < dup_mark.size(); ++i){ 63 | dup_mark[i]=0; 64 | } 65 | } 66 | } 67 | 68 | }// end namespace 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /include/surf/construct_U.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_CONSTRUCT_U_HPP 2 | #define SURF_CONSTRUCT_U_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace surf{ 8 | 9 | // generate the U array (= D^1 in the paper) and 10 | // the KEY_UMARK bitvector 11 | template 12 | void construct_u(sdsl::cache_config& cc) 13 | { 14 | using namespace sdsl; 15 | using namespace std; 16 | static_assert(std::is_same::value, "CST class expected"); 17 | 18 | string u_file = cache_file_name(surf::KEY_U,cc); 19 | if (!cache_file_exists(surf::KEY_U,cc)){ 20 | cout<<"......U does not exist. 
Generate it..."< D_array(cache_file_name(surf::KEY_DARRAY, cc)); 26 | cout<<".........D.size()="<(surf::KEY_TMPCST, cc)); 32 | cout<<".........cst.size()="< U(u_file, std::ios::out, 35 | 1024*1024, D_array.width()); 36 | string umark_file = cache_file_name(surf::KEY_UMARK, cc); 37 | int_vector_buffer<1> Umark(umark_file, std::ios::out); 38 | 39 | uint64_t doc_cnt = 0; 40 | load_from_cache(doc_cnt, KEY_DOCCNT, cc); 41 | cout << ".........doc_cnt = " << doc_cnt << endl; 42 | 43 | std::vector last_occ(doc_cnt+1, -1); 44 | 45 | auto root = cst.root(); 46 | for (auto& v : cst.children(root)){ 47 | auto lb = cst.lb(v); 48 | auto rb = cst.rb(v); 49 | std::vector buf; 50 | for (auto i = lb; i<=rb; ++i){ 51 | auto x = D_array[i]; 52 | if ( last_occ[x] < (int64_t)lb ){ 53 | buf.push_back(x); 54 | } 55 | last_occ[x] = i; 56 | } 57 | std::sort(buf.begin(), buf.end()); 58 | for (size_t i=0; i 5 | 6 | namespace surf{ 7 | 8 | template 9 | void construct_col_len(sdsl::cache_config& cc) 10 | { 11 | using namespace sdsl; 12 | using namespace std; 13 | static_assert(t_width == 0 or t_width == 8 , 14 | "construct_col_len: width must be `0` for integer alphabet and `8` for byte alphabet"); 15 | 16 | if ( !cache_file_exists(KEY_COLLEN, cc) ){ 17 | const char* KEY_TEXT = key_text_trait::KEY_TEXT; 18 | std::string text_file = cache_file_name(KEY_TEXT, cc); 19 | if (!cache_file_exists(KEY_TEXT, cc)) { 20 | std::cerr << "ERROR: construct_col_len: " << text_file 21 | << " does not exist. Abort." 
<< std::endl; 22 | return; 23 | } 24 | uint64_t n = 0; 25 | int_vector_buffer text(text_file); 26 | n = text.size(); 27 | store_to_cache(n, KEY_COLLEN, cc); 28 | } 29 | } 30 | 31 | }// end namespace 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /include/surf/construct_darray.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_DARRAY_HPP 2 | #define SURF_DARRAY_HPP 3 | 4 | #include "config.hpp" 5 | #include "construct_doc_perm.hpp" 6 | #include "construct_doc_border.hpp" 7 | #include 8 | #include 9 | 10 | namespace surf{ 11 | 12 | template 13 | void construct_darray(sdsl::cache_config& cc) 14 | { 15 | using namespace sdsl; 16 | using namespace std; 17 | if ( !cache_file_exists(KEY_DARRAY, cc) ) { 18 | bit_vector doc_border; 19 | construct_doc_border(cc); 20 | load_from_cache(doc_border, KEY_DOCBORDER, cc); 21 | 22 | int_vector_buffer<> sa(cache_file_name(conf::KEY_SA, cc)); 23 | 24 | rank_support_v<> doc_border_rank(&doc_border); 25 | uint64_t doc_cnt = doc_border_rank(doc_border.size()); 26 | 27 | construct_doc_perm(cc); 28 | doc_perm dp; 29 | load_from_cache(dp, KEY_DOCPERM,cc); 30 | 31 | int_vector<> darray(sa.size(), 0, bits::hi(doc_cnt)+1); 32 | for (uint64_t i=0; i 5 | #include 6 | 7 | namespace surf{ 8 | 9 | template 10 | void construct_doc_border(sdsl::cache_config& cc) 11 | { 12 | using namespace sdsl; 13 | using namespace std; 14 | static_assert(t_width == 0 or t_width == 8 , 15 | "construct_doc_border: width must be `0` for integer alphabet and `8` for byte alphabet"); 16 | 17 | if ( !cache_file_exists(KEY_DOCBORDER, cc) ) { 18 | const char* KEY_TEXT = key_text_trait::KEY_TEXT; 19 | std::string text_file = cache_file_name(KEY_TEXT, cc); 20 | if (!cache_file_exists(KEY_TEXT, cc)) { 21 | std::cerr << "ERROR: construct_doc_border: " << text_file 22 | << " does not exist. Abort." 
<< std::endl; 23 | return; 24 | } 25 | int_vector_buffer text(text_file); 26 | bit_vector doc_border(text.size(), 0); 27 | for (uint64_t i=0; i < text.size(); ++i){ 28 | if ( 1 == text[i] ){ 29 | doc_border[i] = 1; 30 | } 31 | } 32 | store_to_cache(doc_border, KEY_DOCBORDER, cc); 33 | } 34 | } 35 | 36 | }// end namespace 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /include/surf/construct_doc_cnt.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_CONSTRUCT_DOC_CNT_HPP 2 | #define SURF_CONSTRUCT_DOC_CNT_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace surf{ 8 | 9 | template 10 | void construct_doc_cnt(sdsl::cache_config& cc) 11 | { 12 | using namespace sdsl; 13 | using namespace std; 14 | static_assert(t_width == 0 or t_width == 8 , 15 | "construct_doc_cnt: width must be `0` for integer alphabet and `8` for byte alphabet"); 16 | 17 | if ( !cache_file_exists(KEY_DOCCNT, cc) ){ 18 | const char* KEY_TEXT = key_text_trait::KEY_TEXT; 19 | std::string text_file = cache_file_name(KEY_TEXT, cc); 20 | if (!cache_file_exists(KEY_TEXT, cc)) { 21 | std::cerr << "ERROR: construct_doc_cnt: " << text_file 22 | << " does not exist. Abort." 
<< std::endl; 23 | return; 24 | } 25 | uint64_t doc_cnt = 0; 26 | int_vector_buffer text(text_file); 27 | doc_cnt = count_if(text.begin(), text.end(), 28 | [](decltype(*(text.begin())) y){ 29 | return y==1; 30 | }); 31 | store_to_cache(doc_cnt, KEY_DOCCNT, cc); 32 | } 33 | } 34 | 35 | }// end namespace 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /include/surf/construct_doc_lengths.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_CONSTRUCT_DOC_LENGTHS_HPP 2 | #define SURF_CONSTRUCT_DOC_LENGTHS_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace surf{ 8 | 9 | template 10 | void construct_doc_lengths(sdsl::cache_config& cconfig) 11 | { 12 | using namespace sdsl; 13 | using namespace std; 14 | static_assert(t_width == 0 or t_width == 8 , 15 | "construct_doc_border: width must be `0` for integer alphabet and `8` for byte alphabet"); 16 | const char* KEY_TEXT = key_text_trait::KEY_TEXT; 17 | std::string text_file = cache_file_name(KEY_TEXT, cconfig); 18 | if (!cache_file_exists(KEY_TEXT, cconfig)) { 19 | std::cerr << "ERROR: construct_doc_cnt: " << text_file 20 | << " does not exist. Abort." 
<< std::endl; 21 | return; 22 | } 23 | int_vector_buffer text(text_file); 24 | std::vector doc_lengths; 25 | for (uint64_t i=0, len=0; i < text.size(); ++i){ 26 | ++len; 27 | if ( 1 == text[i] ){ 28 | doc_lengths.push_back(len); 29 | len = 0; 30 | } 31 | } 32 | sdsl::int_vector<> sdsl_doc_len(doc_lengths.size()); 33 | for(size_t i=0;i 6 | #include 7 | #include 8 | 9 | namespace surf{ 10 | 11 | template 12 | void construct_doc_perm(sdsl::cache_config& cc) 13 | { 14 | using namespace sdsl; 15 | using namespace std; 16 | static_assert(t_width == 0 or t_width == 8 , 17 | "construct_doc_perm: width must be `0` for integer alphabet and `8` for byte alphabet"); 18 | 19 | if ( !cache_file_exists(KEY_DOCPERM, cc) ) { 20 | const char* KEY_TEXT = key_text_trait::KEY_TEXT; 21 | std::string text_file = cache_file_name(KEY_TEXT, cc); 22 | if (!cache_file_exists(KEY_TEXT, cc)) { 23 | std::cerr << "ERROR: construct_doc_perm: " << text_file 24 | << " does not exist. Abort." << std::endl; 25 | return; 26 | } 27 | int_vector_buffer text(text_file); 28 | 29 | std::cout<<"constructing doc_perm start"< tPII; 31 | std::vector len_id; 32 | for (uint64_t i=0, doc_len=0,id=0; i < text.size(); ++i){ 33 | ++doc_len; 34 | if ( 1 == text[i] ){ 35 | len_id.emplace_back(doc_len, id); 36 | ++id; 37 | doc_len = 0; 38 | } 39 | } 40 | std::cout<<"now sorting..."<(len_id.size(), 0, sdsl::bits::hi(len_id.size()-1)+1); 45 | dp.len2id = dp.id2len; 46 | for (size_t i=0; i& ids, sdsl::int_vector<>& sp, 16 | sdsl::int_vector<>& ep,sdsl::cache_config& cconfig) 17 | { 18 | if (!cache_file_exists(sdsl::conf::KEY_SA, cconfig)) { 19 | sdsl::construct_sa(cconfig); 20 | } 21 | register_cache_file(sdsl::conf::KEY_SA, cconfig); 22 | 23 | sdsl::int_vector_buffer<> sa(cache_file_name(sdsl::conf::KEY_SA,cconfig)); 24 | sdsl::int_vector<> T; 25 | load_from_cache(T,sdsl::conf::KEY_TEXT_INT,cconfig); 26 | size_t range_start = 0; 27 | std::vector> ranges; 28 | std::cout << "determine term ranges"<< std::endl; 29 | 
for(size_t i=1;i(range); 42 | sp[num_sym] = std::get<1>(range); 43 | ep[num_sym++] = std::get<2>(range); 44 | } 45 | } 46 | 47 | void construct_invidx_doc_permuations(sdsl::int_vector<>& id_mapping,sdsl::cache_config& cconfig) 48 | { 49 | surf::construct_doc_cnt(cconfig); 50 | uint64_t doc_cnt = 0; 51 | load_from_cache(doc_cnt, surf::KEY_DOCCNT, cconfig); 52 | sdsl::int_vector<> doc_mapping(doc_cnt); 53 | { 54 | auto url_file = cconfig.dir + "/../" + surf::URL2ID_FILENAME; 55 | std::ifstream ufs(url_file); 56 | if(ufs.is_open()) { 57 | /* load current/indri order */ 58 | std::unordered_map id_mapping; 59 | auto docnames_file = cconfig.dir + "/../" + surf::DOCNAMES_FILENAME; 60 | std::ifstream dfs(docnames_file); 61 | std::string name_mapping; 62 | size_t j=0; 63 | while( std::getline(dfs,name_mapping) ) { 64 | id_mapping[name_mapping] = j; 65 | j++; 66 | } 67 | /* load url sorted order */ 68 | std::string url_mapping; 69 | j=0; 70 | while( std::getline(ufs,url_mapping) ) { 71 | auto doc_name = url_mapping.substr(url_mapping.find(' ')+1); 72 | auto itr = id_mapping.find(doc_name); 73 | if(itr != id_mapping.end()) { 74 | doc_mapping[itr->second] = j; 75 | } else { 76 | std::cerr << "could not find mapping for '" << doc_name << "'" << std::endl; 77 | } 78 | j++; 79 | } 80 | } else { 81 | // identity permutation 82 | for(size_t i=0;i& F_t,sdsl::cache_config& cconfig) 96 | { 97 | // load term ranges 98 | sdsl::int_vector<> ids; sdsl::int_vector<> sp; sdsl::int_vector<> ep; 99 | if( cache_file_exists(surf::KEY_INVFILE_TERM_RANGES,cconfig) ) { 100 | std::ifstream ifs(cache_file_name(surf::KEY_INVFILE_TERM_RANGES,cconfig)); 101 | ids.load(ifs); 102 | sp.load(ifs); 103 | ep.load(ifs); 104 | } else { 105 | construct_term_ranges(ids,sp,ep,cconfig); 106 | std::ofstream ofs(cache_file_name(surf::KEY_INVFILE_TERM_RANGES,cconfig)); 107 | serialize(ids,ofs); 108 | serialize(sp,ofs); 109 | serialize(ep,ofs); 110 | } 111 | 112 | F_t.resize(ids.size()); 113 | for(size_t i=0;i 119 | 
void construct_postings_lists(std::vector& postings_lists,sdsl::cache_config& cconfig) 120 | { 121 | using namespace sdsl; 122 | using namespace std; 123 | 124 | // load term ranges 125 | sdsl::int_vector<> ids; sdsl::int_vector<> sp; sdsl::int_vector<> ep; 126 | if( cache_file_exists(surf::KEY_INVFILE_TERM_RANGES,cconfig) ) { 127 | std::ifstream ifs(cache_file_name(surf::KEY_INVFILE_TERM_RANGES,cconfig)); 128 | ids.load(ifs); 129 | sp.load(ifs); 130 | ep.load(ifs); 131 | } else { 132 | construct_term_ranges(ids,sp,ep,cconfig); 133 | std::ofstream ofs(cache_file_name(surf::KEY_INVFILE_TERM_RANGES,cconfig)); 134 | serialize(ids,ofs); 135 | serialize(sp,ofs); 136 | serialize(ep,ofs); 137 | } 138 | 139 | 140 | if (!cache_file_exists(surf::KEY_DOCBORDER, cconfig)){ 141 | construct_doc_border(cconfig); 142 | } 143 | if (!cache_file_exists(surf::KEY_DARRAY, cconfig)){ 144 | construct_darray(cconfig); 145 | } 146 | 147 | // load or construct D array 148 | std::cout << "stream D"<< std::endl; 149 | int_vector_buffer<> D(cache_file_name(surf::KEY_DARRAY,cconfig)); 150 | 151 | // load or construct rank function 152 | std::cout << "load rank"<< std::endl; 153 | t_rank ranker(cconfig); 154 | 155 | // load mapping if it exists 156 | std::cout << "load docid mapping" << std::endl; 157 | sdsl::int_vector<> doc_mapping; 158 | doc_perm dp; 159 | load_from_cache(dp, KEY_DOCPERM, cconfig); 160 | load_from_cache(doc_mapping, KEY_INVFILE_DOCPERM, cconfig); 161 | 162 | // construct plist for each range 163 | std::cout << "create postings lists"<< endl; 164 | size_t max_id = ids[ids.size()-1]; 165 | postings_lists.resize(max_id+1); 166 | for(size_t i=2;i tmpD(range_size); 169 | for(size_t j=sp[i];j<=ep[i];j++) tmpD[j-sp[i]] = doc_mapping[dp.len2id[D[j]]]; 170 | if(range_size>1000) std::cout << "(" << i << ") |<" << sp[i] << "," << ep[i] << ">| = " << range_size << std::endl; 171 | postings_lists[ids[i]] = t_pl(ranker,tmpD,0,range_size-1); 172 | } 173 | } 174 | 175 | }// end namespace 176 
| 177 | #endif 178 | -------------------------------------------------------------------------------- /include/surf/doc_perm.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_DOC_PERM_HPP 2 | #define SURF_DOC_PERM_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace surf{ 8 | 9 | struct doc_perm{ 10 | typedef typename sdsl::int_vector<>::size_type size_type; 11 | sdsl::int_vector<> id2len; // doc id to length ordered id 12 | sdsl::int_vector<> len2id; // length ordered id to doc id 13 | 14 | inline size_type serialize(std::ostream& out, sdsl::structure_tree_node* v = NULL, std::string name = "") const { 15 | using namespace sdsl; 16 | structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); 17 | size_type written_bytes = 0; 18 | written_bytes += id2len.serialize(out, child, "id2len"); 19 | written_bytes += len2id.serialize(out, child, "len2id"); 20 | structure_tree::add_size(child, written_bytes); 21 | return written_bytes; 22 | } 23 | 24 | inline void load(std::istream &in){ 25 | id2len.load(in); 26 | len2id.load(in); 27 | } 28 | }; 29 | 30 | } 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /include/surf/idx_d.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_IDX_D_HPP 2 | #define SURF_IDX_D_HPP 3 | 4 | #include "sdsl/suffix_trees.hpp" 5 | #include "surf/df_sada.hpp" 6 | #include "surf/rank_functions.hpp" 7 | #include "surf/construct_col_len.hpp" 8 | #include 9 | #include 10 | #include 11 | 12 | namespace surf{ 13 | 14 | using range_type = sdsl::range_type; 15 | 16 | struct term_info{ 17 | std::vector t; // term_id 18 | uint64_t f_qt; // term_frequency 19 | uint64_t sp_Dt; // start of interval for term t in the suffix array 20 | uint64_t ep_Dt; // end of interval for term t in the suffix array 21 | uint64_t f_Dt; // number of distinct document the term occurs in 22 | 23 | 
term_info() = default; 24 | term_info(const std::vector& t, uint64_t f_qt, uint64_t sp_Dt, uint64_t ep_Dt, uint64_t f_Dt) : 25 | t(t), f_qt(f_qt), sp_Dt(sp_Dt), ep_Dt(ep_Dt), f_Dt(f_Dt) { 26 | 27 | } 28 | 29 | term_info(term_info&&) = default; 30 | term_info(const term_info&) = default; 31 | term_info& operator=(term_info&&) = default; 32 | term_info& operator=(const term_info&) = default; 33 | 34 | uint64_t F_Dt() const{ 35 | return ep_Dt-sp_Dt+1; 36 | } 37 | }; 38 | 39 | template 40 | struct s_state_t{ 41 | double score; 42 | t_wt_node v; 43 | std::vector t_ptrs; // pointers to term_info array 44 | std::vector r; // ranges 45 | 46 | s_state_t() = default; 47 | 48 | s_state_t(double score, const t_wt_node& v, 49 | const std::vector& t_ptrs, 50 | const std::vector& r): 51 | score(score), v(v), t_ptrs(t_ptrs), 52 | r(r){} 53 | 54 | s_state_t(s_state_t&&) = default; 55 | s_state_t(const s_state_t&) = default; 56 | 57 | s_state_t& operator=(s_state_t&&) = default; 58 | s_state_t& operator=(const s_state_t&) = default; 59 | 60 | bool operator<(const s_state_t& s)const{ 61 | if ( score != s.score ){ 62 | return score < s.score; 63 | } 64 | return v < s.v; 65 | } 66 | }; 67 | 68 | /*! 
Class idx_d consists of a 69 | * - CSA over the collection concatenation 70 | * - document frequency structure 71 | * - a WT over the D array 72 | */ 73 | template> 77 | class idx_d{ 78 | public: 79 | using size_type = sdsl::int_vector<>::size_type; 80 | typedef t_csa csa_type; 81 | typedef t_wtd wtd_type; 82 | typedef typename wtd_type::node_type node_type; 83 | typedef t_df df_type; 84 | typedef t_ranker ranker_type; 85 | public: 86 | csa_type m_csa; 87 | wtd_type m_wtd; 88 | df_type m_df; 89 | doc_perm m_docperm; 90 | ranker_type m_ranker; 91 | 92 | using state_type = s_state_t; 93 | public: 94 | 95 | result search(const std::vector& qry,size_t k,bool ranked_and = false,bool profile = false) const { 96 | typedef std::priority_queue pq_type; 97 | typedef std::priority_queue, std::greater> pq_min_type; 98 | std::vector terms; 99 | std::vector term_ptrs; 100 | std::vector ranges; 101 | result res; 102 | 103 | if(profile) { 104 | res.wt_nodes = 2*m_wtd.sigma-1; 105 | } 106 | 107 | for (size_t i=0; i 0 ) { 113 | auto f_Dt = std::get<0>(m_df(sp,ep)); // document frequency 114 | terms.emplace_back(qry[i].token_ids, qry[i].f_qt, sp, ep, f_Dt); 115 | ranges.emplace_back(sp, ep); 116 | } 117 | } 118 | term_ptrs.resize(terms.size()); 119 | for (size_type i=0; i& t_ptrs,node_type& v, 126 | std::vector& r, 127 | pq_min_type& pq_min, const size_t& k){ 128 | auto min_idx = m_wtd.sym(v) << (m_wtd.max_level - v.level); 129 | auto min_doc_len = m_ranker.doc_length(m_docperm.len2id[min_idx]); 130 | state_type t; // new state 131 | t.v = v; 132 | 133 | t.score = initial_term_num * m_ranker.calc_doc_weight(min_doc_len); 134 | 135 | bool eval = false; 136 | bool is_leaf = m_wtd.is_leaf(v); 137 | for (size_t i = 0; i < r.size(); ++i){ 138 | if ( !empty(r[i]) ){ 139 | eval = true; 140 | t.r.push_back(r[i]); 141 | t.t_ptrs.push_back(t_ptrs[i]); 142 | 143 | auto score = m_ranker.calculate_docscore( 144 | t.t_ptrs.back()->f_qt, 145 | size(t.r.back()), 146 | t.t_ptrs.back()->f_Dt, 147 | 
t.t_ptrs.back()->F_Dt(), 148 | min_doc_len, 149 | is_leaf 150 | ); 151 | t.score += score; 152 | } else if ( ranked_and ){ 153 | return; 154 | } 155 | } 156 | if (!eval){ 157 | return; 158 | } 159 | if ( pq_min.size() < k ){ // not yet k leaves in score queue 160 | pq.emplace(t); 161 | if (profile) res.wt_search_space++; 162 | if ( m_wtd.is_leaf(t.v) ) 163 | pq_min.push(t.score); 164 | } else { // more than k leaves in score queue 165 | if ( t.score > pq_min.top() ){ 166 | pq.emplace(t); 167 | if (profile) res.wt_search_space++; 168 | if ( m_wtd.is_leaf(t.v) ){ 169 | pq_min.pop(); 170 | pq_min.push(t.score); 171 | } 172 | } 173 | } 174 | }; 175 | 176 | constexpr double max_score = std::numeric_limits::max(); 177 | 178 | pq_min_type pq_min; 179 | pq_type pq; 180 | pq.emplace(max_score, m_wtd.root(), term_ptrs, ranges); 181 | if(profile) res.wt_search_space++; 182 | 183 | while ( !pq.empty() and res.list.size() < k ) { 184 | state_type s = pq.top(); 185 | pq.pop(); 186 | if ( m_wtd.is_leaf(s.v) ){ 187 | res.list.emplace_back(m_docperm.len2id[m_wtd.sym(s.v)], s.score); 188 | } else { 189 | //fast_expand: 190 | auto exp_v = m_wtd.expand(s.v); 191 | bool left_empty = m_wtd.empty(std::get<0>(exp_v)); 192 | bool right_empty = m_wtd.empty(std::get<1>(exp_v)); 193 | auto exp_r = m_wtd.expand(s.v, std::move(s.r)); 194 | if ( std::get<1>(exp_r).size() == 0 and std::get<0>(exp_r).size() > 0 and !m_wtd.is_leaf(std::get<0>(exp_v) )){ 195 | std::cout<<"easy"<(exp_v), std::get<0>(exp_r), pq_min, k); 200 | } else{ 201 | //std::cout<<"left_empty"<(exp_v), std::get<1>(exp_r), pq_min, k); 205 | } else{ 206 | //std::cout<<"right_empty"< 248 | void construct(idx_d& idx, 249 | const std::string&, 250 | sdsl::cache_config& cc, uint8_t num_bytes) 251 | { 252 | using namespace sdsl; 253 | using namespace std; 254 | 255 | construct_col_len(cc); 256 | 257 | cout<<"...CSA"<(surf::KEY_CSA, cc) ) 259 | { 260 | t_csa csa; 261 | construct(csa, "", cc, 0); 262 | store_to_cache(csa, surf::KEY_CSA, 
cc, true); 263 | } 264 | cout<<"...WTD"<(surf::KEY_WTD, cc) ){ 266 | construct_doc_perm(cc); 267 | construct_darray(cc); 268 | t_wtd wtd; 269 | construct(wtd, cache_file_name(surf::KEY_DARRAY, cc), cc); 270 | cout << "wtd.size() = " << wtd.size() << endl; 271 | cout << "wtd.sigma = " << wtd.sigma << endl; 272 | store_to_cache(wtd, surf::KEY_WTD, cc, true); 273 | } 274 | cout<<"...DF"<(surf::KEY_SADADF, cc)) 276 | { 277 | t_df df; 278 | construct(df, "", cc, 0); 279 | store_to_cache(df, surf::KEY_SADADF, cc, true); 280 | } 281 | } 282 | 283 | } // end namespace surf 284 | 285 | #endif 286 | -------------------------------------------------------------------------------- /include/surf/idx_dr.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_IDX_DR_HPP 2 | #define SURF_IDX_DR_HPP 3 | 4 | #include "sdsl/suffix_trees.hpp" 5 | #include "surf/df_sada.hpp" 6 | #include "surf/rank_functions.hpp" 7 | #include "surf/idx_d.hpp" 8 | #include "surf/construct_col_len.hpp" 9 | #include "surf/construct_DUP2.hpp" 10 | #include 11 | #include 12 | #include 13 | 14 | namespace surf{ 15 | 16 | using range_type = sdsl::range_type; 17 | 18 | 19 | template 20 | struct s_state2_t{ 21 | double score; 22 | t_wtd_node v; // node in document array wavelet tree 23 | std::vector t_ptrs; // pointers to term_info array 24 | std::vector r_v; // ranges in v 25 | t_wtr_node w; // node in repetition array wavelet tree 26 | std::vector r_w; // ranges in w 27 | 28 | s_state2_t() = default; 29 | 30 | s_state2_t(double score, const t_wtd_node& v, 31 | const std::vector& t_ptrs, 32 | const std::vector& r_v, 33 | const t_wtr_node& w, 34 | const std::vector& r_w): 35 | score(score), v(v), t_ptrs(t_ptrs), 36 | r_v(r_v),w(w),r_w(r_w) 37 | {} 38 | 39 | s_state2_t(s_state2_t&&) = default; 40 | s_state2_t(const s_state2_t&) = default; 41 | 42 | s_state2_t& operator=(s_state2_t&&) = default; 43 | s_state2_t& operator=(const s_state2_t&) = default; 44 | 45 | bool 
operator<(const s_state2_t& s)const{ 46 | if ( score != s.score ){ 47 | return score < s.score; 48 | } 49 | return v < s.v; 50 | } 51 | }; 52 | 53 | 54 | template 55 | inline std::ostream& operator<<(std::ostream& os, const s_state2_t& state) 56 | { 57 | os << state.v.level << "-("<, 77 | typename t_rbv=sdsl::rrr_vector<63>, 78 | typename t_rrank=typename t_rbv::rank_1_type> 79 | class idx_dr{ 80 | public: 81 | using size_type = sdsl::int_vector<>::size_type; 82 | typedef t_csa csa_type; 83 | typedef t_wtd wtd_type; 84 | typedef typename wtd_type::node_type node_type; 85 | typedef t_df df_type; 86 | typedef t_wtr wtr_type; 87 | typedef typename wtr_type::node_type node2_type; 88 | typedef t_rbv rbv_type; 89 | typedef t_rrank rrank_type; 90 | typedef t_ranker ranker_type; 91 | public: 92 | csa_type m_csa; 93 | df_type m_df; 94 | wtr_type m_wtr; 95 | t_wtd m_wtd; 96 | rbv_type m_rbv; 97 | rrank_type m_rrank; 98 | doc_perm m_docperm; 99 | ranker_type m_ranker; 100 | 101 | using state_type = s_state2_t; 102 | public: 103 | 104 | result search(const std::vector& qry,size_t k,bool ranked_and = false,bool profile = false) const { 105 | typedef std::priority_queue pq_type; 106 | std::vector terms; 107 | std::vector term_ptrs; 108 | std::vector v_ranges; // ranges in wtd 109 | std::vector w_ranges; // ranges in wtdup 110 | result res; 111 | 112 | for (size_t i=0; i 0 ) { 118 | auto df_info = m_df(sp,ep); 119 | //std::cout<<"[sp,ep]=["<(df_info); // document frequency 121 | terms.emplace_back(qry[i].token_ids, qry[i].f_qt, sp, ep, f_Dt); 122 | //for(size_t k=sp; k<=ep; ++k){ std::cout<<".."<(df_info)), 126 | m_rrank(std::get<2>(df_info)+1)-1); 127 | 128 | } 129 | } 130 | term_ptrs.resize(terms.size()); 131 | for (size_type i=0; i& r_v, 137 | node2_type& w, std::vector& r_w){ 138 | auto min_idx = m_wtd.sym(v) << (m_wtd.max_level - v.level); 139 | auto min_doc_len = m_ranker.doc_length(m_docperm.len2id[min_idx]); 140 | state_type t; // new state 141 | t.v = v; 142 | t.w = w; 
143 | t.score = initial_term_num * m_ranker.calc_doc_weight(min_doc_len); 144 | bool eval = false; 145 | bool is_leaf = m_wtd.is_leaf(v); 146 | for (size_t i = 0; i < r_v.size(); ++i){ 147 | if ( !empty(r_v[i]) ){ 148 | eval = true; 149 | t.r_v.push_back(r_v[i]); 150 | t.r_w.push_back(r_w[i]); 151 | t.t_ptrs.push_back(s.t_ptrs[i]); 152 | auto score = m_ranker.calculate_docscore( 153 | t.t_ptrs.back()->f_qt, 154 | size(t.r_w.back())+1, 155 | t.t_ptrs.back()->f_Dt, 156 | t.t_ptrs.back()->F_Dt(), 157 | min_doc_len, 158 | is_leaf 159 | ); 160 | t.score += score; 161 | } else if ( ranked_and ) { 162 | return; 163 | } 164 | } 165 | if (eval){ 166 | // std::cout << t << std::endl; 167 | if (profile) res.wt_search_space++; 168 | pq.emplace(t); 169 | } 170 | }; 171 | 172 | constexpr double max_score = std::numeric_limits::max(); 173 | 174 | pq_type pq; 175 | size_type search_space=0; 176 | pq.emplace(max_score, m_wtd.root(), term_ptrs, v_ranges, m_wtr.root(), w_ranges); 177 | // std::cout << "\n" << pq.top() << std::endl; 178 | if(profile) res.wt_search_space++; 179 | 180 | while ( !pq.empty() and res.list.size() < k ) { 181 | state_type s = pq.top(); 182 | pq.pop(); 183 | if ( m_wtd.is_leaf(s.v) ){ 184 | res.list.emplace_back(m_docperm.len2id[m_wtd.sym(s.v)], s.score); 185 | } else { 186 | auto exp_v = m_wtd.expand(s.v); 187 | auto exp_r_v = m_wtd.expand(s.v, s.r_v); 188 | auto exp_w = m_wtr.expand(s.w); 189 | auto exp_r_w = m_wtr.expand(s.w, s.r_w); 190 | 191 | if ( !m_wtd.empty(std::get<0>(exp_v)) ) { 192 | push_node(pq, s, std::get<0>(exp_v), std::get<0>(exp_r_v), 193 | std::get<0>(exp_w), std::get<0>(exp_r_w)); 194 | } 195 | if ( !m_wtd.empty(std::get<1>(exp_v)) ) { 196 | push_node(pq, s, std::get<1>(exp_v), std::get<1>(exp_r_v), 197 | std::get<1>(exp_w), std::get<1>(exp_r_w)); 198 | } 199 | } 200 | } 201 | return res; 202 | } 203 | 204 | void load(sdsl::cache_config& cc){ 205 | load_from_cache(m_csa, surf::KEY_CSA, cc, true); 206 | load_from_cache(m_df, 
surf::KEY_SADADF, cc, true); 207 | load_from_cache(m_wtr, surf::KEY_WTDUP2, cc, true); 208 | std::cerr<<"m_wtr.size()="< 256 | void construct(idx_dr& idx, 257 | const std::string&, 258 | sdsl::cache_config& cc, uint8_t num_bytes) 259 | { 260 | using namespace sdsl; 261 | using namespace std; 262 | 263 | construct_col_len(cc); 264 | 265 | cout<<"...CSA"<(surf::KEY_CSA, cc) ) 267 | { 268 | t_csa csa; 269 | construct(csa, "", cc, 0); 270 | store_to_cache(csa, surf::KEY_CSA, cc, true); 271 | } 272 | cout<<"...WTD"<(surf::KEY_WTD, cc) ){ 274 | construct_doc_perm(cc); 275 | construct_darray(cc); 276 | t_wtd wtd; 277 | construct(wtd, cache_file_name(surf::KEY_DARRAY, cc), cc); 278 | cout << "wtd.size() = " << wtd.size() << endl; 279 | cout << "wtd.sigma = " << wtd.sigma << endl; 280 | store_to_cache(wtd, surf::KEY_WTD, cc, true); 281 | } 282 | cout<<"...DF"<(surf::KEY_SADADF, cc)) 284 | { 285 | t_df df; 286 | construct(df, "", cc, 0); 287 | store_to_cache(df, surf::KEY_SADADF, cc, true); 288 | } 289 | cout<<"...WTR"<(surf::KEY_WTDUP2,cc)){ 291 | construct_dup2(cc); // construct DUP2 and DUPMARK 292 | t_wtr wtr; 293 | construct(wtr, cache_file_name(surf::KEY_DUP2, cc), cc); 294 | store_to_cache(wtr, surf::KEY_WTDUP2, cc, true); 295 | cout << "wtr.size() = " << wtr.size() << endl; 296 | cout << "wtr.sigma = " << wtr.sigma << endl; 297 | } 298 | cout<<"...R_BV"<(surf::KEY_DUPMARK, cc) ){ 300 | bit_vector bv; 301 | load_from_cache(bv, surf::KEY_DUPMARK, cc); 302 | t_rbv rbv(bv); 303 | store_to_cache(rbv, surf::KEY_DUPMARK, cc, true); 304 | t_rrank rrank(&rbv); 305 | store_to_cache(rrank, surf::KEY_DUPRANK, cc, true); 306 | } 307 | } 308 | 309 | } // end namespace surf 310 | 311 | #endif 312 | -------------------------------------------------------------------------------- /include/surf/indexes.hpp: -------------------------------------------------------------------------------- 1 | 2 | #ifndef SURF_INDEXES_HPP 3 | #define SURF_INDEXES_HPP 4 | 5 | #include "idx_invfile.hpp" 6 
| #include "idx_d.hpp" 7 | #include "idx_dr.hpp" 8 | #include "idx_d1r1.hpp" 9 | #include "idx_d1r1mtf.hpp" 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /include/surf/phrase_parser.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_PHRASE_PARSER_HPP 2 | #define SURF_PHRASE_PARSER_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "surf/config.hpp" 12 | #include "surf/query.hpp" 13 | 14 | namespace surf{ 15 | 16 | struct phrase_parser { 17 | phrase_parser() = delete; 18 | 19 | template 20 | static query_t phrase_segmentation(t_csa& csa, 21 | const std::vector& query_ids, 22 | const std::unordered_map& reverse_mapping, 23 | double threshold) 24 | { 25 | //compute single term probabilities 26 | std::vector P_single; 27 | for(size_t i=0;i> phrases; 35 | size_t start = 0; 36 | size_t stop = query_ids.size(); 37 | while(start < stop) { 38 | bool phrase_found = false; 39 | bool phrase_added = false; 40 | for(size_t i=start+1;i csa.size() ) { 45 | break; 46 | } 47 | 48 | auto cnt = sdsl::count(csa,query_ids.begin()+start,query_ids.begin()+i+1); 49 | double prob = (double)cnt / (double)csa.size(); 50 | 51 | // single 52 | double single = P_single[i]; 53 | for(size_t l=start;l<=i;l++) single *= P_single[l]; 54 | 55 | // calc ratio 56 | double assoc_ratio = log(prob)-log(single); 57 | 58 | // debug 59 | /* 60 | std::cout << "SCORE("; 61 | for(size_t l=start;l<=i;l++) { 62 | auto id = query_ids[l]; 63 | auto stritr = reverse_mapping.find(id); 64 | std::cout << stritr->second << " "; 65 | } 66 | std::cout << ") -> " << assoc_ratio << std::endl; 67 | */ 68 | 69 | if(assoc_ratio < threshold) { 70 | // not a phrase. 
if the prev one was a phrase we use it 71 | if(phrase_found) { 72 | std::vector phrase; 73 | for(size_t j=start;j phrase; 90 | for(size_t i=start;i single; 99 | single.push_back(query_ids[start]); 100 | phrases.push_back(single); 101 | start++; 102 | } 103 | } 104 | } 105 | 106 | // check if all phrases are uniq 107 | query_t q; 108 | auto itr = phrases.begin(); 109 | while(itr != phrases.end()) { 110 | auto cur_list = *itr; 111 | uint64_t num_equal = 0; 112 | auto next = itr+1; 113 | while(next != phrases.end()) { 114 | auto next_list = *next; 115 | if(std::equal(cur_list.begin(),cur_list.end(),next_list.begin())) { 116 | num_equal++; 117 | next = phrases.erase(next); 118 | } else { 119 | next++; 120 | } 121 | } 122 | 123 | /* get the string representation */ 124 | std::vector qry_str; 125 | for(const auto& id : cur_list) { 126 | auto rmitr = reverse_mapping.find(id); 127 | if(rmitr != reverse_mapping.end()) { 128 | qry_str.push_back(rmitr->second); 129 | } 130 | } 131 | std::get<1>(q).emplace_back(*itr,qry_str,num_equal); 132 | itr++; 133 | } 134 | std::sort(std::get<1>(q).begin(),std::get<1>(q).end()); // sort 135 | return q; 136 | } 137 | }; 138 | }// end namespace 139 | 140 | #endif 141 | -------------------------------------------------------------------------------- /include/surf/query.hpp: -------------------------------------------------------------------------------- 1 | 2 | #ifndef SURF_QUERY_HPP 3 | #define SURF_QUERY_HPP 4 | 5 | #include 6 | 7 | namespace surf { 8 | 9 | struct doc_score { 10 | uint64_t doc_id; 11 | double score; 12 | bool operator>(const doc_score& rhs) const { 13 | if(score == rhs.score) 14 | return doc_id > rhs.doc_id; 15 | return score > rhs.score; 16 | } 17 | doc_score() {}; 18 | doc_score(uint64_t did,double s) : doc_id(did) , score(s) {}; 19 | }; 20 | 21 | struct result { 22 | std::vector list; 23 | uint64_t wt_search_space = 0; 24 | uint64_t wt_nodes = 0; 25 | uint64_t postings_evaluated = 0; 26 | uint64_t postings_total = 0; 27 
| }; 28 | 29 | struct query_token{ 30 | std::vector token_ids; 31 | std::vector token_strs; 32 | uint64_t f_qt; 33 | query_token(const std::vector& ids, 34 | const std::vector& strs, 35 | uint64_t f) : token_ids(ids), token_strs(strs), f_qt(f) 36 | { 37 | } 38 | bool operator<(const query_token& qt) const { 39 | return std::lexicographical_compare(token_ids.begin(), token_ids.end(), 40 | qt.token_ids.begin(), qt.token_ids.end()); 41 | } 42 | }; 43 | 44 | using query_t = std::tuple>; 45 | 46 | 47 | } 48 | 49 | #endif -------------------------------------------------------------------------------- /include/surf/query_parser.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_QUERY_PARSER_HPP 2 | #define SURF_QUERY_PARSER_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "surf/config.hpp" 11 | #include "surf/query.hpp" 12 | 13 | namespace surf{ 14 | 15 | struct query_parser { 16 | query_parser() = delete; 17 | using mapping_t = std::pair, 18 | std::unordered_map 19 | >; 20 | 21 | static mapping_t 22 | load_dictionary(const std::string& collection_dir) 23 | { 24 | std::unordered_map id_mapping; 25 | std::unordered_map reverse_id_mapping; 26 | { 27 | auto dict_file = collection_dir + "/" + surf::DICT_FILENAME; 28 | std::ifstream dfs(dict_file); 29 | if(!dfs.is_open()) { 30 | std::cerr << "cannot load dictionary file."; 31 | exit(EXIT_FAILURE); 32 | } 33 | std::string term_mapping; 34 | while( std::getline(dfs,term_mapping) ) { 35 | auto sep_pos = term_mapping.find(' '); 36 | auto term = term_mapping.substr(0,sep_pos); 37 | auto idstr = term_mapping.substr(sep_pos+1); 38 | uint64_t id = std::stoull(idstr); 39 | id_mapping[term] = id; 40 | reverse_id_mapping[id] = term; 41 | } 42 | } 43 | return {id_mapping,reverse_id_mapping}; 44 | } 45 | 46 | static std::tuple> 47 | map_to_ids(const std::unordered_map& id_mapping, 48 | std::string query_str,bool only_complete,bool integers) 49 | 
{ 50 | auto id_sep_pos = query_str.find(';'); 51 | auto qryid_str = query_str.substr(0,id_sep_pos); 52 | auto qry_id = std::stoull(qryid_str); 53 | auto qry_content = query_str.substr(id_sep_pos+1); 54 | 55 | std::vector ids; 56 | std::istringstream qry_content_stream(qry_content); 57 | for(std::string qry_token; std::getline(qry_content_stream,qry_token,' ');) { 58 | if(integers) { 59 | uint64_t id = std::stoull(qry_token); 60 | ids.push_back(id); 61 | } else { 62 | auto id_itr = id_mapping.find(qry_token); 63 | if(id_itr != id_mapping.end()) { 64 | ids.push_back(id_itr->second); 65 | } else { 66 | std::cerr << "ERROR: could not find '" << qry_token << "' in the dictionary." << std::endl; 67 | if(only_complete) { 68 | return std::make_tuple(false,qry_id,ids); 69 | } 70 | } 71 | } 72 | } 73 | return std::make_tuple(true,qry_id,ids); 74 | } 75 | 76 | static std::pair parse_query(const mapping_t& mapping, 77 | const std::string& query_str,bool only_complete = false,bool integers = false) 78 | { 79 | 80 | const auto& id_mapping = mapping.first; 81 | const auto& reverse_mapping = mapping.second; 82 | 83 | auto mapped_qry = map_to_ids(id_mapping,query_str,only_complete,integers); 84 | 85 | bool parse_ok = std::get<0>(mapped_qry); 86 | auto qry_id = std::get<1>(mapped_qry); 87 | if(parse_ok) { 88 | std::unordered_map qry_set; 89 | const auto& qids = std::get<2>(mapped_qry); 90 | for(const auto& qid : qids) { 91 | qry_set[qid] += 1; 92 | } 93 | std::vector query_tokens; 94 | for(const auto& qry_tok : qry_set) { 95 | std::vector term; 96 | term.push_back(qry_tok.first); 97 | auto rmitr = reverse_mapping.find(qry_tok.first); 98 | std::vector term_str; 99 | if(rmitr != reverse_mapping.end()) { 100 | std::string qry_str = rmitr->second; 101 | term_str.push_back(qry_str); 102 | } 103 | query_tokens.emplace_back(term,term_str,qry_tok.second); 104 | } 105 | std::sort(query_tokens.begin(),query_tokens.end()); // sort 106 | query_t q(qry_id,query_tokens); 107 | return {true,q}; 
108 | } 109 | 110 | // error 111 | query_t q; 112 | return {false,q}; 113 | } 114 | 115 | static std::vector parse_queries(const std::string& collection_dir, 116 | const std::string& query_file,bool only_complete = false) 117 | { 118 | std::vector queries; 119 | 120 | /* load the mapping */ 121 | auto mapping = load_dictionary(collection_dir); 122 | 123 | /* parse queries */ 124 | std::ifstream qfs(query_file); 125 | if(!qfs.is_open()) { 126 | std::cerr << "cannot load query file."; 127 | exit(EXIT_FAILURE); 128 | } 129 | 130 | std::string query_str; 131 | while( std::getline(qfs,query_str) ) { 132 | auto parsed_qry = parse_query(mapping,query_str,only_complete); 133 | if(parsed_qry.first) { 134 | queries.emplace_back(parsed_qry.second); 135 | } 136 | } 137 | 138 | return queries; 139 | } 140 | }; 141 | 142 | }// end namespace 143 | 144 | #endif 145 | -------------------------------------------------------------------------------- /include/surf/rank_functions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_RANK_FUNCTIONS_HPP 2 | #define SURF_RANK_FUNCTIONS_HPP 3 | 4 | #include "construct_doc_lengths.hpp" 5 | #include "surf/config.hpp" 6 | #include 7 | #include "sdsl/int_vector.hpp" 8 | #include "surf/util.hpp" 9 | 10 | using namespace sdsl; 11 | 12 | namespace surf { 13 | 14 | template 15 | struct rank_bm25 { 16 | static const double k1; 17 | static const double b; 18 | static const double epsilon_score; 19 | size_t num_docs; 20 | size_t num_terms; 21 | double avg_doc_len; 22 | double min_doc_len; 23 | sdsl::int_vector<> doc_lengths; 24 | 25 | static std::string name() { 26 | return "bm25"; 27 | } 28 | 29 | rank_bm25(){} 30 | 31 | rank_bm25& operator=(const rank_bm25&) = default; 32 | 33 | rank_bm25(cache_config& cconfig) { 34 | uint64_t num_terms; 35 | load_from_cache(num_terms, surf::KEY_COLLEN, cconfig); 36 | if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)){ 37 | surf::construct_doc_lengths(cconfig); 
38 | } 39 | load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig); 40 | num_docs = doc_lengths.size(); 41 | std::cerr<<"num_docs = "<(cconfig); 86 | } 87 | load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig); 88 | num_docs = doc_lengths.size(); 89 | std::cerr<<"num_docs = "<(cconfig); 137 | } 138 | load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig); 139 | num_docs = doc_lengths.size(); 140 | std::cerr<<"num_docs = "<(cconfig); 184 | } 185 | load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig); 186 | num_docs = doc_lengths.size(); 187 | std::cerr<<"num_docs = "< 237 | const double rank_lmds::smoothing_param = (double)t_s; 238 | 239 | template 240 | const double rank_bm25_simple_est::k1 = (double)t_k1/100.0; 241 | 242 | template 243 | const double rank_bm25_simple_est::b = (double)t_b/100.0; 244 | 245 | template 246 | const double rank_bm25_simple_est::epsilon_score = 1e-6; 247 | 248 | template 249 | const double rank_bm25::k1 = (double)t_k1/100.0; 250 | 251 | template 252 | const double rank_bm25::b = (double)t_b/100.0; 253 | 254 | template 255 | const double rank_bm25::epsilon_score = 1e-6; 256 | 257 | 258 | } // end surf namespace 259 | 260 | #endif 261 | -------------------------------------------------------------------------------- /include/surf/util.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SURF_UTIL_HPP 2 | #define SURF_UTIL_HPP 3 | 4 | #include "surf/config.hpp" 5 | #include "sdsl/io.hpp" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace surf{ 17 | 18 | bool 19 | directory_exists(std::string dir) 20 | { 21 | struct stat sb; 22 | const char* pathname = dir.c_str(); 23 | if (stat(pathname, &sb) == 0 && (S_IFDIR&sb.st_mode)) { 24 | return true; 25 | } 26 | return false; 27 | } 28 | 29 | bool 30 | file_exists(std::string file_name) 31 | { 32 | sdsl::isfstream in(file_name); 33 | if (in) { 34 | 
in.close(); 35 | return true; 36 | } 37 | return false; 38 | } 39 | 40 | bool 41 | symlink_exists(std::string file) 42 | { 43 | struct stat sb; 44 | const char* filename = file.c_str(); 45 | if (stat(filename, &sb) == 0 && (S_IFLNK&sb.st_mode) ) { 46 | return true; 47 | } 48 | return false; 49 | } 50 | 51 | void 52 | create_directory(std::string dir) 53 | { 54 | if (!directory_exists(dir)) { 55 | if (mkdir(dir.c_str(),0777) == -1) { 56 | perror("could not create directory"); 57 | exit(EXIT_FAILURE); 58 | } 59 | } 60 | } 61 | 62 | bool 63 | valid_collection(std::string collection_dir) 64 | { 65 | if (! surf::directory_exists(collection_dir)) { 66 | std::cerr << collection_dir << " is not a valid directory.\n"; 67 | return false; 68 | } else { 69 | /* make sure the necessary files are present */ 70 | if( ! surf::file_exists(collection_dir+"/"+surf::TEXT_FILENAME) || 71 | ! surf::file_exists(collection_dir+"/"+surf::DICT_FILENAME) || 72 | ! surf::file_exists(collection_dir+"/"+surf::DOCNAMES_FILENAME) ) 73 | { 74 | std::cerr << collection_dir << " does not contain a valid surf collection.\n"; 75 | std::cerr << "The files " << surf::TEXT_FILENAME << " , " << surf::DICT_FILENAME 76 | << " , " << surf::DOCNAMES_FILENAME << " have to be present" << std::endl; 77 | return false; 78 | } 79 | } 80 | return true; 81 | } 82 | 83 | 84 | 85 | sdsl::cache_config 86 | parse_collection(std::string collection_dir) 87 | { 88 | /* check if all the directories exist */ 89 | if( !surf::valid_collection(collection_dir) ) { 90 | exit(EXIT_FAILURE); 91 | } 92 | 93 | std::string index_directory = collection_dir+"/index/"; 94 | surf::create_directory(index_directory); 95 | 96 | std::string results_directory = collection_dir+"/results/"; 97 | surf::create_directory(results_directory); 98 | 99 | /* populate cache config */ 100 | sdsl::cache_config config(false,collection_dir+"/index/","SURF"); 101 | 102 | /* create symlink to text in index directory */ 103 | std::string symlink_name = 
cache_file_name(sdsl::conf::KEY_TEXT_INT,config); 104 | if( ! surf::symlink_exists(cache_file_name(sdsl::conf::KEY_TEXT_INT,config)) ) { 105 | std::string collection_file = collection_dir+"/"+surf::TEXT_FILENAME; 106 | char* col_file_absolute = realpath(collection_file.c_str(), NULL); 107 | if( symlink(col_file_absolute,symlink_name.c_str()) != 0) { 108 | perror("cannot create symlink to collection file in index directory"); 109 | exit(EXIT_FAILURE); 110 | } 111 | free(col_file_absolute); 112 | } 113 | 114 | /* register files that are present */ 115 | for(const auto& key : surf::storage_keys) { 116 | register_cache_file(key,config); 117 | } 118 | 119 | return config; 120 | } 121 | 122 | 123 | } // end of surf namespace 124 | #endif 125 | -------------------------------------------------------------------------------- /queries/trec0406-adhoc.qry: -------------------------------------------------------------------------------- 1 | 701;us oil industry history 2 | 702;pearl farming 3 | 703;us against international criminal court 4 | 704;green party politics view 5 | 705;iraq foreign debt reduction 6 | 706;control type ii diabetes 7 | 707;aspirin cancer prevention 8 | 708;decorative slate source 9 | 709;horse racing jockey weight 10 | 710;prostate cancer treatment 11 | 711;train station security measure 12 | 712;pyramid scheme 13 | 713;chesapeake bay maryland clean 14 | 714;license restriction old driver 15 | 715;schizophrenia drug 16 | 716;spam arrest sue 17 | 717;gifted talented student program 18 | 718;control acid rain 19 | 719;cruise ship damage sea life 20 | 720;federal welfare reform 21 | 721;census data application 22 | 722;iran terrorism 23 | 723;executive privilege 24 | 724;iran contra 25 | 725;low white blood cell count 26 | 726;hubble telescope repair 27 | 727;church arson 28 | 728;whale save endanger 29 | 729;whistle blower department of defense 30 | 730;gastric bypass complication 31 | 731;kurd history 32 | 732;us cheese production 33 | 733;airline 
overbooke 34 | 734;recycle success 35 | 735;afghanistan women condition 36 | 736;location bse infection 37 | 737;enron california energy crisis 38 | 738;anthrax hoax 39 | 739;habitat for humanity 40 | 740;regulate assist living maryland 41 | 741;artificial intelligence 42 | 742;hedge funds fraud protection 43 | 743;freighter ship registration 44 | 744;counterfeit id punishment 45 | 745;doomsday cult 46 | 746;outsource job india 47 | 747;library computer oversight 48 | 748;nuclear reactor type 49 | 749;puerto rico state 50 | 750;john edwards women issue 51 | 751;scrabble player 52 | 752;dam removal 53 | 753;bully prevention program 54 | 754;domestic adoption law 55 | 755;scotland highland games 56 | 756;volcanic activity 57 | 757;mural 58 | 758;embryonic stem cell 59 | 759;civil war battle reenactment 60 | 760;america muslim mosque school 61 | 761;problem of hmong immigrant 62 | 762;history of physician in america 63 | 763;hunting death 64 | 764;increase mass transit use 65 | 765;ephedra ma huang death 66 | 766;diamond smuggle 67 | 767;pharmacist license requirement 68 | 768;women in state legislature 69 | 769;kroll associate employee 70 | 770;kyrgyzstan united states relations 71 | 771;deform leopard frog 72 | 772;flag display rule 73 | 773;pennsylvania slot machine gamble 74 | 774;cause of homeless 75 | 775;commercial candy maker 76 | 776;magnet school success 77 | 777;hybrid alternative fuel car 78 | 778;golden ratio 79 | 779;javelina range and description 80 | 780;arable land 81 | 781;squirrel control and protection 82 | 782;orange variety season 83 | 783;school mercury poison 84 | 784;mersenne prime 85 | 785;ivory billed woodpecker 86 | 786;yew tree 87 | 787;sunflower cultivation 88 | 788;reverse mortgage 89 | 789;abandoned mine reclamation 90 | 790;women rights in saudi arabia 91 | 791;gullah geechee language culture 92 | 792;social security means test 93 | 793;bagpipe band 94 | 794;pet therapy 95 | 795;notable cock spaniel 96 | 796;blue grass music festival 
history 97 | 797;reintroduction of gray wolf 98 | 798;massachusetts textile mill 99 | 799;animal in alzheimer research 100 | 800;ovarian cancer treatment 101 | 801;kudzu pueraria lobata 102 | 802;volcano eruption global temperature 103 | 803;may day 104 | 804;ban on human clone 105 | 805;identity theft passport 106 | 806;doctor without border 107 | 807;sugar tariff rate quota 108 | 808;north korea counterfeit 109 | 809;wetland wastewater treatment 110 | 810;timeshare resale 111 | 811;handwriting recognition 112 | 812;total knee replacement surgery 113 | 813;atlantic intracoastal waterway 114 | 814;johnstown flood 115 | 815;coast guard rescue 116 | 816;usaid assistance to galapago 117 | 817;sports stadium name rights 118 | 818;chaco culture national park 119 | 819;1890 census 120 | 820;import fire ant 121 | 821;internet work at home scam 122 | 822;custer last stand 123 | 823;continue care retirement community 124 | 824;civil air patrol 125 | 825;national guard involve in iraq 126 | 826;florida seminole indian 127 | 827;hidden markov model hmm 128 | 828;secret shop 129 | 829;spain civil war support 130 | 830;model railroad 131 | 831;dulles airport security 132 | 832;labor union activity 133 | 833;iceland government 134 | 834;global position system earthquake 135 | 835;big dig pork 136 | 836;illegal immigrant wages 137 | 837;eskimo history 138 | 838;urban suburban coyote 139 | 839;textile dye technique 140 | 840;geyser 141 | 841;camel north america 142 | 842;david mccullough 143 | 843;pol pot 144 | 844;segment duplicate 145 | 845;new jersey tomato 146 | 846;heredity and obese 147 | 847;portugal world war ii 148 | 848;radio station call letters 149 | 849;scale vector graphics 150 | 850;mississippi river flood 151 | -------------------------------------------------------------------------------- /queries/trec2005-efficiency-10.qry: -------------------------------------------------------------------------------- 1 | 70;bentley automobile 2 | 211;downtown orlando florida 
3 | 257;shannyn sossamon 4 | 259;theoriginal rainbow cone 5 | 450;heterogeneous uterus 6 | 527;john steinbeck 7 | 591;diamond blackfan anemia 8 | 626;j boog 9 | 717;pregnancy 10 | 736;america fidelity insurance company 11 | -------------------------------------------------------------------------------- /queries/trec2005-efficiency-100.qry: -------------------------------------------------------------------------------- 1 | 70;bentley automobile 2 | 211;downtown orlando florida 3 | 257;shannyn sossamon 4 | 259;theoriginal rainbow cone 5 | 450;heterogeneous uterus 6 | 527;john steinbeck 7 | 591;diamond blackfan anemia 8 | 626;j boog 9 | 717;pregnancy 10 | 736;america fidelity insurance company 11 | 775;can i drive a motorcycle wit a driving permit 12 | 892;reverse phone look up 13 | 966;katie holmes 14 | 1113;bella frisk 15 | 1131;real estate value 16 | 1180;pc to xbox interface for steer wheel 17 | 1209;in living color jamie foxx wanda 18 | 1235;potterybarn 19 | 1243;canne watches 20 | 1248;samsung lcd flat screen 21 | 1253;rom 22 | 1264;northwestern memorial hospital 23 | 1286;boxer breeder in new york 24 | 1412;cheap ticket 25 | 1468;texas news paper 26 | 1550;warre 27 | 1551;stocks 28 | 1613;driving directions 29 | 1661;cheat code 30 | 1796;john cena 31 | 1847;gameboy advance fire emblem walkthrough 32 | 1903;hotel in wright city missouri 33 | 1969;bead jewlry 34 | 2118;gienn county child sopport 35 | 2146;mcfarlane art 36 | 2217;ca lottery numbers 37 | 2271;field s 38 | 2321;weather by the hour 39 | 2343;house for rent in charlotte nc 40 | 2348;john dillon day 41 | 2383;bad homburg germany 42 | 2388;infiltration of liver 43 | 2449;bank of america 44 | 2550;jewish camp in queens 45 | 2568;cheap rental apartment in miami 46 | 2602;avon ohio school 47 | 2669;las vega show 48 | 2761;goldie hawn 49 | 2825;mike jones 50 | 2826;translator 51 | 2828;leon county school 52 | 3064;jagermeister 53 | 3068;recipy 54 | 3091;provo lds temple prayer roll 55 | 3161;ebay 56 | 
3234;frontier hotel and casino 57 | 3281;sympathy john r 58 | 3398;people 59 | 3421;boutique in key west florida 60 | 3443;text letter of condolence of a mother 61 | 3628;animal shelter clermont 62 | 3764;abercrombie 63 | 3828;hen mankell 64 | 3845;science magazine 65 | 3895;cinderella man movie 66 | 3942;heart candle tin 67 | 3950;phoenix arizona state death record 68 | 3972;audi usa 69 | 4001;baby falcon and falconry equipment 70 | 4151;shane hmiel 71 | 4153;luca county auditor 72 | 4172;all marine 73 | 4212;copycat recipe 74 | 4234;utah chat rooms 75 | 4290;hairstyle 76 | 4379;father s day idea 77 | 4521;birth control pill 78 | 4526;nickelodean hotel 79 | 4580;johnny depp 80 | 4684;alaskaair 81 | 4688;spiderman vs venom 82 | 4720;deconsal 83 | 4802;baby einsten 84 | 4829;nietzche 85 | 4846;lake geneva boat 86 | 4855;halle berry 87 | 4942;map of saudi arabia 88 | 5001;tcc 89 | 5005;north fork bank 90 | 5016;train station 91 | 5023;the honeymoon movie 92 | 5036;disney free fr 93 | 5183;key west 94 | 5209;d d home medium 95 | 5241;marijuana plant 96 | 5274;very yonug children model art gallery 97 | 5302;ny lottery 98 | 5332;zona girdle zone 99 | 5336;prom dress 100 | 5340;history of communication 101 | -------------------------------------------------------------------------------- /queries/trec2006-efficiency-10.qry: -------------------------------------------------------------------------------- 1 | 68;slight anemia high tibc 2 | 250;medical board of georgia administrative action 3 | 503;leading export in kiribati 4 | 561;why is is important to study primary source document 5 | 683;sample cosmetic product recall plan 6 | 918;phone number for middle town ri dmv 7 | 1153;campo band of mission indian 8 | 1178;spring mountain ranch las vega nv 9 | 1206;ny city employment opportunity 10 | 1394;prevent gas in bottlefe baby 11 | -------------------------------------------------------------------------------- /queries/trec2006-efficiency-100.qry: 
-------------------------------------------------------------------------------- 1 | 68;slight anemia high tibc 2 | 250;medical board of georgia administrative action 3 | 503;leading export in kiribati 4 | 561;why is is important to study primary source document 5 | 683;sample cosmetic product recall plan 6 | 918;phone number for middle town ri dmv 7 | 1153;campo band of mission indian 8 | 1178;spring mountain ranch las vega nv 9 | 1206;ny city employment opportunity 10 | 1394;prevent gas in bottlefe baby 11 | 1465;hematology competent test 12 | 1734;membrane in the egg 13 | 1847;va medical record form 14 | 1984;samhsa lidocaine 15 | 2007;textile fiber identification act 16 | 2127;animal that have vertebrae in there back different kind 17 | 2176;special needs health insurance 18 | 2198;julliard string quartet schedule 19 | 2203;criminal law 20 | 2244;operate room mistake 21 | 2388;passport form 22 | 2440;pre divorce stress 23 | 2653;georgia seasonal fruits and vegetable 24 | 2875;snohomish county evidence department 25 | 2896;ac 61 107 26 | 3219;us government parkersburg west virginia 27 | 3414;wac code of washington 28 | 3605;advocate for resident rights 29 | 3622;city of milwaukee assessment 30 | 3768;irs mileage expense rate 31 | 3910;mark d shrive is locate gene 32 | 3931;be 11 government form 33 | 4257;apd albuquerque 34 | 4314;pell federal grant 35 | 4411;plan first medicaid 36 | 4415;nys math exam 37 | 4448;how to make steel form machine 38 | 4523;employment opportunity with the us district court of arizona 39 | 4572;mental health institute cherokee iowa 40 | 4708;washing powder ingest 41 | 4712;grass tillage 42 | 4720;treatment for narcissism 43 | 4727;five level of muscle contraction 44 | 4740;clare b connaughton 45 | 4875;tim cord 46 | 4956;ca gov claim 47 | 4983;lewis and clark item sent to jefferson at monticello 48 | 5013;history of subway 49 | 5017;los alto hill town government 50 | 5138;blue winged warbler 51 | 5258;test for ny city firefighter 
application 52 | 5884;check insurance with ohio fair plan 53 | 5927;were there any lives lost in the 1811 and 1812 earth quake that form reelfoot lake 54 | 5941;citgo petroleum corporation 55 | 5971;what should a freshmen do to prepare to be a sophomore in high school 56 | 6062;culpeper county dogs adoption 57 | 6161;hov lane compliance 58 | 6480;complicated migraine 59 | 6785;somerset refinery co 60 | 6811;operate without reasonable control citation 61 | 7005;nutrition and food service 62 | 7078;webster county mo sales tax 63 | 7172;oregon per 64 | 7188;nurse practitioner in legislature 65 | 7211;stony creek virginia fishing 66 | 7450;s c rule on transcription of court record 67 | 7552;trask venture fund 68 | 7635;cycloheximide 69 | 7712;bus from jfk to lga 70 | 7956;brandon mayfield 71 | 8018;section 8 apartments in terre haute 72 | 8847;pennsylvania coin 73 | 8946;channel stuffing and consignment sales and fasb 74 | 8961;polycystic kidney and pericardial effusion 75 | 9061;workforce development ma 76 | 9178;irs head of household 77 | 9275;prisoner of war life 78 | 9523;business by federal id number 79 | 9527;home equity loan for poor credit 80 | 9535;finger injury statistics 81 | 9540;houston anderson thyme surgery 82 | 9726;space stars 83 | 9730;my history of real estate tax nyc 84 | 9839;teacher edition books online 85 | 9840;connecticut attorney general 86 | 9866;what schedule is methadone 87 | 9950;gemini program nasa 88 | 9957;childhood leukemia 89 | 9977;infertile after iud 90 | 9991;fuse panel diagram for old old original tennessee homeowner obtain 91 | 10021;state of michigan state bird 92 | 10063;regulatory system quiz 93 | 10139;epa freon test 94 | 10231;the waterpocket fold in capitol reef national park 95 | 10395;cobalt urine level 96 | 10397;san francisco maritime park management plan 97 | 10470;california worker compensation law 98 | 11060;lancaster va 99 | 11077;toddler self regulation 100 | 11155;home for sale okalona ms 101 | 
-------------------------------------------------------------------------------- /queries/trec2006-efficiency.qry: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/queries/trec2006-efficiency.qry -------------------------------------------------------------------------------- /results/trec8_wtdup_stat.R: -------------------------------------------------------------------------------- 1 | data <- read.csv2("trec8_wtdup_stat.txt",sep=",",header=F) 2 | 3 | pdf("trec8_wtdup_stat.pdf") 4 | 5 | plot(data$V1,cumsum(data$V2*data$V1)/crossprod(data$V2,data$V1),ylim=c(0,1),xlab="Node size",ylab="Ratio of covered elements in DUP") 6 | 7 | #dev.copy2pdf(file="trec8_wtdup_stat.pdf") 8 | 9 | dev.off() 10 | -------------------------------------------------------------------------------- /results/trec8_wtdup_stat.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/results/trec8_wtdup_stat.pdf -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | !.cpp 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /src/doc_lengths.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "surf/util.hpp" 12 | #include "sdsl/config.hpp" 13 | #include "surf/construct_doc_lengths.hpp" 14 | /* command line options of doc_lengths */ 15 | typedef struct cmdargs { 16 | std::string collection_dir; 17 | } cmdargs_t; 18 | /* Prints the command line synopsis to stdout. Only -c is listed because it is the only option parse_args() hands to getopt(); the former text also advertised -p and -r, which this tool does not accept. */ 19 | void 20 | print_usage(char* program) 21 | { 22 | fprintf(stdout,"%s -c\n",program); 23 | fprintf(stdout,"where\n"); 24 | fprintf(stdout," 
-c : the directory the collection is stored.\n"); 25 | }; 26 | /* Parses argv into a cmdargs_t; the process exits if no collection directory (-c) was given. */ 27 | cmdargs_t 28 | parse_args(int argc,char* const argv[]) 29 | { 30 | cmdargs_t args; 31 | int op; 32 | args.collection_dir = ""; 33 | while ((op=getopt(argc,argv,"c:")) != -1) { 34 | switch (op) { 35 | case 'c': 36 | args.collection_dir = optarg; 37 | break; 38 | case '?': 39 | default: 40 | print_usage(argv[0]); 41 | } 42 | } 43 | if (args.collection_dir=="") { 44 | std::cerr << "Missing command line parameters.\n"; 45 | print_usage(argv[0]); 46 | exit(EXIT_FAILURE); 47 | } 48 | return args; 49 | } 50 | /* Loads (constructing them first if they are not cached) the document lengths of the collection, sorts them and prints a "count;len" table. */ 51 | int main(int argc,char* const argv[]) 52 | { 53 | /* parse command line */ 54 | cmdargs_t args = parse_args(argc,argv); 55 | 56 | /* parse repo */ 57 | auto cc = surf::parse_collection(args.collection_dir); 58 | char tmp_str[256] = {0}; 59 | strncpy(tmp_str,args.collection_dir.c_str(),sizeof(tmp_str)-1); /* leave the last zero byte untouched: strncpy() does not terminate when the source fills the buffer */ 60 | std::string base_name = basename(tmp_str); 61 | 62 | sdsl::int_vector<> doc_lengths; 63 | if (!sdsl::cache_file_exists(surf::KEY_DOC_LENGTHS, cc)){ 64 | surf::construct_doc_lengths(cc); 65 | } 66 | sdsl::load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cc); 67 | 68 | std::sort(doc_lengths.begin(),doc_lengths.end()); 69 | 70 | std::cout << "count;len\n"; if (doc_lengths.size() == 0) { /* nothing to aggregate; doc_lengths[0] below would be out of range */ return EXIT_SUCCESS; } 71 | auto cur = doc_lengths[0]; 72 | size_t count = 1; 73 | for(size_t i=1;i 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "surf/query.hpp" 12 | #include "sdsl/config.hpp" 13 | #include "surf/indexes.hpp" 14 | #include "surf/query_parser.hpp" 15 | #include "surf/comm.hpp" 16 | #include "surf/phrase_parser.hpp" 17 | #include "surf/rank_functions.hpp" 18 | 19 | #include "zmq.hpp" 20 | 21 | typedef struct cmdargs { 22 | std::string collection_dir; 23 | std::string port; 24 | bool load_dictionary; 25 | } cmdargs_t; 26 | 27 | void 28 | print_usage(char* program) 29 | { 30 | fprintf(stdout,"%s -c -p -r\n",program); 31 | fprintf(stdout,"where\n"); 32 | fprintf(stdout," -c : the 
directory the collection is stored.\n"); 33 | fprintf(stdout," -p : the port the daemon is running on.\n"); 34 | fprintf(stdout," -r : do not load the dictionary.\n"); 35 | }; 36 | /* Parses argv into a cmdargs_t. Defaults: port 12345, dictionary loaded. The process exits if no collection directory (-c) was given. */ 37 | cmdargs_t 38 | parse_args(int argc,char* const argv[]) 39 | { 40 | cmdargs_t args; 41 | int op; 42 | args.collection_dir = ""; 43 | args.port = std::to_string(12345); 44 | args.load_dictionary = true; 45 | while ((op=getopt(argc,argv,"c:p:r")) != -1) { 46 | switch (op) { 47 | case 'c': 48 | args.collection_dir = optarg; 49 | break; 50 | case 'p': 51 | args.port = optarg; 52 | break; 53 | case 'r': 54 | args.load_dictionary = false; 55 | break; 56 | case '?': 57 | default: 58 | print_usage(argv[0]); 59 | } 60 | } 61 | if (args.collection_dir=="") { 62 | std::cerr << "Missing command line parameters.\n"; 63 | print_usage(argv[0]); 64 | exit(EXIT_FAILURE); 65 | } 66 | return args; 67 | } 68 | /* entry point of the query daemon: loads the index of the collection and answers requests on a ZMQ_REP socket */ 69 | int main(int argc,char* const argv[]) 70 | { 71 | using clock = std::chrono::high_resolution_clock; 72 | /* parse command line */ 73 | cmdargs_t args = parse_args(argc,argv); 74 | 75 | /* parse repo */ 76 | auto cc = surf::parse_collection(args.collection_dir); 77 | char tmp_str[256] = {0}; 78 | strncpy(tmp_str,args.collection_dir.c_str(),sizeof(tmp_str)-1); /* leave the last zero byte untouched: strncpy() does not terminate when the source fills the buffer, and basename() needs a C string */ 79 | std::string base_name = basename(tmp_str); 80 | 81 | /* parse queries */ 82 | surf::query_parser::mapping_t term_map; 83 | if(args.load_dictionary) { 84 | std::cout << "Loading dictionary and creating term map." << std::endl; 85 | term_map = surf::query_parser::load_dictionary(args.collection_dir); 86 | } 87 | 88 | /* define types */ 89 | using surf_index_t = INDEX_TYPE; 90 | std::string index_name = IDXNAME; 91 | 92 | /* load the index */ 93 | std::cout << "Loading index." 
<< std::endl; 94 | surf_index_t index; 95 | auto load_start = clock::now(); 96 | construct(index, "", cc, 0); 97 | index.load(cc); 98 | auto load_stop = clock::now(); 99 | auto load_time_sec = std::chrono::duration_cast(load_stop-load_start); 100 | std::cout << "Index loaded in " << load_time_sec.count() << " seconds." << std::endl; 101 | 102 | 103 | /* daemon mode */ 104 | { 105 | std::cout << "Starting daemon mode on port " << args.port << std::endl; 106 | zmq::context_t context(1); 107 | zmq::socket_t server(context, ZMQ_REP); 108 | server.bind(std::string("tcp://*:"+args.port).c_str()); 109 | 110 | while(true) { 111 | zmq::message_t request; 112 | /* wait for msg */ 113 | server.recv(&request); 114 | surf_qry_request* surf_req = (surf_qry_request*) request.data(); 115 | 116 | if(surf_req->type == REQ_TYPE_QUIT) { 117 | std::cout << "Quitting..." << std::endl; 118 | break; 119 | } 120 | 121 | /* perform query */ 122 | auto qry_start = clock::now(); 123 | 124 | surf::query_t prased_query; 125 | bool parse_ok = false; 126 | 127 | if(surf_req->phrases) { 128 | #ifdef PHRASE_SUPPORT 129 | const auto& id_mapping = term_map.first; 130 | const auto& reverse_mapping = term_map.second; 131 | auto qry_mapping = surf::query_parser::map_to_ids(id_mapping, 132 | std::string(surf_req->qry_str),true,surf_req->int_qry); 133 | if(std::get<0>(qry_mapping)) { 134 | auto qid = std::get<1>(qry_mapping); 135 | auto qry_ids = std::get<2>(qry_mapping); 136 | prased_query = surf::phrase_parser::phrase_segmentation(index.m_csa,qry_ids,reverse_mapping, 137 | surf_req->phrase_threshold); 138 | std::get<0>(prased_query) = qid; 139 | parse_ok = true; 140 | } 141 | #endif 142 | } else { 143 | auto qry = surf::query_parser::parse_query(term_map, 144 | std::string(surf_req->qry_str), 145 | true, 146 | surf_req->int_qry); 147 | if(qry.first) { 148 | prased_query = qry.second; 149 | parse_ok = true; 150 | } 151 | } 152 | 153 | if(!parse_ok) { 154 | // error parsing the qry. 
send back error 155 | surf_time_resp surf_resp; 156 | surf_resp.status = REQ_PARSE_ERROR; 157 | surf_resp.req_id = surf_req->id; 158 | zmq::message_t reply (sizeof(surf_time_resp)); 159 | memcpy(reply.data(),&surf_resp,sizeof(surf_time_resp)); 160 | server.send(reply); 161 | std::cout << "ERROR IN QUERY PARSING PROCESS. SKIPPING QUERY" << std::endl; 162 | continue; 163 | } 164 | 165 | /* (1) parse qry terms */ 166 | bool profile = false; 167 | if(surf_req->mode == REQ_MODE_PROFILE) { 168 | profile = true; 169 | } 170 | bool ranked_and = false; 171 | if(surf_req->type == REQ_TYPE_QRY_AND) { 172 | ranked_and = true; 173 | } 174 | 175 | /* (2) query the index */ 176 | auto qry_id = std::get<0>(prased_query); 177 | auto qry_tokens = std::get<1>(prased_query); 178 | auto search_start = clock::now(); 179 | auto results = index.search(qry_tokens,surf_req->k,ranked_and,profile); 180 | auto search_stop = clock::now(); 181 | auto search_time = std::chrono::duration_cast(search_stop-search_start); 182 | 183 | auto qry_stop = clock::now(); 184 | auto query_time = std::chrono::duration_cast(qry_stop-qry_start); 185 | 186 | /* (3a) output to qry to console */ 187 | std::cout << "REQ=" << std::left << std::setw(10) << surf_req->id << " " 188 | << " k=" << std::setw(5) << surf_req->k 189 | << " QID=" << std::setw(5) << qry_id 190 | << " TIME=" << std::setw(7) << query_time.count()/1000.0 191 | << " AND=" << ranked_and 192 | << " PHRASE=" << surf_req->phrases; 193 | std::cout << " ["; 194 | if(args.load_dictionary) { 195 | for(const auto& token : qry_tokens) { 196 | if(token.token_ids.size() > 1) { 197 | // phrase 198 | std::cout << "("; 199 | for(const auto tstr : token.token_strs) { 200 | std::cout << tstr << " "; 201 | } 202 | std::cout << ") "; 203 | } else { 204 | std::cout << token.token_strs[0] << " "; 205 | } 206 | } 207 | } else { 208 | for(const auto& token : qry_tokens) { 209 | if(token.token_ids.size() > 1) { 210 | // phrase 211 | std::cout << "("; 212 | for(const auto 
tid : token.token_ids) { 213 | std::cout << tid << " "; 214 | } 215 | std::cout << ") "; 216 | } else { 217 | std::cout << token.token_ids[0] << " "; 218 | } 219 | } 220 | } 221 | std::cout << "]" << std::endl; 222 | 223 | /* (3) create answer and send */ 224 | if(!surf_req->output_results) { 225 | surf_time_resp surf_resp = {}; /* zeroed: no uninitialised bytes on the wire, and the name fields below stay terminated */ 226 | surf_resp.status = REQ_RESPONE_OK; 227 | strncpy(surf_resp.index,index_name.c_str(),sizeof(surf_resp.index)-1); 228 | strncpy(surf_resp.collection,base_name.c_str(),sizeof(surf_resp.collection)-1); 229 | strncpy(surf_resp.ranker,surf_index_t::ranker_type::name().c_str(),sizeof(surf_resp.ranker)-1); 230 | surf_resp.req_id = surf_req->id; 231 | surf_resp.k = surf_req->k; 232 | surf_resp.qry_id = qry_id; 233 | surf_resp.qry_len = qry_tokens.size(); 234 | surf_resp.result_size = results.list.size(); 235 | surf_resp.qry_time = query_time.count(); 236 | surf_resp.search_time = search_time.count(); 237 | surf_resp.wt_search_space = results.wt_search_space; 238 | surf_resp.wt_nodes = results.wt_nodes; 239 | surf_resp.postings_evaluated = results.postings_evaluated; 240 | surf_resp.postings_total = results.postings_total; 241 | 242 | zmq::message_t reply (sizeof(surf_time_resp)); 243 | memcpy(reply.data(),&surf_resp,sizeof(surf_time_resp)); 244 | server.send (reply); 245 | } else { /* result mode: one uint64_t count followed by (doc_id, score) pairs stored as doubles */ 246 | size_t res_size = results.list.size()*2*sizeof(double) + sizeof(uint64_t); 247 | zmq::message_t zmq_results (res_size); 248 | surf_results* sr = (surf_results*)(zmq_results.data()); 249 | sr->size = results.list.size(); 250 | for(size_t i=0;i<results.list.size();i++) { 251 | sr->data[i*2] = results.list[i].doc_id; 252 | sr->data[i*2+1] = results.list[i].score; 253 | } 254 | server.send (zmq_results); 255 | } 256 | } 257 | } 258 | 259 | 260 | return EXIT_SUCCESS; 261 | } 262 | -------------------------------------------------------------------------------- /src/surf_index.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "sdsl/config.hpp" 3 | #include 
"surf/indexes.hpp" 4 | #include "surf/util.hpp" 5 | /* command line options of surf_index */ 6 | typedef struct cmdargs { 7 | std::string collection_dir; 8 | bool print_memusage; 9 | } cmdargs_t; 10 | /* prints the command line synopsis to stdout */ 11 | void 12 | print_usage(char* program) 13 | { 14 | fprintf(stdout,"%s -c -m\n",program); 15 | fprintf(stdout,"where\n"); 16 | fprintf(stdout," -c : the directory the collection is stored.\n"); 17 | fprintf(stdout," -m : print memory usage.\n"); 18 | }; 19 | /* Parses argv into a cmdargs_t; the process exits if no collection directory (-c) was given. */ 20 | cmdargs_t 21 | parse_args(int argc,char* const argv[]) 22 | { 23 | cmdargs_t args; 24 | int op; 25 | args.collection_dir = ""; 26 | args.print_memusage = false; 27 | while ((op=getopt(argc,argv,"c:m")) != -1) { 28 | switch (op) { 29 | case 'c': 30 | args.collection_dir = optarg; 31 | break; 32 | case 'm': 33 | args.print_memusage = true; 34 | break; 35 | case '?': 36 | default: 37 | print_usage(argv[0]); 38 | } 39 | } 40 | if (args.collection_dir=="") { 41 | std::cerr << "Missing command line parameters.\n"; 42 | print_usage(argv[0]); 43 | exit(EXIT_FAILURE); 44 | } 45 | return args; 46 | } 47 | 48 | int main(int argc,char* const argv[]) 49 | { 50 | using clock = std::chrono::high_resolution_clock; 51 | /* parse command line */ 52 | cmdargs_t args = parse_args(argc,argv); 53 | 54 | /* parse repo */ 55 | sdsl::cache_config cc = surf::parse_collection(args.collection_dir); 56 | std::cout<<"parse collections"<(build_stop-build_start); 71 | std::cout << "Index built in " << build_time_sec.count() << " seconds." 
<< std::endl; 72 | 73 | /* visualize space usage */ 74 | index.load(cc); 75 | std::cout<<"Write structure"<(index,vofs); 78 | 79 | /* print mem usage */ 80 | if(args.print_memusage) { 81 | index.mem_info(); 82 | } 83 | 84 | return EXIT_SUCCESS; 85 | } 86 | -------------------------------------------------------------------------------- /src/surf_query.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include "surf/comm.hpp" 13 | #include "surf/util.hpp" 14 | #include "surf/query_parser.hpp" 15 | 16 | #include "zmq.hpp" 17 | /* command line options of the query client; print_usage() describes the flag behind each field */ 18 | typedef struct cmdargs { 19 | std::string host; 20 | std::string query_file; 21 | uint64_t k; 22 | uint64_t runs; 23 | bool profile; 24 | bool quit; 25 | bool ranked_and; 26 | bool phrases; 27 | double phrase_threshold; 28 | bool output_results; 29 | bool integer_mode; 30 | std::string collection_dir; 31 | } cmdargs_t; 32 | /* prints the command line synopsis to stdout */ 33 | void 34 | print_usage(char* program) 35 | { 36 | fprintf(stdout,"%s -h -q -k -r -p -P -s -a -R -i \n",program); 37 | fprintf(stdout,"where\n"); 38 | fprintf(stdout," -h : host of the daemon.\n"); 39 | fprintf(stdout," -q : the queries to be performed.\n"); 40 | fprintf(stdout," -k : the top-k documents to be retrieved for each query.\n"); 41 | fprintf(stdout," -r : the number of runs.\n"); 42 | fprintf(stdout," -R : output results only\n"); 43 | fprintf(stdout," -p : run queries in profile mode.\n"); 44 | fprintf(stdout," -P : run queries with phrase parsing enabled and threshold .\n"); 45 | fprintf(stdout," -s : stop the daemon after queries are processed.\n"); 46 | fprintf(stdout," -a : perform ranked AND instead of ranked OR.\n"); 47 | fprintf(stdout," -i : perform dict lookup at the client from .\n"); 48 | }; 49 | /* Parses argv into a cmdargs_t. Defaults are assigned first; the process exits if no query file (-q) was given. */ 50 | cmdargs_t 51 | parse_args(int argc,char* const argv[]) 52 | { 53 | cmdargs_t args; 54 | int op; 55 | args.host = 
"127.0.0.1:12345"; 56 | args.query_file = ""; 57 | args.k = 10; 58 | args.runs = 3; 59 | args.profile = false; 60 | args.quit = false; 61 | args.ranked_and = false; 62 | args.phrases = false; 63 | args.phrase_threshold = 0.0f; 64 | args.output_results = false; 65 | args.integer_mode = false; 66 | while ((op=getopt(argc,argv,"r:h:q:k:psaP:Ri:")) != -1) { 67 | switch (op) { 68 | case 'r': 69 | args.runs = std::strtoul(optarg,NULL,10); 70 | break; 71 | case 'h': 72 | args.host = optarg; 73 | break; 74 | case 'p': 75 | args.profile = true; 76 | break; 77 | case 'P': /* -P enables phrase parsing and carries the threshold as its argument */ 78 | args.phrases = true; 79 | args.phrase_threshold = std::strtod(optarg,NULL); 80 | break; 81 | case 's': 82 | args.quit = true; 83 | break; 84 | case 'a': 85 | args.ranked_and = true; 86 | break; 87 | case 'R': 88 | args.output_results = true; 89 | break; 90 | case 'q': 91 | args.query_file = optarg; 92 | break; 93 | case 'k': 94 | args.k = std::strtoul(optarg,NULL,10); 95 | break; 96 | case 'i': /* -i switches to integer mode and names the collection whose dictionary is used */ 97 | args.integer_mode = true; 98 | args.collection_dir = optarg; 99 | break; 100 | case '?': 101 | default: 102 | print_usage(argv[0]); /* NOTE(review): an unknown option only prints the usage; parsing continues */ 103 | } 104 | } 105 | if (args.query_file=="") { 106 | std::cerr << "Missing command line parameters.\n"; 107 | print_usage(argv[0]); 108 | exit(EXIT_FAILURE); 109 | } 110 | return args; 111 | } 112 | /* entry point of the query client */ 113 | int main(int argc,char* const argv[]) 114 | { 115 | using clock = std::chrono::high_resolution_clock; 116 | 117 | /* parse command line */ 118 | cmdargs_t args = parse_args(argc,argv); 119 | 120 | /* load queries from disk */ 121 | std::cerr << "Loading queries from disk." 
<< std::endl; 122 | std::ifstream qfs(args.query_file); 123 | std::string qry_str; 124 | std::vector queries; 125 | while(std::getline(qfs,qry_str)) { 126 | if(qry_str.size() < MAX_QRY_LEN) { /* lines of MAX_QRY_LEN characters or more are dropped without a message */ 127 | queries.push_back(qry_str); 128 | } 129 | } 130 | /* integer mode (-i): the query terms are mapped to ids here at the client, using the dictionary of the given collection */ 131 | if(args.integer_mode) { 132 | surf::parse_collection(args.collection_dir); // makes sure dir is valid 133 | std::cout << "Loading dictionary and creating term map." << std::endl; 134 | auto term_map = surf::query_parser::load_dictionary(args.collection_dir); 135 | const auto& id_mapping = term_map.first; 136 | std::vector mapped_queries; 137 | for(auto& query: queries) { 138 | auto qry_mapping = surf::query_parser::map_to_ids(id_mapping,query,true,false); 139 | if(std::get<0>(qry_mapping)) { 140 | auto qid = std::get<1>(qry_mapping); 141 | auto qry_ids = std::get<2>(qry_mapping); 142 | std::string new_qry_str; 143 | new_qry_str += std::to_string(qid) + ";"; 144 | for(size_t i=0;i(reply.data()); 219 | 220 | auto req_stop = clock::now(); 221 | auto req_time = std::chrono::duration_cast(req_stop-req_start); 222 | /* a reply carrying a different request id is reported on stderr but still processed */ 223 | if(surf_resp->req_id != surf_req.id) { 224 | std::cerr << "ERROR: got response for wrong request id!" 
<< std::endl; 225 | } 226 | 227 | if(surf_resp->status != REQ_PARSE_ERROR) { 228 | /* output */ 229 | std::cout << surf_resp->qry_id << ";" 230 | << surf_resp->collection << ";" 231 | << surf_resp->ranker << ";" 232 | << surf_resp->index << ";" 233 | << (int)qry_mode << ";" 234 | << surf_resp->k << ";" 235 | << surf_resp->qry_len << ";" 236 | << surf_resp->result_size << ";" 237 | << surf_resp->qry_time << ";" 238 | << surf_resp->search_time << ";" 239 | << surf_resp->wt_search_space << ";" 240 | << surf_resp->wt_nodes << ";" 241 | << surf_resp->postings_evaluated << ";" 242 | << surf_resp->postings_total << ";" 243 | << req_time.count() << std::endl; 244 | } else { 245 | std::cerr << "Error processing query '" << query << "'" << std::endl; 246 | } 247 | } else { 248 | zmq::message_t output; 249 | socket.recv (&output); 250 | surf_results* sr = (surf_results*)(output.data()); 251 | for(size_t j=0;jsize;j++) { 252 | std::cout << "(" << j+1 << ") : " 253 | << (uint64_t)sr->data[j*2] 254 | << " - " 255 | << sr->data[j*2+1] << std::endl; 256 | } 257 | } 258 | } 259 | } 260 | 261 | // stop the daemon 262 | if(args.quit) { 263 | surf_qry_request surf_req; 264 | surf_req.type = REQ_TYPE_QUIT; 265 | zmq::message_t request(sizeof(surf_qry_request)); 266 | memcpy ((void *) request.data (), &surf_req, sizeof(surf_qry_request)); 267 | socket.send (request); 268 | } 269 | 270 | 271 | return EXIT_SUCCESS; 272 | } 273 | -------------------------------------------------------------------------------- /src/surf_search.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include "surf/query.hpp" 11 | #include "sdsl/config.hpp" 12 | #include "surf/indexes.hpp" 13 | #include "surf/query_parser.hpp" 14 | 15 | typedef struct cmdargs { 16 | std::string collection_dir; 17 | std::string query_file; 18 | uint64_t k; 19 | } cmdargs_t; 20 | 21 | 
/* surf_search.cpp print_usage: writes the synopsis and one help line per option to stdout. Only -c, -q and -k are listed because the getopt() option string in parse_args() is c:q:k: and the synopsis used to advertise a -o switch this tool never parses. */ void 22 | print_usage(char* program) 23 | { 24 | fprintf(stdout,"%s -c -q -k \n",program); 25 | fprintf(stdout,"where\n"); 26 | fprintf(stdout," -c : the directory the collection is stored.\n"); 27 | fprintf(stdout," -q : the queries to be performed.\n"); 28 | fprintf(stdout," -k : the top-k documents to be retrieved for each query.\n"); 29 | }; 30 | 31 | /* parse_args: fills a cmdargs_t from argv via getopt; k defaults to 10. An unknown option only prints the usage text and parsing continues. The process exits with EXIT_FAILURE when -c or -q is still empty afterwards. NOTE(review): the -k value is not validated, strtoul() yields 0 for non-numeric input. */ cmdargs_t 32 | parse_args(int argc,char* const argv[]) 33 | { 34 | cmdargs_t args; 35 | int op; 36 | args.collection_dir = ""; 37 | args.query_file = ""; 38 | args.k = 10; 39 | while ((op=getopt(argc,argv,"c:q:k:")) != -1) { 40 | switch (op) { 41 | case 'c': 42 | args.collection_dir = optarg; 43 | break; 44 | case 'q': 45 | args.query_file = optarg; 46 | break; 47 | case 'k': 48 | args.k = std::strtoul(optarg,NULL,10); 49 | break; 50 | case '?': 51 | default: 52 | print_usage(argv[0]); 53 | } 54 | } 55 | if (args.collection_dir==""||args.query_file=="") { 56 | std::cerr << "Missing command line parameters.\n"; 57 | print_usage(argv[0]); 58 | exit(EXIT_FAILURE); 59 | } 60 | return args; 61 | } 62 | 63 | /* main: parses the options, resolves the collection directory and reads the query file before the index is loaded. */ int main(int argc,char* const argv[]) 64 | { 65 | using clock = std::chrono::high_resolution_clock; 66 | /* parse command line */ 67 | cmdargs_t args = parse_args(argc,argv); 68 | 69 | /* parse repo */ 70 | auto cc = surf::parse_collection(args.collection_dir); 71 | 72 | /* parse queries */ 73 | std::cout << "Parsing query file '" << args.query_file << "'" << std::endl; 74 | auto queries = surf::query_parser::parse_queries(args.collection_dir,args.query_file); 75 | std::cout << "Found " << queries.size() << " queries." 
<< std::endl; 76 | 77 | /* define types */ 78 | using surf_index_t = INDEX_TYPE; 79 | std::string index_name = IDXNAME; 80 | 81 | /* load the index */ 82 | surf_index_t index; 83 | auto load_start = clock::now(); /* both construct() and index.load() below fall inside the measured load time */ 84 | construct(index, "", cc, 0); 85 | index.load(cc); 86 | auto load_stop = clock::now(); 87 | auto load_time_sec = std::chrono::duration_cast(load_stop-load_start); 88 | std::cout << "Index loaded in " << load_time_sec.count() << " seconds." << std::endl; 89 | 90 | /* process the queries */ /* three maps keyed by query id: accumulated time, result list and number of query tokens */ 91 | std::map query_times; 92 | std::map query_results; 93 | std::map query_lengths; 94 | 95 | /* each query is executed num_runs times; the per-id times are summed in this loop */ size_t num_runs = 1; 96 | for(size_t i=0;i(query); 99 | auto qry_tokens = std::get<1>(query); 100 | std::cout << "[" << id << "] |Q|=" << qry_tokens.size(); std::cout.flush(); 101 | 102 | // run the query 103 | auto qry_start = clock::now(); 104 | auto results = index.search(qry_tokens,args.k); 105 | auto qry_stop = clock::now(); 106 | 107 | auto query_time = std::chrono::duration_cast(qry_stop-qry_start); 108 | std::cout << " TIME = " << std::setprecision(5) 109 | << query_time.count() / 1000.0 110 | << " ms" << std::endl; 111 | 112 | auto itr = query_times.find(id); 113 | if(itr != query_times.end()) { 114 | itr->second += query_time; 115 | } else { 116 | query_times[id] = query_time; 117 | } 118 | 119 | /* results and query length are recorded for the first run only */ if(i==0) { 120 | query_results[id] = results; 121 | query_lengths[id] = qry_tokens.size(); 122 | } 123 | } 124 | } 125 | 126 | /* output results to csv */ /* both file names are built as collection_dir/results/surf-KIND-INDEX-kK-TIMESTAMP.csv */ 127 | char time_buffer [80] = {0}; 128 | std::time_t t = std::time(NULL); 129 | auto timeinfo = localtime (&t); 130 | strftime (time_buffer,80,"%F-%H:%M:%S",timeinfo); 131 | std::string time_output_file = args.collection_dir + "/results/" 132 | + "surf-timings-" + index_name + "-k" + std::to_string(args.k) 133 | + "-" + std::string(time_buffer) + ".csv"; 134 | std::string res_output_file = args.collection_dir + "/results/" 135 | + "surf-results-" + index_name + "-k" + std::to_string(args.k) 136 | + "-" + 
std::string(time_buffer) + ".csv"; 137 | 138 | /* calc average */ 139 | for(auto& timing : query_times) { 140 | timing.second = timing.second / num_runs; 141 | } 142 | 143 | /* output */ 144 | { 145 | std::cout << "Writing timing results to '" << time_output_file << "'" << std::endl; /* NOTE(review): nothing visible in this function creates the results/ directory; if it is missing the open fails and only the perror() branch runs */ 146 | std::ofstream resfs(time_output_file); 147 | if(resfs.is_open()) { 148 | resfs << "id;index;k;num_terms;time_ms" << std::endl; 149 | for(const auto& timing: query_times) { 150 | auto qry_id = timing.first; 151 | auto qry_time = timing.second; 152 | resfs << qry_id << ";" << index_name << ";" << args.k << ";" 153 | << query_lengths[qry_id] << ";" 154 | << qry_time.count() / 1000.0 << "\n"; 155 | } 156 | } else { 157 | perror("could not output results to file."); 158 | } 159 | std::cout << "Writing result listing to '" << res_output_file << "'" << std::endl; 160 | std::ofstream res_outfs(res_output_file); 161 | if(res_outfs.is_open()) { 162 | res_outfs << "id;rank;docid;score" << std::endl; 163 | for(const auto& result: query_results) { 164 | auto qry_id = result.first; 165 | auto qry_res = result.second.list; 166 | /* ranks are written 1-based, hence the i-1 index into the list */ for(size_t i=1;i<=qry_res.size();i++) { 167 | res_outfs << qry_id << ";" 168 | << i << ";" 169 | << qry_res[i-1].doc_id << ";" 170 | << qry_res[i-1].score << "\n"; 171 | } 172 | } 173 | } else { 174 | perror("could not output results to file."); 175 | } 176 | } 177 | 178 | 179 | return EXIT_SUCCESS; 180 | } 181 | -------------------------------------------------------------------------------- /src/surf_trec.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include "surf/query.hpp" 11 | #include "sdsl/config.hpp" 12 | #include "surf/indexes.hpp" 13 | #include "surf/query_parser.hpp" 14 | 15 | /* command line options of surf_trec: collection directory, query file, optional output file and result depth k */ typedef struct cmdargs { 16 | std::string collection_dir; 17 | std::string query_file; 18 | std::string output_file; 19 | uint64_t k; 
20 | } cmdargs_t; 21 | 22 | /* surf_trec.cpp print_usage: writes the synopsis and one help line per option to stdout. The -o help text names the trec format because main() writes tab-separated run lines and defaults to a .trec file name. */ void 23 | print_usage(char* program) 24 | { 25 | fprintf(stdout,"%s -c -q -k -o \n",program); 26 | fprintf(stdout,"where\n"); 27 | fprintf(stdout," -c : the directory the collection is stored.\n"); 28 | fprintf(stdout," -q : the queries to be performed.\n"); 29 | fprintf(stdout," -k : the top-k documents to be retrieved for each query.\n"); 30 | fprintf(stdout," -o : output results to file in trec format.\n"); 31 | }; 32 | 33 | /* parse_args: fills a cmdargs_t from argv via getopt; k defaults to 10 and the output file to the empty string. An unknown option only prints the usage text. The process exits with EXIT_FAILURE when -c or -q is missing. */ cmdargs_t 34 | parse_args(int argc,char* const argv[]) 35 | { 36 | cmdargs_t args; 37 | int op; 38 | args.collection_dir = ""; 39 | args.query_file = ""; 40 | args.output_file = ""; 41 | args.k = 10; 42 | while ((op=getopt(argc,argv,"c:q:k:o:")) != -1) { 43 | switch (op) { 44 | case 'c': 45 | args.collection_dir = optarg; 46 | break; 47 | case 'q': 48 | args.query_file = optarg; 49 | break; 50 | case 'o': 51 | args.output_file = optarg; 52 | break; 53 | case 'k': 54 | args.k = std::strtoul(optarg,NULL,10); 55 | break; 56 | case '?': 57 | default: 58 | print_usage(argv[0]); 59 | } 60 | } 61 | if (args.collection_dir==""||args.query_file=="") { 62 | std::cerr << "Missing command line parameters.\n"; 63 | print_usage(argv[0]); 64 | exit(EXIT_FAILURE); 65 | } 66 | return args; 67 | } 68 | 69 | /* main: parses the options, resolves the collection directory and reads the query file before the index is loaded. */ int main(int argc,char* const argv[]) 70 | { 71 | using clock = std::chrono::high_resolution_clock; 72 | /* parse command line */ 73 | cmdargs_t args = parse_args(argc,argv); 74 | 75 | /* parse repo */ 76 | auto cc = surf::parse_collection(args.collection_dir); 77 | 78 | /* parse queries */ 79 | std::cout << "Parsing query file '" << args.query_file << "'" << std::endl; 80 | auto queries = surf::query_parser::parse_queries(args.collection_dir,args.query_file); 81 | std::cout << "Found " << queries.size() << " queries." 
<< std::endl; 82 | 83 | /* define types */ 84 | using surf_index_t = INDEX_TYPE; 85 | std::string index_name = IDXNAME; 86 | 87 | /* load the index */ 88 | surf_index_t index; 89 | auto load_start = clock::now(); /* both construct() and index.load() below fall inside the measured load time */ 90 | construct(index, "", cc, 0); 91 | index.load(cc); 92 | auto load_stop = clock::now(); 93 | auto load_time_sec = std::chrono::duration_cast(load_stop-load_start); 94 | std::cout << "Index loaded in " << load_time_sec.count() << " seconds." << std::endl; 95 | 96 | /* process the queries */ /* every query is run once; its result is stored under the query id */ 97 | std::map query_results; 98 | 99 | for(const auto& query: queries) { 100 | auto id = std::get<0>(query); 101 | auto qry_tokens = std::get<1>(query); 102 | std::cout << "[" << id << "] |Q|=" << qry_tokens.size(); std::cout.flush(); 103 | 104 | // run the query 105 | auto qry_start = clock::now(); 106 | auto results = index.search(qry_tokens,args.k); 107 | auto qry_stop = clock::now(); 108 | 109 | auto query_time = std::chrono::duration_cast(qry_stop-qry_start); 110 | std::cout << " TIME = " << std::setprecision(5) 111 | << query_time.count() / 1000.0 112 | << " ms" << std::endl; 113 | 114 | query_results[id] = results; 115 | } 116 | /* output results to csv */ /* without -o a timestamped default file name in the current working directory is used */ 117 | std::string output_file = args.output_file; 118 | if(output_file.empty()) { 119 | char time_buffer [80] = {0}; 120 | std::time_t t = std::time(NULL); 121 | auto timeinfo = localtime (&t); 122 | strftime (time_buffer,80,"%F-%H:%M:%S",timeinfo); 123 | output_file = "surf-timings-" + index_name + "-k" + std::to_string(args.k) 124 | + "-" + std::string(time_buffer) + ".trec"; 125 | } 126 | /* NOTE(review): the message says timing results, but the file opened below receives the ranked result lists */ std::cout << "Writing timing results to '" << output_file << "'" << std::endl; 127 | 128 | /* output */ 129 | { 130 | /* load the url mapping */ /* NOTE(review): no mapping is loaded at this point, the comment above looks stale */ 131 | 132 | std::ofstream resfs(output_file); 133 | if(resfs.is_open()) { 134 | for(const auto& res: query_results) { 135 | for (size_t j=0; j(res.list[j])) << "\t" 139 | << j << "\t" 140 | << std::get<1>(res.list[j]) << "\t" 141 | << index_name << std::endl; 142 | } 143 | } 144 
| } else { 145 | perror("could not output results to file."); 146 | } 147 | } 148 | 149 | 150 | return EXIT_SUCCESS; 151 | } 152 | -------------------------------------------------------------------------------- /src/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "sdsl/int_vector_buffer.hpp" 3 | 4 | using namespace sdsl; 5 | using namespace std; 6 | 7 | /* test.cpp main: opens the file named by argv[1] as an int_vector_buffer and prints its size and element width; without an argument it prints a short help text and returns 1. */ int main(int argc, char* argv[]){ 8 | if ( argc < 2 ){ 9 | cout << "./" << argv[0] << " file" << endl; 10 | cout << "file has to contain a serialized sdsl::int_vector<>" << endl; 11 | cout << "Program outputs the size of elements and the width per element" << endl; 12 | return 1; 13 | } 14 | int_vector_buffer<> ivb(argv[1]); 15 | cout << ivb.size() << " " << (int) ivb.width() << endl; 16 | } 17 | -------------------------------------------------------------------------------- /src/test_postings_list.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include "surf/block_postings_list.hpp" 6 | 7 | int main( int argc, char** argv ) { 8 | using plist_type = surf::block_postings_list<128>; 9 | 10 | // test small uncompressed lists 11 | for(size_t i=0;i<500;i++) { 12 | size_t n = 1 + rand()%20; 13 | std::vector< std::pair > A; 14 | uint64_t cur_id = rand()%5000; 15 | for(size_t j=0;j > A; 40 | uint64_t cur_id = rand()%500; 41 | for(size_t j=0;j 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "surf/config.hpp" 10 | #include "surf/util.hpp" 11 | 12 | typedef struct cmdargs { 13 | std::string collection_dir; 14 | std::string surf_file; 15 | std::string trec_file; 16 | } cmdargs_t; 17 | 18 | /* convert_results_to_trec.cpp print_usage: writes the synopsis and one help line per option to stdout. Only -c, -r and -o are listed because the getopt() option string of this tool is c:r:o: and the synopsis used to advertise a -q switch that is not parsed. */ void 19 | print_usage(char* program) 20 | { 21 | fprintf(stdout,"%s -c -r -o \n",program); 22 | fprintf(stdout,"where\n"); 23 | fprintf(stdout," -c : the directory the collection is stored.\n"); 24 | fprintf(stdout," -r : the results file produced by surf.\n"); 
25 | fprintf(stdout," -o : results converted to trec format.\n"); 26 | }; 27 | 28 | /* parse_args: fills a cmdargs_t from argv via getopt. An unknown option only prints the usage text. The process exits with EXIT_FAILURE unless -c, -r and -o were all given. */ cmdargs_t 29 | parse_args(int argc,char* const argv[]) 30 | { 31 | cmdargs_t args; 32 | int op; 33 | args.collection_dir = ""; 34 | args.surf_file = ""; 35 | args.trec_file = ""; 36 | while ((op=getopt(argc,argv,"c:r:o:")) != -1) { 37 | switch (op) { 38 | case 'c': 39 | args.collection_dir = optarg; 40 | break; 41 | case 'r': 42 | args.surf_file = optarg; 43 | break; 44 | case 'o': 45 | args.trec_file = optarg; 46 | break; 47 | case '?': 48 | default: 49 | print_usage(argv[0]); 50 | } 51 | } 52 | if (args.collection_dir==""||args.surf_file==""||args.trec_file=="") { 53 | std::cerr << "Missing command line parameters.\n"; 54 | print_usage(argv[0]); 55 | exit(EXIT_FAILURE); 56 | } 57 | return args; 58 | } 59 | 60 | /* tokenize: splits line at every semicolon and returns the pieces in order; the remainder after the last semicolon (possibly empty) is always appended, so the result is never empty. */ std::vector 61 | tokenize(std::string line) { 62 | std::vector tokens; 63 | size_t pos = 0; 64 | std::string token; 65 | while ((pos = line.find(";")) != std::string::npos) { 66 | token = line.substr(0, pos); 67 | tokens.push_back(token); 68 | line.erase(0, pos + 1); 69 | } 70 | tokens.push_back(line); 71 | return tokens; 72 | } 73 | 74 | /* main: maps the doc ids of a surf result csv to document names and writes one tab-separated trec line per result row. */ int main( int argc, char** argv ) { 75 | 76 | /* parse command line */ 77 | cmdargs_t args = parse_args(argc,argv); 78 | 79 | /* parse repo */ 80 | surf::parse_collection(args.collection_dir); 81 | 82 | /* load the docnames map */ /* the n-th line of the docnames file (0-based) is the name of doc id n; the path is joined with a slash like the other tools do */ 83 | std::unordered_map id_mapping; 84 | auto docnames_file = args.collection_dir + "/" + surf::DOCNAMES_FILENAME; 85 | std::ifstream dfs(docnames_file); 86 | size_t j=0; 87 | std::string name_mapping; 88 | while( std::getline(dfs,name_mapping) ) { 89 | id_mapping[j] = name_mapping; 90 | j++; 91 | } 92 | 93 | std::ofstream trec_out(args.trec_file); 94 | std::ifstream surfres_fs(args.surf_file); 95 | bool first = true; 96 | for(std::string line; std::getline(surfres_fs,line);) { 97 | /* the first line is the csv header */ if(first) { 98 | first = false; 99 | continue; 100 | } 101 | auto tokens = tokenize(line); /* rows with fewer than the four fields id;rank;docid;score are skipped instead of indexing past the end of tokens */ if(tokens.size() < 4) { continue; } 102 | auto qry_id = 
std::strtoul(tokens[0].c_str(),NULL,10); 103 | auto rank = std::strtoul(tokens[1].c_str(),NULL,10); 104 | auto doc_id = std::strtoul(tokens[2].c_str(),NULL,10); 105 | auto doc_score = std::strtod(tokens[3].c_str(),NULL); 106 | 107 | /* one trec line: query id, the literal Q0, document name, rank, score and the run tag SURF; operator[] yields an empty name for a doc id that was not in the docnames file */ trec_out 108 | << qry_id << "\t" 109 | << "Q0" << "\t" 110 | << id_mapping[doc_id] << "\t" 111 | << rank << "\t" 112 | << doc_score << "\t" 113 | << "SURF" << std::endl; 114 | } 115 | } 116 | 117 | 118 | -------------------------------------------------------------------------------- /tools/create_surf_collection.cpp: -------------------------------------------------------------------------------- 1 | // builds a small surf test collection (text, dictionary and docnames files) 2 | // from a '#'-separated test string given on the command line. 3 | #include 4 | 5 | #include "surf/config.hpp" 6 | #include "surf/util.hpp" 7 | #include "sdsl/int_vector_buffer.hpp" 8 | 9 | 10 | int main( int argc, char** argv ) { 11 | if(argc != 3) { 12 | std::cout << "USAGE: " << argv[0] 13 | << " " << std::endl; 14 | return EXIT_FAILURE; 15 | } 16 | std::string test_str = argv[1]; 17 | std::string dir = argv[2]; 18 | 19 | // setup collection directory 20 | if(surf::directory_exists(dir)) { 21 | std::cerr << "ERROR: collection directory already exists." 
<< std::endl; 22 | return EXIT_FAILURE; 23 | } 24 | surf::create_directory(dir); 25 | 26 | if( test_str.back() != '#' ) { 27 | std::cerr << "ERROR: test string must end with doc seperator '#'" << std::endl; 28 | return EXIT_FAILURE; 29 | } 30 | 31 | // write collection string 32 | std::map::value_type> existing_syms; 33 | std::map::value_type,std::string::value_type> sym_mapping; 34 | sdsl::int_vector<> text_col(test_str.size()+1); 35 | size_t j=0; 36 | size_t num_docs = 0; 37 | for(const auto& sym : test_str) { 38 | if(sym == '#') { 39 | text_col[j++] = 1; 40 | num_docs++; 41 | } else { 42 | auto itr = existing_syms.find(sym); 43 | if(itr != existing_syms.end()) { 44 | text_col[j++] = itr->second; 45 | } else { 46 | sdsl::int_vector<>::value_type new_sym = existing_syms.size()+2; 47 | existing_syms[sym] = new_sym; 48 | sym_mapping[new_sym] = sym; 49 | text_col[j++] = new_sym; 50 | } 51 | } 52 | } 53 | text_col[j] = 0; 54 | std::ofstream ofs(dir+"/"+surf::TEXT_FILENAME); 55 | if(ofs.is_open()) { 56 | text_col.serialize(ofs); 57 | } else { 58 | std::cerr << "ERROR: could not write collection file." << std::endl; 59 | return EXIT_FAILURE; 60 | } 61 | 62 | // write the dict 63 | std::ofstream dict_ofs(dir+"/"+surf::DICT_FILENAME); 64 | if(dict_ofs.is_open()) { 65 | for(const auto& mapping : sym_mapping) { 66 | dict_ofs << mapping.second << " " << mapping.first << std::endl; 67 | } 68 | } else { 69 | std::cerr << "ERROR: could not write dictionary file." << std::endl; 70 | return EXIT_FAILURE; 71 | } 72 | 73 | // write docnames file 74 | std::ofstream docnames_ofs(dir+"/"+surf::DOCNAMES_FILENAME); 75 | if(docnames_ofs.is_open()) { 76 | for(size_t i=1;i<=num_docs;i++) { 77 | docnames_ofs << "DOCUMENT " << i << std::endl; 78 | } 79 | } else { 80 | std::cerr << "ERROR: could not write docnames file." 
<< std::endl; 81 | return EXIT_FAILURE; 82 | } 83 | 84 | /* echo what was written: the input string, the document count, the integer text, the symbol mapping and the generated document names */ std::cout << "Created surf collection for string '" << test_str << "'" << std::endl; 85 | std::cout << "Found " << num_docs << " documents." << std::endl; 86 | std::cout << "Document delimiter = " << 1 << std::endl; 87 | std::cout << surf::TEXT_FILENAME << ": "; 88 | for(const auto& sym : text_col) { 89 | std::cout << sym << " "; 90 | } 91 | std::cout << std::endl; 92 | std::cout << "Mapping: "; 93 | for(const auto& mapping : sym_mapping) { 94 | std::cout << mapping.second << " -> " << mapping.first << "; "; 95 | } 96 | std::cout << std::endl; 97 | std::cout << "Document Names: "; 98 | for(size_t i=1;i<=num_docs;i++) { 99 | std::cout << "'DOCUMENT " << i << "'; "; 100 | } 101 | std::cout << std::endl; 102 | } 103 | 104 | 105 | -------------------------------------------------------------------------------- /tools/extract_document.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "surf/config.hpp" 11 | #include "surf/util.hpp" 12 | #include "sdsl/int_vector.hpp" 13 | #include "sdsl/select_support_mcl.hpp" 14 | 15 | typedef struct cmdargs { 16 | std::string collection_dir; 17 | uint64_t doc_id; 18 | } cmdargs_t; 19 | 20 | /* extract_document.cpp print_usage: writes the synopsis and one help line per option to stdout. The synopsis now ends with a newline like the usage text of the other tools, so the word where starts on its own line. */ void 21 | print_usage(char* program) 22 | { 23 | fprintf(stdout,"%s -c -d \n",program); 24 | fprintf(stdout,"where\n"); 25 | fprintf(stdout," -c : the directory the collection is stored.\n"); 26 | fprintf(stdout," -d : the document to output\n"); 27 | }; 28 | 29 | /* parse_args: fills a cmdargs_t from argv via getopt. The doc id is parsed into a signed local that starts at -1, so a missing -d can be told apart from doc id 0. */ cmdargs_t 30 | parse_args(int argc,char* const argv[]) 31 | { 32 | cmdargs_t args; 33 | int op; 34 | args.collection_dir = ""; 35 | int64_t doc_id = -1; 36 | while ((op=getopt(argc,argv,"c:d:")) != -1) { 37 | switch (op) { 38 | case 'c': 39 | args.collection_dir = optarg; 40 | break; 41 | case 'd': 42 | doc_id = std::strtoll(optarg,NULL,10); 43 | break; 44 | case '?': 45 | default: 46 | 
print_usage(argv[0]); 47 | } 48 | } 49 | /* a missing -c or a missing or negative -d ends the program */ if (args.collection_dir==""||doc_id<0) { 50 | std::cerr << "Missing command line parameters.\n"; 51 | print_usage(argv[0]); 52 | exit(EXIT_FAILURE); 53 | } 54 | 55 | args.doc_id = (uint64_t) doc_id; 56 | 57 | return args; 58 | } 59 | 60 | /* main: prints the length and the space-separated terms of the document selected with -d */ int main( int argc, char** argv ) { 61 | 62 | /* parse command line */ 63 | cmdargs_t args = parse_args(argc,argv); 64 | 65 | /* parse repo */ 66 | auto cc = surf::parse_collection(args.collection_dir); 67 | 68 | /* load doc border bv and build select structure */ 69 | sdsl::bit_vector doc_border; 70 | sdsl::load_from_cache(doc_border, surf::KEY_DOCBORDER, cc); 71 | sdsl::bit_vector::select_1_type doc_border_select(&doc_border); 72 | 73 | /* load dictionary and create mapping */ /* each dictionary line is split at its first space into term and integer id; the map goes from id to term */ 74 | std::unordered_map id_mapping; 75 | { 76 | auto dict_file = args.collection_dir + "/" + surf::DICT_FILENAME; 77 | std::ifstream dfs(dict_file); 78 | if(!dfs.is_open()) { 79 | std::cerr << "cannot load dictionary file."; 80 | exit(EXIT_FAILURE); 81 | } 82 | std::string term_mapping; 83 | while( std::getline(dfs,term_mapping) ) { 84 | auto sep_pos = term_mapping.find(' '); 85 | auto term = term_mapping.substr(0,sep_pos); 86 | auto idstr = term_mapping.substr(sep_pos+1); 87 | /* NOTE(review): std::stoull throws on a line without a numeric id; there is no handler in this function */ uint64_t id = std::stoull(idstr); 88 | id_mapping[id] = term; 89 | } 90 | } 91 | 92 | auto text_file = args.collection_dir + "/" + surf::TEXT_FILENAME; 93 | sdsl::int_vector_buffer<> T(text_file); 94 | uint64_t doc_id = args.doc_id; 95 | /* document 0 starts at text position 0; any other document starts one position after the doc_id-th set border bit */ size_t doc_start = 0; 96 | if(doc_id != 0) { 97 | doc_start = doc_border_select(doc_id) + 1; 98 | } 99 | /* NOTE(review): doc_id is not checked against the number of set border bits before select is called - confirm what select returns for an out-of-range rank */ auto doc_stop = doc_border_select(doc_id+1) - 1; 100 | 101 | std::cout << "document length = " << doc_stop - doc_start + 1 << std::endl; 102 | std::cout << "document content = '"; 103 | for(size_t i=doc_start;i<=doc_stop;i++) { 104 | std::cout << id_mapping[T[i]] << " "; 105 | } 106 | std::cout << "'" << std::endl; 107 | } 108 | 109 | 110 | 
-------------------------------------------------------------------------------- /tools/extract_documents.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "surf/config.hpp" 11 | #include "surf/util.hpp" 12 | #include "sdsl/int_vector.hpp" 13 | #include "sdsl/select_support_mcl.hpp" 14 | 15 | typedef struct cmdargs { 16 | std::string collection_dir; 17 | std::string surf_file; 18 | } cmdargs_t; 19 | 20 | /* extract_documents.cpp print_usage: writes the synopsis and one help line per option to stdout. Only -c and -r are listed because the getopt() option string of this tool is c:r: and the synopsis now ends with a newline so the word where starts on its own line. */ void 21 | print_usage(char* program) 22 | { 23 | fprintf(stdout,"%s -c -r \n",program); 24 | fprintf(stdout,"where\n"); 25 | fprintf(stdout," -c : the directory the collection is stored.\n"); 26 | fprintf(stdout," -r : the results file produced by surf.\n"); 27 | }; 28 | 29 | /* parse_args: fills a cmdargs_t from argv via getopt. An unknown option only prints the usage text. The process exits with EXIT_FAILURE unless both -c and -r were given. */ cmdargs_t 30 | parse_args(int argc,char* const argv[]) 31 | { 32 | cmdargs_t args; 33 | int op; 34 | args.collection_dir = ""; 35 | args.surf_file = ""; 36 | while ((op=getopt(argc,argv,"c:r:")) != -1) { 37 | switch (op) { 38 | case 'c': 39 | args.collection_dir = optarg; 40 | break; 41 | case 'r': 42 | args.surf_file = optarg; 43 | break; 44 | case '?': 45 | default: 46 | print_usage(argv[0]); 47 | } 48 | } 49 | if (args.collection_dir==""||args.surf_file=="") { 50 | std::cerr << "Missing command line parameters.\n"; 51 | print_usage(argv[0]); 52 | exit(EXIT_FAILURE); 53 | } 54 | return args; 55 | } 56 | 57 | /* tokenize: splits line at every semicolon and returns the pieces in order; the remainder after the last semicolon (possibly empty) is always appended, so the result is never empty. */ std::vector 58 | tokenize(std::string line) { 59 | std::vector tokens; 60 | size_t pos = 0; 61 | std::string token; 62 | while ((pos = line.find(";")) != std::string::npos) { 63 | token = line.substr(0, pos); 64 | tokens.push_back(token); 65 | line.erase(0, pos + 1); 66 | } 67 | tokens.push_back(line); 68 | return tokens; 69 | } 70 | 71 | int main( int argc, char** argv ) { 72 | 73 | /* parse command line */ 74 | cmdargs_t args = parse_args(argc,argv); 75 | 76 | /* parse repo */ 77 | auto cc = surf::parse_collection(args.collection_dir); 78 | 79 | 
/* load doc border bv and build select structure */ 80 | sdsl::bit_vector doc_border; 81 | sdsl::load_from_cache(doc_border, surf::KEY_DOCBORDER, cc); 82 | sdsl::bit_vector::select_1_type doc_border_select(&doc_border); 83 | 84 | /* load dictionary and create mapping */ /* each dictionary line is split at its first space into term and integer id; the map goes from id to term */ 85 | std::unordered_map id_mapping; 86 | { 87 | auto dict_file = args.collection_dir + "/" + surf::DICT_FILENAME; 88 | std::ifstream dfs(dict_file); 89 | if(!dfs.is_open()) { 90 | std::cerr << "cannot load dictionary file."; 91 | exit(EXIT_FAILURE); 92 | } 93 | std::string term_mapping; 94 | while( std::getline(dfs,term_mapping) ) { 95 | auto sep_pos = term_mapping.find(' '); 96 | auto term = term_mapping.substr(0,sep_pos); 97 | auto idstr = term_mapping.substr(sep_pos+1); 98 | uint64_t id = std::stoull(idstr); 99 | id_mapping[id] = term; 100 | } 101 | } 102 | 103 | auto text_file = args.collection_dir + "/" + surf::TEXT_FILENAME; 104 | sdsl::int_vector_buffer<> T(text_file); 105 | std::ifstream surfres_fs(args.surf_file); 106 | bool first = true; 107 | for(std::string line; std::getline(surfres_fs,line);) { 108 | /* the first line is the csv header */ if(first) { 109 | first = false; 110 | continue; 111 | } 112 | auto tokens = tokenize(line); /* rows with fewer than the four fields id;rank;docid;score are skipped instead of indexing past the end of tokens */ if(tokens.size() < 4) { continue; } 113 | auto qry_id = std::strtoul(tokens[0].c_str(),NULL,10); 114 | auto rank = std::strtoul(tokens[1].c_str(),NULL,10); 115 | auto doc_id = std::strtoul(tokens[2].c_str(),NULL,10); 116 | auto doc_score = std::strtod(tokens[3].c_str(),NULL); 117 | 118 | std::cout << "=====================================================================================\n"; 119 | std::cout << "[Q]=" << qry_id << " rank=" << rank << " docid=" 120 | << doc_id << " score=" << doc_score << std::endl; 121 | 122 | /* document 0 starts at text position 0; any other document starts one position after the doc_id-th set border bit */ size_t doc_start = 0; 123 | if(doc_id != 0) { 124 | doc_start = doc_border_select(doc_id) + 1; 125 | } 126 | auto doc_stop = doc_border_select(doc_id+1) - 1; 127 | 128 | std::cout << "document length = " << doc_stop - doc_start + 1 << std::endl; 129 | std::cout << "document content = '"; 130 | for(size_t 
i=doc_start;i<=doc_stop;i++) { 131 | std::cout << id_mapping[T[i]] << " "; 132 | } 133 | std::cout << "'" << std::endl; 134 | } 135 | } 136 | 137 | 138 | -------------------------------------------------------------------------------- /tools/indri_stem_krovetz.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "indri/KrovetzStemmer.hpp" 8 | 9 | /* indri_stem_krovetz main: reads lines of the form id;query text from stdin, normalises and Krovetz-stems every space-separated token and prints the stemmed query. The program takes no arguments. NOTE(review): std::stoull throws when a line has no numeric id before the first semicolon. */ int main( int argc, char** argv ) { 10 | if(argc != 1) { 11 | std::cout << "USAGE: " << argv[0] << " < > " << std::endl; 12 | return EXIT_FAILURE; 13 | } 14 | 15 | using stemmer_t = indri::parse::KrovetzStemmer; 16 | stemmer_t ks; 17 | for(std::string line; std::getline(std::cin,line);) { 18 | auto id_sep_pos = line.find(';'); 19 | auto qryid_str = line.substr(0,id_sep_pos); 20 | auto qry_id = std::stoull(qryid_str); 21 | std::istringstream qry_content_stream(line.substr(id_sep_pos+1)); 22 | std::vector stemmed_qry; 23 | for(std::string qry_token; std::getline(qry_content_stream,qry_token,' ');) { 24 | char stem_buf[stemmer_t::MAX_WORD_LENGTH+1] = {0}; 25 | char original_word[stemmer_t::MAX_WORD_LENGTH+1] = {0}; 26 | /* normalisation: hyphens become spaces, apostrophes and dots are removed, the rest is lower-cased */ std::replace(qry_token.begin(),qry_token.end(),'-',' '); 27 | qry_token.erase(std::remove(qry_token.begin(),qry_token.end(),'\''),qry_token.end()); 28 | qry_token.erase(std::remove(qry_token.begin(),qry_token.end(),'.'),qry_token.end()); 29 | std::transform(qry_token.begin(), qry_token.end(), qry_token.begin(), ::tolower); 30 | /* a token that does not fit into original_word is passed through unstemmed, the same fallback as a failed stem, instead of being copied past the end of the buffer */ if(qry_token.size() > static_cast<size_t>(stemmer_t::MAX_WORD_LENGTH)) { stemmed_qry.push_back(qry_token); continue; } std::copy(qry_token.begin(),qry_token.end(),std::begin(original_word)); 31 | auto ret = ks.kstem_stem_tobuffer(original_word,stem_buf); 32 | /* a positive return value means stem_buf holds the stem; otherwise the normalised token is kept */ if (ret > 0) { 33 | std::string tmp(stem_buf); 34 | stemmed_qry.push_back(tmp); 35 | } else { 36 | stemmed_qry.push_back(qry_token); 37 | } 38 | } 39 | std::cout << qry_id << ";"; 40 | for(size_t i=0;i 4 | 5 | #include "indri/Repository.hpp" 6 | #include "indri/CompressedCollection.hpp" 7 | #include "sdsl/int_vector_buffer.hpp" 8 | 9 | bool 10 
| directory_exists(std::string dir) 11 | { 12 | /* true only when stat() succeeds and the path is a directory */ struct stat sb; 13 | const char* pathname = dir.c_str(); 14 | if (stat(pathname, &sb) == 0 && S_ISDIR(sb.st_mode)) { 15 | return true; 16 | } 17 | return false; 18 | } 19 | 20 | /* create_directory: creates dir with mode 0777 when it does not exist yet; a failing mkdir() is reported with perror() and ends the process. Only the last path component is created by the single mkdir() call. */ void 21 | create_directory(std::string dir) 22 | { 23 | if (!directory_exists(dir)) { 24 | if (mkdir(dir.c_str(),0777) == -1) { 25 | perror("could not create directory"); 26 | exit(EXIT_FAILURE); 27 | } 28 | } 29 | } 30 | 31 | 32 | /* main: converts the first index of an indri repository (argv[1]) into a surf collection folder (argv[2]) holding the integer text, the document names and the dictionary */ int main( int argc, char** argv ) { 33 | if(argc != 3) { 34 | std::cout << "USAGE: " << argv[0] << " " << std::endl; 35 | return EXIT_FAILURE; 36 | } 37 | 38 | // parse cmd line 39 | std::string repository_name = argv[1]; 40 | std::string surf_collection_folder = argv[2]; 41 | create_directory(surf_collection_folder); 42 | std::string dict_file = surf_collection_folder + "/dict.txt"; 43 | std::string doc_names_file = surf_collection_folder + "/doc_names.txt"; 44 | std::string text_int_file = surf_collection_folder + "/text_int_SURF.sdsl"; 45 | 46 | // load stuff 47 | indri::collection::Repository repo; 48 | repo.openRead( repository_name ); 49 | 50 | // extract 51 | std::cout << "extracting sdsl integer file from indri index into file " << text_int_file << std::endl; 52 | std::vector document_names; 53 | indri::collection::Repository::index_state state = repo.indexes(); 54 | const auto& index = (*state)[0]; 55 | uint64_t uniq_terms = index->uniqueTermCount(); 56 | uniq_terms += 2; // two extra codes are reserved so that 0 and 1 stay free for the separators 57 | uint8_t out_int_width = sdsl::bits::hi(uniq_terms)+1; 58 | sdsl::int_vector_buffer<> sdsl_col_file(text_int_file,std::ios::out,1024*1024,out_int_width,false); 59 | size_t written_term_ids = 0; 60 | indri::collection::CompressedCollection* collection = repo.collection(); 61 | int64_t document_id = index->documentBase(); 62 | indri::index::TermListFileIterator* iter = index->termListFileIterator(); 63 | iter->startIteration(); 64 | while( !iter->finished() ) { 65 | 
indri::index::TermList* list = iter->currentEntry(); 66 | 67 | // find document name 68 | std::string doc_name = collection->retrieveMetadatum( document_id , "docno" ); 69 | document_names.push_back(doc_name); 70 | 71 | if(document_id % 10000 == 0) { 72 | std::cout << "."; 73 | std::cout.flush(); 74 | } 75 | 76 | // iterate over termlist 77 | for(const auto& term_id : list->terms()) { 78 | // indri term ids are shifted by 1 so that 0 and 1 stay free; entries with term id 0 are dropped 79 | if(term_id != 0) { 80 | sdsl_col_file[written_term_ids++] = term_id+1; 81 | } 82 | } 83 | sdsl_col_file[written_term_ids++] = 1; // end of doc sep 84 | 85 | document_id++; 86 | iter->nextEntry(); 87 | } 88 | std::cout << std::endl; 89 | sdsl_col_file[written_term_ids++] = 0; // end of collection sep 90 | 91 | // write document names 92 | { 93 | std::cout << "writing document names to " << doc_names_file << std::endl; 94 | std::ofstream of_doc_names(doc_names_file); 95 | for(const auto& doc_name : document_names) { 96 | of_doc_names << doc_name << std::endl; 97 | } 98 | } 99 | // write dictionary: one line per term with the term string and its shifted id (indri id + 1) 100 | { 101 | std::cout << "writing dictionary to " << dict_file << std::endl; 102 | const auto& index = (*state)[0]; 103 | std::ofstream of_dict(dict_file); 104 | for(size_t i=1;iuniqueTermCount();i++) { 105 | auto term_str = index->term(i); 106 | of_dict << term_str << " " << i+1 << std::endl; 107 | } 108 | } 109 | } 110 | 111 | 112 | -------------------------------------------------------------------------------- /tools/select_random_queries.cpp: -------------------------------------------------------------------------------- 1 | // selects a pseudo-random subset of the queries in a query file and writes 2 | // the selected queries, ordered by query id, to an output file. 
3 | #include 4 | #include 5 | #include 6 | 7 | #include "surf/query.hpp" 8 | #include "sdsl/config.hpp" 9 | #include "surf/query_parser.hpp" 10 | #include "surf/util.hpp" 11 | 12 | /* command line options: collection directory, query file, output file and number of queries to select */ typedef struct cmdargs { 13 | std::string collection_dir; 14 | std::string query_file; 15 | std::string output_file; 16 | uint64_t num_qrys; 17 | } cmdargs_t; 18 | 19 | /* print_usage: writes the synopsis and one help line per option to stdout */ void 20 | print_usage(char* program) 21 | { 22 | fprintf(stdout,"%s -c -q -n -o \n",program); 23 | fprintf(stdout,"where\n"); 24 | fprintf(stdout," -c : the directory the collection is stored.\n"); 25 | fprintf(stdout," -q : the queries to be processed.\n"); 26 | fprintf(stdout," -n : the number of queries to be selected.\n"); 27 | fprintf(stdout," -o : selected queries.\n"); 28 | }; 29 | 30 | /* parse_args: fills a cmdargs_t from argv via getopt; num_qrys defaults to 1000. An unknown option only prints the usage text. NOTE(review): only -c and -q are enforced here although main() opens args.output_file - confirm whether -o is meant to be optional. */ cmdargs_t 31 | parse_args(int argc,char* const argv[]) 32 | { 33 | cmdargs_t args; 34 | int op; 35 | args.collection_dir = ""; 36 | args.query_file = ""; 37 | args.output_file = ""; 38 | args.num_qrys = 1000; 39 | while ((op=getopt(argc,argv,"c:q:n:o:")) != -1) { 40 | switch (op) { 41 | case 'c': 42 | args.collection_dir = optarg; 43 | break; 44 | case 'q': 45 | args.query_file = optarg; 46 | break; 47 | case 'o': 48 | args.output_file = optarg; 49 | break; 50 | case 'n': 51 | args.num_qrys = std::strtoul(optarg,NULL,10); 52 | break; 53 | case '?': 54 | default: 55 | print_usage(argv[0]); 56 | } 57 | } 58 | if (args.collection_dir==""||args.query_file=="") { 59 | std::cerr << "Missing command line parameters.\n"; 60 | print_usage(argv[0]); 61 | exit(EXIT_FAILURE); 62 | } 63 | return args; 64 | } 65 | /* main: fewer than two arguments after the program name only print the usage text */ int main( int argc, char** argv ) { 66 | if(argc < 3) { 67 | print_usage(argv[0]); 68 | return EXIT_FAILURE; 69 | } 70 | 71 | /* parse command line */ 72 | cmdargs_t args = parse_args(argc,argv); 73 | 74 | /* parse repo */ 75 | auto cc = surf::parse_collection(args.collection_dir); 76 | 77 | /* parse queries */ 78 | std::cout << "Parsing query file '" << args.query_file << "'" << std::endl; 79 | auto queries = 
surf::query_parser::parse_queries(args.collection_dir,args.query_file,true); 80 | std::cout << "Found " << queries.size() << " queries." << std::endl; 81 | 82 | /* select num_queries random ones */ 83 | std::mt19937 gen(4711); 84 | std::shuffle(queries.begin(), queries.end(), gen); 85 | auto id_sort = [](const surf::query_t& a,const surf::query_t& b) { 86 | return std::get<0>(a) < std::get<0>(b); 87 | }; 88 | std::sort(queries.begin(),queries.begin()+args.num_qrys,id_sort); 89 | 90 | /* output */ 91 | std::ofstream selected_fs(args.output_file); 92 | if(selected_fs.is_open()) { 93 | for(size_t i=0;i(queries[i]) << ";"; 95 | const auto& tokens = std::get<1>(queries[i]); 96 | for(size_t j=0;j 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "surf/config.hpp" 10 | #include "surf/util.hpp" 11 | #include "surf/construct_doc_cnt.hpp" 12 | 13 | typedef struct cmdargs { 14 | std::string collection_dir; 15 | std::string surf_file; 16 | std::string trec_file; 17 | } cmdargs_t; 18 | 19 | void 20 | print_usage(char* program) 21 | { 22 | fprintf(stdout,"%s -c \n",program); 23 | fprintf(stdout,"where\n"); 24 | fprintf(stdout," -c : the directory the collection is stored.\n"); 25 | }; 26 | 27 | cmdargs_t 28 | parse_args(int argc,char* const argv[]) 29 | { 30 | cmdargs_t args; 31 | int op; 32 | args.collection_dir = ""; 33 | while ((op=getopt(argc,argv,"c:")) != -1) { 34 | switch (op) { 35 | case 'c': 36 | args.collection_dir = optarg; 37 | break; 38 | case '?': 39 | default: 40 | print_usage(argv[0]); 41 | } 42 | } 43 | if (args.collection_dir=="") { 44 | std::cerr << "Missing command line parameters.\n"; 45 | print_usage(argv[0]); 46 | exit(EXIT_FAILURE); 47 | } 48 | return args; 49 | } 50 | 51 | int main( int argc, char** argv ) { 52 | /* parse command line */ 53 | cmdargs_t args = parse_args(argc,argv); 54 | 55 | /* parse repo */ 56 | auto cc = surf::parse_collection(args.collection_dir); 57 | sdsl::int_vector_buffer<> 
T(args.collection_dir+"/"+surf::TEXT_FILENAME); 58 | std::cout << "n = |T|= " << T.size() << std::endl; 59 | surf::construct_doc_cnt(cc); 60 | uint64_t doc_cnt = 0; 61 | load_from_cache(doc_cnt, surf::KEY_DOCCNT, cc); 62 | std::cout << "number of documents = N = " << doc_cnt << std::endl; 63 | std::ifstream dic_fs(args.collection_dir+"/"+surf::DICT_FILENAME); 64 | std::string line; 65 | size_t num_terms = 0; 66 | while( std::getline(dic_fs,line) ) { 67 | num_terms++; 68 | } 69 | std::cout << "number of terms = sigma = " << num_terms << std::endl; 70 | std::cout << "avg document length = " << T.size() / doc_cnt << std::endl; 71 | } 72 | 73 | 74 | -------------------------------------------------------------------------------- /update-sdsl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd external/sdsl-lite && \ 3 | git checkout master && \ 4 | git pull && \ 5 | cd ../.. && \ 6 | git add external/sdsl-lite && \ 7 | git commit -m "forwarded sdsl-lite to current master" 8 | --------------------------------------------------------------------------------