├── .gitmodules
├── CMakeLists.txt
├── CMakeModules
    └── AppendCompilerFlags.cmake
├── LICENSE
├── README.md
├── build
    └── .gitignore
├── collections
    ├── README.md
    ├── cluewebB
    │   └── index
    │   │   ├── space_usage_IDX_D.html
    │   │   ├── space_usage_IDX_D1R1.html
    │   │   └── space_usage_IDX_DR.html
    ├── gov2
    │   └── index
    │   │   ├── space_usage_IDX_D.html
    │   │   ├── space_usage_IDX_D1R1.html
    │   │   └── space_usage_IDX_DR.html
    ├── speeches
    │   ├── .gitignore
    │   ├── dict.txt
    │   ├── doc_names.txt
    │   └── text_int_SURF.sdsl
    └── wikishort
    │   ├── .gitignore
    │   ├── dict.txt
    │   ├── doc_names.txt
    │   ├── text_int_SURF.sdsl
    │   └── wikishort-src.tar.gz
├── config
    ├── IDX-D-BM25.config
    ├── IDX-D-LMDS.config
    ├── IDX-D-SANSLEN.config
    ├── IDX-D-TFIDF.config
    ├── IDX-D.config
    ├── IDX-D1R1-BM25.config
    ├── IDX-D1R1-LMDS.config
    ├── IDX-D1R1-TFIDF.config
    ├── IDX-D1R1.config
    ├── IDX-D1R1MTF.config
    ├── IDX-DR-BM25.config
    ├── IDX-DR-LMDS.config
    ├── IDX-DR-SANSLEN.config
    ├── IDX-DR-TFIDF.config
    ├── IDX-DR.config
    ├── INVIDX-E-BM25.config
    ├── INVIDX-E-LMDS.config
    ├── INVIDX-E-TFIDF.config
    ├── INVIDX-E.config
    └── INVIDX-W.config
├── experiments
    ├── check_equivalence.sh
    ├── doclen-clueweb.csv
    ├── doclen-gov2.csv
    ├── eval.R
    ├── eval_3.R
    ├── mem_info.csv
    ├── mem_used.sh
    ├── nodes_evaluated.csv
    ├── nodes_evaluated.sh
    ├── nodes_evaluated_2005.csv
    ├── nodes_evaluated_2006.csv
    ├── nodes_evaluated_and_2005.csv
    ├── nodes_evaluated_and_2006.csv
    ├── phrase_time_2005.csv
    ├── phrase_time_2006.csv
    ├── phrase_time_and_2005.csv
    ├── phrase_time_and_2006.csv
    ├── phrases_time.sh
    ├── rank_times.sh
    ├── ranker_times_2005.csv
    ├── ranker_times_2006.csv
    ├── run.sh
    ├── sbatch_mem_used.sh
    ├── sbatch_nodes_evaluated.sh
    ├── time_per_wtnode.R
    ├── trec-2005-and-profile-IDX_SAWIT2.csv
    ├── trec-2005-and-time-IDX_SAWIT2.csv
    ├── trec-2005-or-profile-IDX_SAWIT2.csv
    ├── trec-2005-or-time-IDX_SAWIT2.csv
    ├── trec-2005-time-ex-and-wt.csv
    ├── trec-2005-time-ex-or-wt.csv
    ├── trec-2005.csv
    ├── trec-2005.dr.csv
    ├── trec-2006-and-profile-IDX_SAWIT2.csv
    ├── trec-2006-and-time-IDX_SAWIT2.csv
    ├── trec-2006-or-profile-IDX_SAWIT2.csv
    ├── trec-2006-or-time-IDX_SAWIT2.csv
    ├── trec-2006-time-ex-and-wt.csv
    ├── trec-2006-time-ex-or-wt.csv
    ├── trec-2006.csv
    ├── trec-2006.dr.csv
    └── wikishort.qry
├── extras
    ├── clueweb-collection.indricfg
    ├── gov2-collection.indricfg
    ├── speeches-collection.indricfg
    ├── trec8-collection.indricfg
    ├── wikishort-collection.indricfg
    └── wt10g-collection.indricfg
├── include
    └── surf
    │   ├── .gitignore
    │   ├── block_postings_list.hpp
    │   ├── comm.hpp
    │   ├── config.hpp
    │   ├── construct_DUP2.hpp
    │   ├── construct_U.hpp
    │   ├── construct_col_len.hpp
    │   ├── construct_darray.hpp
    │   ├── construct_doc_border.hpp
    │   ├── construct_doc_cnt.hpp
    │   ├── construct_doc_lengths.hpp
    │   ├── construct_doc_perm.hpp
    │   ├── construct_invidx.hpp
    │   ├── df_sada.hpp
    │   ├── doc_perm.hpp
    │   ├── idx_d.hpp
    │   ├── idx_d1r1.hpp
    │   ├── idx_d1r1mtf.hpp
    │   ├── idx_dr.hpp
    │   ├── idx_invfile.hpp
    │   ├── indexes.hpp
    │   ├── phrase_parser.hpp
    │   ├── query.hpp
    │   ├── query_parser.hpp
    │   ├── rank_functions.hpp
    │   └── util.hpp
├── queries
    ├── trec0406-adhoc.qry
    ├── trec2005-efficiency-10.qry
    ├── trec2005-efficiency-100.qry
    ├── trec2005-efficiency-1000.qry
    ├── trec2005-efficiency.qry
    ├── trec2006-efficiency-10.qry
    ├── trec2006-efficiency-100.qry
    ├── trec2006-efficiency-1000.qry
    └── trec2006-efficiency.qry
├── results
    ├── trec8_wtdup_stat.R
    ├── trec8_wtdup_stat.pdf
    └── trec8_wtdup_stat.txt
├── src
    ├── .gitignore
    ├── doc_lengths.cpp
    ├── surf_daemon.cpp
    ├── surf_index.cpp
    ├── surf_query.cpp
    ├── surf_search.cpp
    ├── surf_trec.cpp
    ├── test.cpp
    └── test_postings_list.cpp
├── tools
    ├── Makefile
    ├── convert_results_to_trec.cpp
    ├── create_surf_collection.cpp
    ├── extract_document.cpp
    ├── extract_documents.cpp
    ├── indri_stem_krovetz.cpp
    ├── indri_to_surf.cpp
    ├── select_random_queries.cpp
    └── surf_collection_info.cpp
└── update-sdsl.sh


/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "external/fastpfor"]
 2 | 	path = external/fastpfor
 3 | 	url = https://github.com/lemire/FastPFor.git
 4 | [submodule "external/sdsl-lite"]
 5 | 	path = external/sdsl-lite
 6 | 	url = https://github.com/simongog/sdsl-lite.git
 7 | [submodule "external/zeromq"]
 8 | 	path = external/zeromq
 9 | 	url = https://github.com/zeromq/libzmq.git
10 | [submodule "external/cppzmq"]
11 | 	path = external/cppzmq
12 | 	url = https://github.com/zeromq/cppzmq.git
13 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | cmake_policy(SET CMP0015 NEW)  # link_directories() resolves relative paths against the source dir
 3 | set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 4 | include(AppendCompilerFlags)  # defines append_cxx_compiler_flags(), used below
 5 | include(ExternalProject)
 6 | 
 7 | project(SURF CXX C)
 8 | # Header search paths: SURF's own headers, the vendored submodules, and sdsl-lite under the build tree.
 9 | include_directories(
10 |   ${CMAKE_HOME_DIRECTORY}/include
11 |   ${CMAKE_HOME_DIRECTORY}/external/fastpfor/headers/
12 |   ${CMAKE_HOME_DIRECTORY}/external/cppzmq/
13 |   ${CMAKE_HOME_DIRECTORY}/external/zeromq/include/
14 |   ${CMAKE_BINARY_DIR}/external/sdsl-lite/include
15 |   ${CMAKE_BINARY_DIR}/external/sdsl-lite/external/libdivsufsort-2.0.1/include)
16 | # Library search paths for the sdsl-lite and zeromq outputs under the build tree.
17 | link_directories(
18 |   ${CMAKE_BINARY_DIR}/external/sdsl-lite/lib
19 |   ${CMAKE_BINARY_DIR}/external/zeromq/lib)
20 | # Each call adds its flag string to CMAKE_CXX_FLAGS only if a test program compiles with it; the second argument is a label.
21 | append_cxx_compiler_flags("-msse4.2 -std=c++11 -Wall -DNDEBUG" "GCC" CMAKE_CXX_FLAGS)
22 | append_cxx_compiler_flags("-O3 -ffast-math -funroll-loops" "GCC" CMAKE_CXX_FLAGS)
23 | append_cxx_compiler_flags("-msse4.2 -std=c++11 -g -funroll-loops -DNDEBUG -stdlib=libc++" "CLANG" CMAKE_CXX_FLAGS)
24 | # Disable zeromq's test targets. The setting has to exist as a variable before add_subdirectory()
25 | # runs; a DIRECTORY property set afterwards is not visible to the subproject as a variable.
26 | set(ZMQ_BUILD_TESTS OFF CACHE BOOL "Build the tests for ZeroMQ")  # NOTE(review): assumes the pinned libzmq reads ZMQ_BUILD_TESTS - confirm
27 | ADD_SUBDIRECTORY(external/zeromq)
28 | # Vendored dependencies: sdsl-lite (its own CMake build) and a static library of FastPFor bit-packing sources.
29 | add_subdirectory(external/sdsl-lite)
30 | add_library(fastpfor_lib STATIC
31 |   external/fastpfor/src/bitpacking.cpp
32 |   external/fastpfor/src/bitpackingaligned.cpp
33 |   external/fastpfor/src/bitpackingunaligned.cpp
34 |   external/fastpfor/src/simdunalignedbitpacking.cpp
35 |   external/fastpfor/src/simdbitpacking.cpp)
36 | # One surf_index/surf_search/surf_daemon executable per config/*.config; each KEY=VALUE line becomes a variable and a compile definition.
37 | file(GLOB index_config_files RELATIVE ${CMAKE_HOME_DIRECTORY}/config/ "${CMAKE_HOME_DIRECTORY}/config/*.config")
38 | foreach(f ${index_config_files})
39 | 	file(STRINGS ${CMAKE_HOME_DIRECTORY}/config/${f} config_contents)
40 | 	set(compile_defs "")
41 | 	set(NAME "")  # reset so a config without NAME= cannot reuse the previous file's name
42 | 	foreach(keyvalue ${config_contents})
43 | 		string(REGEX REPLACE "^[ ]+" "" keyvalue "${keyvalue}")  # quoted so a whitespace-only line cannot drop the argument
44 | 		if(keyvalue MATCHES "^[^=]+=")  # lines without a KEY= prefix are skipped
45 | 			string(REGEX MATCH "^[^=]+" key "${keyvalue}")
46 | 			string(REGEX REPLACE "^[^=]+=(.*)$" "\\1" value "${keyvalue}")  # strip only the leading KEY=, keep later '=' intact
47 | 			set(${key} "${value}")
48 | 			list(APPEND compile_defs ${key}=${value})
49 | 		endif()
50 | 	endforeach(keyvalue)
51 | 	if(NAME STREQUAL "")
52 | 		message(FATAL_ERROR "config/${f} does not define NAME; cannot name its executables")
53 | 	endif()
54 | 	ADD_EXECUTABLE(surf_index-${NAME} src/surf_index.cpp)
55 | 	TARGET_LINK_LIBRARIES(surf_index-${NAME} sdsl divsufsort divsufsort64 pthread fastpfor_lib)
56 | 	set_property(TARGET surf_index-${NAME} PROPERTY COMPILE_DEFINITIONS IDXNAME="${NAME}" ${compile_defs})
57 | 	ADD_EXECUTABLE(surf_search-${NAME} src/surf_search.cpp)
58 | 	TARGET_LINK_LIBRARIES(surf_search-${NAME} sdsl divsufsort divsufsort64 pthread fastpfor_lib)
59 | 	set_property(TARGET surf_search-${NAME} PROPERTY COMPILE_DEFINITIONS IDXNAME="${NAME}" ${compile_defs})
60 | 	ADD_EXECUTABLE(surf_daemon-${NAME} src/surf_daemon.cpp)
61 | 	TARGET_LINK_LIBRARIES(surf_daemon-${NAME} sdsl divsufsort divsufsort64 pthread fastpfor_lib libzmq)
62 | 	set_property(TARGET surf_daemon-${NAME} PROPERTY COMPILE_DEFINITIONS IDXNAME="${NAME}" ${compile_defs})
63 | endforeach(f)
64 | add_executable(doc_lengths src/doc_lengths.cpp)
65 | target_link_libraries(doc_lengths sdsl)
66 | # surf_query is the client for the search daemon, hence the zeromq dependency.
67 | add_executable(surf_query src/surf_query.cpp)
68 | target_link_libraries(surf_query libzmq sdsl)
69 | # NOTE(review): "test" is a reserved target name under policy CMP0037 in newer CMake - confirm before raising the minimum version.
70 | add_executable(test src/test.cpp)
71 | target_link_libraries(test sdsl divsufsort divsufsort64 pthread)
72 | 
73 | add_executable(select_random_queries tools/select_random_queries.cpp)
74 | target_link_libraries(select_random_queries sdsl divsufsort divsufsort64 pthread)
75 | 
76 | add_executable(test_postings_list src/test_postings_list.cpp)
77 | target_link_libraries(test_postings_list sdsl divsufsort divsufsort64 pthread fastpfor_lib)
78 | # Collection tools built from tools/; each is a single source file linked against the same library set.
79 | add_executable(create_surf_collection tools/create_surf_collection.cpp)
80 | target_link_libraries(create_surf_collection sdsl divsufsort divsufsort64 pthread fastpfor_lib)
81 | 
82 | add_executable(convert_results_to_trec tools/convert_results_to_trec.cpp)
83 | target_link_libraries(convert_results_to_trec sdsl divsufsort divsufsort64 pthread fastpfor_lib)
84 | 
85 | add_executable(extract_documents tools/extract_documents.cpp)
86 | target_link_libraries(extract_documents sdsl divsufsort divsufsort64 pthread fastpfor_lib)
87 | 
88 | add_executable(extract_document tools/extract_document.cpp)
89 | target_link_libraries(extract_document sdsl divsufsort divsufsort64 pthread fastpfor_lib)
90 | 
91 | add_executable(surf_collection_info tools/surf_collection_info.cpp)
92 | target_link_libraries(surf_collection_info sdsl divsufsort divsufsort64 pthread fastpfor_lib)
93 | 
94 | 


--------------------------------------------------------------------------------
/CMakeModules/AppendCompilerFlags.cmake:
--------------------------------------------------------------------------------
 1 | include(CheckCSourceCompiles)
 2 | include(CheckCXXSourceCompiles)
 3 | # append_c_compiler_flags(<flags> <name> <result>): append <flags> to the variable named <result> if a trivial C program compiles with them.
 4 | macro(append_c_compiler_flags _flags _name _result)
 5 |   set(SAFE_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})  # saved here, restored after the loop
 6 |   string(REGEX REPLACE "[-+/ ]" "_" cname "${_name}")
 7 |   string(TOUPPER "${cname}" cname)  # <name> only contributes to the result variable name; it does not select a compiler
 8 |   foreach(flag ${_flags})  # unquoted: splits on ';' only, so a space-separated flag string is probed as one unit
 9 |     string(REGEX REPLACE "^[-+/ ]+(.*)[-+/ ]*$" "\\1" flagname "${flag}")  # drop leading '-', '+', '/' and spaces
10 |     string(REGEX REPLACE "[-+/ ]" "_" flagname "${flagname}")
11 |     string(TOUPPER "${flagname}" flagname)
12 |     set(have_flag "HAVE_${cname}_${flagname}")  # name of the variable that receives the check result
13 |     set(CMAKE_REQUIRED_FLAGS "${flag}")
14 |     check_c_source_compiles("int main() { return 0; }" ${have_flag})
15 |     if(${have_flag})
16 |       set(${_result} "${${_result}} ${flag}")  # space-separated append to the caller's variable
17 |     endif(${have_flag})
18 |   endforeach(flag)
19 |   set(CMAKE_REQUIRED_FLAGS ${SAFE_CMAKE_REQUIRED_FLAGS})
20 | endmacro(append_c_compiler_flags)
21 | # append_cxx_compiler_flags(<flags> <name> <result>): append <flags> to the variable named <result> if a trivial C++ program compiles with them.
22 | macro(append_cxx_compiler_flags _flags _name _result)
23 |   set(SAFE_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})  # saved here, restored after the loop
24 |   string(REGEX REPLACE "[-+/ ]" "_" cname "${_name}")
25 |   string(TOUPPER "${cname}" cname)  # <name> only contributes to the result variable name; it does not select a compiler
26 |   foreach(flag ${_flags})  # unquoted: splits on ';' only, so a space-separated flag string is probed as one unit
27 |     string(REGEX REPLACE "^[-+/ ]+(.*)[-+/ ]*$" "\\1" flagname "${flag}")  # drop leading '-', '+', '/' and spaces
28 |     string(REGEX REPLACE "[-+/ ]" "_" flagname "${flagname}")
29 |     string(TOUPPER "${flagname}" flagname)
30 |     set(have_flag "HAVE_${cname}_${flagname}")  # name of the variable that receives the check result
31 |     set(CMAKE_REQUIRED_FLAGS "${flag}")
32 |     check_cxx_source_compiles("int main() { return 0; }" ${have_flag})
33 |     if(${have_flag})
34 |       set(${_result} "${${_result}} ${flag}")  # space-separated append to the caller's variable
35 |     endif(${have_flag})
36 |   endforeach(flag)
37 |   set(CMAKE_REQUIRED_FLAGS ${SAFE_CMAKE_REQUIRED_FLAGS})
38 | endmacro(append_cxx_compiler_flags)
39 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | surf
 2 | ====
 3 | 
 4 | the SUccinct Retrieval Framework.
 5 | 
 6 | ## requirements
 7 | 
 8 | * gcc 4.7 or clang 4.3
 9 | * [indri](http://www.lemurproject.org/indri/) to convert indri indexes to surf input format
10 | 
11 | ## installation
12 | 
13 | ```
14 | cd surf
15 | git submodule init
16 | git submodule update
17 | cd build
18 | cmake ..
19 | make
20 | ```
21 | 
22 | ## building an index
23 | 
24 | ```
25 | cd surf/build
26 | ./surf_index-IDX_D -c ../collections/wikishort/
27 | ```
28 | 
29 | ## querying an index
30 | 
31 | ```
32 | cd surf/build
33 | ./surf_search-IDX_D -c ../collections/wikishort/ -q <qryfile> -k 10
34 | ```
35 | 
36 | ## creating an indri index and converting it into surf format
37 | 
38 | ### create the indri index
39 | 
40 | ```
41 | cd ./indri-5.6/
42 | ./configure
43 | make
44 | cd buildindex
45 | # change indri config to correct storage locations
46 | ./IndriBuildIndex ./surf/extras/gov2-collection.indricfg
47 | ```
48 | 
49 | ### convert the index into surf format
50 | 
51 | ```
52 | cd surf/tools
53 | # change path of indri source code in Makefile
54 | make
55 | # ./indri_to_surf <path_to_indri_repo> <path_to_surf_repo>
56 | ./indri_to_surf ../collections/gov2indri ../collections/gov2/
57 | ```
58 | 
59 | ## starting a daemon
60 | 
61 | an index daemon can be started in the background and listen on a specific port for search requests
62 | 
63 | ```
64 | cd build
65 | ./surf_daemon-IDX_D -c ../collections/wikishort/ -p 12345
66 | ```
67 | 
68 | ## querying the search daemon
69 | 
70 | the daemon can be queried via the network or localhost
71 | 
72 | ```
73 | cd build
74 | ./surf_query -q <qry_file> -h 127.0.0.1:12345 -k 10
75 | ```
76 | 
77 | ## shutting down the daemon
78 | 
79 | the daemon can be terminated via the query client.
80 | 
81 | ```
82 | cd build
83 | ./surf_query -q <qry_file> -h 127.0.0.1:12345 -k 1 -s
84 | ```
85 | 
86 | 
87 | 
88 | 
89 | 
90 | 
91 | 
92 | 
93 | 
94 | 
95 | 
96 | 
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/build/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/collections/README.md:
--------------------------------------------------------------------------------
1 | Each collection is located in its own subdirectory.
2 | The subdirectory's name is the identifier of the collection.
3 | A collection consists of three files:
4 | 
5 |   * dict.txt: Contains all words of the collection,
6 |     one word per line. Line format: word <space> id.
7 |   * doc_names.txt: Contains the title of each document.
8 |   * text_int_SURF.sdsl: A concatenation of the documents.
9 | 


--------------------------------------------------------------------------------
/collections/gov2/index/space_usage_IDX_D.html:
--------------------------------------------------------------------------------
  1 | <html>
  2 |    <head>
  3 |     <meta http-equiv="Content-Type" content="text/html;charset=utf-8">
  4 |     <title>sdsl data structure visualization</title>
  5 |     <script src="http://d3js.org/d3.v2.js"></script>
  6 |     <style type="text/css">
  7 |       path { stroke: #000; stroke-width: 0.8; cursor: pointer; }
  8 |       text { font: 11px sans-serif; cursor: pointer; }
  9 |       body { width: 900; margin: 0 auto; }
 10 |       h1 { text-align: center; margin: .5em 0; }
 11 |       #breadcrumbs { display: none; }
 12 |       svg { font: 10px sans-serif; }
 13 |      </style>
 14 |   </head>
 15 | <body marginwidth="0" marginheight="0">
 16 | <button><a id="download">Save as SVG</a></button>
 17 |   <div id="chart"></div>
 18 | <script type="text/javascript">
 19 | var w = 800, // chart width
 20 |   h = w, // chart height, kept equal to the width
 21 |   r = w / 2, // chart radius
 22 |   x = d3.scale.linear().range([0, 2 * Math.PI]), // maps a node's x position to an angle
 23 |   y = d3.scale.pow().exponent(1.3).domain([0, 1]).range([0, r]), // maps a node's y position to a radius
 24 |   p = 5, // padding added around the chart and before each label
 25 |   color = d3.scale.category20c(),
 26 |   duration = 1000; // passed to every transition's .duration()
 27 | // Root SVG plus a <g> translated so that the origin sits at the chart centre.
 28 | var vis = d3.select("#chart").append("svg:svg")
 29 |   .attr("width", w + p * 2)
 30 |   .attr("height", h + p * 2)
 31 |   .append("g")
 32 |   .attr("transform", "translate(" + (r + p) + "," + (r + p) + ")");
 33 | // NOTE(review): this appends a "p" element inside the SVG <g>; presumably it is not rendered as HTML text - confirm.
 34 | vis.append("p")
 35 |   .attr("id", "intro")
 36 |   .text("Click to zoom!");
 37 | // Partition layout: a node's value is its "size"; the layout spans 2*PI by r*r and keeps the input order.
 38 | var partition = d3.layout.partition()
 39 |   .sort(null)
 40 |   .size([2 * Math.PI, r * r])
 41 |   .value(function(d) { return d.size; });
 42 | // Arc generator: angles are clamped to [0, 2*PI] and radii to >= 0 after applying the x/y scales.
 43 | var arc = d3.svg.arc()
 44 |   .startAngle(function(d) { return Math.max(0, Math.min(2 * Math.PI, x(d.x))); })
 45 |   .endAngle(function(d) { return Math.max(0, Math.min(2 * Math.PI, x(d.x + d.dx))); })
 46 |   .innerRadius(function(d) { return Math.max(0, d.y ? y(d.y) : d.y); })
 47 |   .outerRadius(function(d) { return Math.max(0, y(d.y + d.dy)); });
 48 | 
 49 |       
 50 | var spaceJSON = {
 51 | 	"class_name":"surf::idx_d",
 52 | 	"name":"",
 53 | 	"size":"102898883533",
 54 | 	"children":[
 55 | 		{
 56 | 			"class_name":"surf::doc_perm",
 57 | 			"name":"docperm",
 58 | 			"size":"157532402",
 59 | 			"children":[
 60 | 				{
 61 | 					"class_name":"int_vector",
 62 | 					"name":"len2id",
 63 | 					"size":"78766201"
 64 | 				},
 65 | 				{
 66 | 					"class_name":"int_vector",
 67 | 					"name":"id2len",
 68 | 					"size":"78766201"
 69 | 				}
 70 | 			]
 71 | 		},
 72 | 		{
 73 | 			"class_name":"surf::df_sada",
 74 | 			"name":"df",
 75 | 			"size":"2485266971",
 76 | 			"children":[
 77 | 				{
 78 | 					"class_name":"select_support_rrr",
 79 | 					"name":"m_sel",
 80 | 					"size":"0"
 81 | 				},
 82 | 				{
 83 | 					"class_name":"rrr_vector",
 84 | 					"name":"m_bv",
 85 | 					"size":"2485266971",
 86 | 					"children":[
 87 | 						{
 88 | 							"class_name":"bit_vector",
 89 | 							"name":"invert",
 90 | 							"size":"2908768"
 91 | 						},
 92 | 						{
 93 | 							"class_name":"int_vector",
 94 | 							"name":"btnrp",
 95 | 							"size":"98897601"
 96 | 						},
 97 | 						{
 98 | 							"class_name":"bit_vector",
 99 | 							"name":"btnr",
100 | 							"size":"1723173752"
101 | 						},
102 | 						{
103 | 							"class_name":"int_vector",
104 | 							"name":"rank_samples",
105 | 							"size":"101806353"
106 | 						},
107 | 						{
108 | 							"class_name":"int_vector",
109 | 							"name":"bt",
110 | 							"size":"558480489"
111 | 						},
112 | 						{
113 | 							"class_name":"unsigned long",
114 | 							"name":"size",
115 | 							"size":"8"
116 | 						}
117 | 					]
118 | 				}
119 | 			]
120 | 		},
121 | 		{
122 | 			"class_name":"wt_int",
123 | 			"name":"wtd",
124 | 			"size":"77923692196",
125 | 			"children":[
126 | 				{
127 | 					"class_name":"select_support_scan",
128 | 					"name":"tree_select_0",
129 | 					"size":"0"
130 | 				},
131 | 				{
132 | 					"class_name":"rank_support_v5",
133 | 					"name":"tree_rank",
134 | 					"size":"4583746616",
135 | 					"children":[
136 | 						{
137 | 							"class_name":"int_vector",
138 | 							"name":"cumulative_counts",
139 | 							"size":"4583746616"
140 | 						}
141 | 					]
142 | 				},
143 | 				{
144 | 					"class_name":"unsigned int",
145 | 					"name":"max_level",
146 | 					"size":"4"
147 | 				},
148 | 				{
149 | 					"class_name":"bit_vector",
150 | 					"name":"tree",
151 | 					"size":"73339945560"
152 | 				},
153 | 				{
154 | 					"class_name":"unsigned long",
155 | 					"name":"sigma",
156 | 					"size":"8"
157 | 				},
158 | 				{
159 | 					"class_name":"select_support_scan",
160 | 					"name":"tree_select_1",
161 | 					"size":"0"
162 | 				},
163 | 				{
164 | 					"class_name":"unsigned long",
165 | 					"name":"size",
166 | 					"size":"8"
167 | 				}
168 | 			]
169 | 		},
170 | 		{
171 | 			"class_name":"csa_wt",
172 | 			"name":"csa",
173 | 			"size":"22332391964",
174 | 			"children":[
175 | 				{
176 | 					"class_name":"int_alphabet",
177 | 					"name":"alphabet",
178 | 					"size":"171403491",
179 | 					"children":[
180 | 						{
181 | 							"class_name":"unsigned long",
182 | 							"name":"m_sigma",
183 | 							"size":"8"
184 | 						},
185 | 						{
186 | 							"class_name":"int_vector",
187 | 							"name":"m_C",
188 | 							"size":"171403441"
189 | 						},
190 | 						{
191 | 							"class_name":"rank_support_sd",
192 | 							"name":"m_char_rank",
193 | 							"size":"0"
194 | 						},
195 | 						{
196 | 							"class_name":"select_support_sd",
197 | 							"name":"m_char_select",
198 | 							"size":"0"
199 | 						},
200 | 						{
201 | 							"class_name":"sd_vector",
202 | 							"name":"m_char",
203 | 							"size":"42",
204 | 							"children":[
205 | 								{
206 | 									"class_name":"select_support_mcl",
207 | 									"name":"high_1_select",
208 | 									"size":"8"
209 | 								},
210 | 								{
211 | 									"class_name":"bit_vector",
212 | 									"name":"high",
213 | 									"size":"8"
214 | 								},
215 | 								{
216 | 									"class_name":"int_vector",
217 | 									"name":"low",
218 | 									"size":"9"
219 | 								},
220 | 								{
221 | 									"class_name":"select_support_mcl",
222 | 									"name":"high_0_select",
223 | 									"size":"8"
224 | 								},
225 | 								{
226 | 									"class_name":"unsigned char",
227 | 									"name":"wl",
228 | 									"size":"1"
229 | 								},
230 | 								{
231 | 									"class_name":"unsigned long",
232 | 									"name":"size",
233 | 									"size":"8"
234 | 								}
235 | 							]
236 | 						}
237 | 					]
238 | 				},
239 | 				{
240 | 					"class_name":"int_vector",
241 | 					"name":"isa_samples",
242 | 					"size":"102697"
243 | 				},
244 | 				{
245 | 					"class_name":"int_vector",
246 | 					"name":"sa_samples",
247 | 					"size":"102689"
248 | 				},
249 | 				{
250 | 					"class_name":"wt_int",
251 | 					"name":"wavelet_tree",
252 | 					"size":"22160783087",
253 | 					"children":[
254 | 						{
255 | 							"class_name":"unsigned int",
256 | 							"name":"max_level",
257 | 							"size":"4"
258 | 						},
259 | 						{
260 | 							"class_name":"select_support_rrr",
261 | 							"name":"tree_select_1",
262 | 							"size":"0"
263 | 						},
264 | 						{
265 | 							"class_name":"rank_support_rrr",
266 | 							"name":"tree_rank",
267 | 							"size":"0"
268 | 						},
269 | 						{
270 | 							"class_name":"unsigned long",
271 | 							"name":"sigma",
272 | 							"size":"8"
273 | 						},
274 | 						{
275 | 							"class_name":"select_support_rrr",
276 | 							"name":"tree_select_0",
277 | 							"size":"0"
278 | 						},
279 | 						{
280 | 							"class_name":"rrr_vector",
281 | 							"name":"tree",
282 | 							"size":"22160783067",
283 | 							"children":[
284 | 								{
285 | 									"class_name":"bit_vector",
286 | 									"name":"invert",
287 | 									"size":"37834112"
288 | 								},
289 | 								{
290 | 									"class_name":"int_vector",
291 | 									"name":"btnrp",
292 | 									"size":"1399861673"
293 | 								},
294 | 								{
295 | 									"class_name":"bit_vector",
296 | 									"name":"btnr",
297 | 									"size":"12059078592"
298 | 								},
299 | 								{
300 | 									"class_name":"int_vector",
301 | 									"name":"rank_samples",
302 | 									"size":"1399861681"
303 | 								},
304 | 								{
305 | 									"class_name":"int_vector",
306 | 									"name":"bt",
307 | 									"size":"7264147001"
308 | 								},
309 | 								{
310 | 									"class_name":"unsigned long",
311 | 									"name":"size",
312 | 									"size":"8"
313 | 								}
314 | 							]
315 | 						},
316 | 						{
317 | 							"class_name":"unsigned long",
318 | 							"name":"size",
319 | 							"size":"8"
320 | 						}
321 | 					]
322 | 				}
323 | 			]
324 | 		}
325 | 	]
326 | };
327 | 
328 | 
329 |   // Lay out the hierarchy, then draw one <path> arc and one <text> label per node.
330 |   var nodes = partition.nodes(spaceJSON);
331 | 
332 |   var path = vis.selectAll("path").data(nodes);
333 |   path.enter().append("path")
334 |     .attr("id", function(d, i) { return "path-" + i; })
335 |     .attr("d", arc)
336 |     .attr("fill-rule", "evenodd")
337 |     .style("fill", colour)
338 |     .on("click", click);
339 |   // Tooltip for each arc: class name, member name and size in MB.
340 |   path.append("title").text(function(d) { return 'class name: ' + d.class_name + '\nmember_name: ' + d.name + '\n size: ' + sizeMB(d) });
341 |   // Labels: text colour depends on the brightness of the arc's fill; anchor and rotation follow the arc's mid-angle.
342 |   var text = vis.selectAll("text").data(nodes);
343 |   var textEnter = text.enter().append("text")
344 |     .style("opacity", 1)
345 |     .style("fill", function(d) {
346 |     return brightness(d3.rgb(colour(d))) < 125 ? "#eee" : "#000";
347 |     })
348 |     .attr("text-anchor", function(d) {
349 |     return x(d.x + d.dx / 2) > Math.PI ? "end" : "start";
350 |     })
351 |     .attr("dy", ".2em")
352 |     .attr("transform", function(d) {
353 |     var multiline = (d.name || "").split(" ").length > 1,
354 |       angle = x(d.x + d.dx / 2) * 180 / Math.PI - 90,
355 |       rotate = angle + (multiline ? -.5 : 0);
356 |     return "rotate(" + rotate + ")translate(" + (y(d.y) + p) + ")rotate(" + (angle > 90 ? -180 : 0) + ")";
357 |     })
358 |     .on("click", click);
359 |   // Same tooltip text for the labels.
360 |   textEnter.append("title").text(function(d) { return 'class name: ' + d.class_name + '\nmember_name: ' + d.name + '\n size: ' + sizeMB(d) });
361 |   // The first two space-separated words of the name go into two <tspan> lines; the root (depth 0) and arcs with dx < 0.05 get no label.
362 |   textEnter.append("tspan")
363 |     .attr("x", 0)
364 |     .text(function(d) { return d.dx < 0.05 ? "" : d.depth ? d.name.split(" ")[0] : ""; });
365 |   textEnter.append("tspan")
366 |     .attr("x", 0)
367 |     .attr("dy", "1em")
368 |     .text(function(d) { return d.dx < 0.05 ? "" : d.depth ? d.name.split(" ")[1] || "" : ""; });
369 |   // Zoom to node d: tween every arc to the new scale domains and fade out labels that are not under d.
370 |   function click(d) {
371 |   path.transition()
372 |     .duration(duration)
373 |     .attrTween("d", arcTween(d));
374 | 
375 |   // Somewhat of a hack as we rely on arcTween updating the scales: the label tweens below read x and y, which the arc tween mutates.
376 |   text
377 |     .style("visibility", function(e) { // labels under d become visible at once; others keep their current state
378 |     return isParentOf(d, e) ? null : d3.select(this).style("visibility");
379 |     })
380 |     .transition().duration(duration)
381 |     .attrTween("text-anchor", function(d) {
382 |     return function() {
383 |       return x(d.x + d.dx / 2) > Math.PI ? "end" : "start";
384 |     };
385 |     })
386 |     .attrTween("transform", function(d) {
387 |     var multiline = (d.name || "").split(" ").length > 1;
388 |     return function() {
389 |       var angle = x(d.x + d.dx / 2) * 180 / Math.PI - 90,
390 |         rotate = angle + (multiline ? -.5 : 0);
391 |       return "rotate(" + rotate + ")translate(" + (y(d.y) + p) + ")rotate(" + (angle > 90 ? -180 : 0) + ")";
392 |     };
393 |     })
394 |     .style("opacity", function(e) { return isParentOf(d, e) ? 1 : 1e-6; })
395 |     .each("end", function(e) { // once the transition ends, hide the labels that are not under d
396 |     d3.select(this).style("visibility", isParentOf(d, e) ? null : "hidden");
397 |     });
398 |   }
399 | 
400 | 
401 | function sizeMB(d) {
402 | //  if (d.children) {
403 | //  var sum = calcSum(d);
404 | //  return (sum / (1024*1024)).toFixed(2) + 'MB';
405 | //  } else {
406 |   return (d.size / (1024*1024)).toFixed(2) + 'MB';
407 | //  }
408 | }
409 | 
410 | function calcSum(d) {
411 |   if(d.children) {
412 |   var sum = 0;
413 |   function recurse(d) {
414 |     if(d.children) d.children.forEach( function(child) { recurse(child); } );
415 |     else sum += d.size;
416 |   }
417 |   recurse(d,sum);
418 |   console.log(sum);
419 |   console.log(d.children);
420 |   return sum;
421 |   } else {
422 |   console.log(d.size);
423 |   return d.size;
424 |   }
425 | }
426 | 
427 | function isParentOf(p, c) {
428 |   if (p === c) return true;
429 |   if (p.children) {
430 |   return p.children.some(function(d) {
431 |     return isParentOf(d, c);
432 |   });
433 |   }
434 |   return false;
435 | }
436 | // Fill colour for a node: the category20c scale keyed by the node's name.
437 | function colour(d) {
438 |   return color(d.name);
439 | }
440 | // Build the "d" attribute tween used when zooming to node d.
441 | // Interpolate the scales! The x/y domains and the y range move from their current values to d's extent.
442 | function arcTween(d) {
443 |   var my = maxY(d), // outermost y + dy below d
444 |     xd = d3.interpolate(x.domain(), [d.x, d.x + d.dx]),
445 |     yd = d3.interpolate(y.domain(), [d.y, my]),
446 |     yr = d3.interpolate(y.range(), [d.y ? 20 : 0, r]); // inner radius 20 unless d starts at y = 0
447 |   return function(d) {
448 |   return function(t) { x.domain(xd(t)); y.domain(yd(t)).range(yr(t)); return arc(d); }; // mutates the shared x/y scales on every call
449 |   };
450 | }
451 | 
452 | // Interpolate the scales!
453 | function arcTween2(d) {
454 |   var xd = d3.interpolate(x.domain(), [d.x, d.x + d.dx]),
455 |     yd = d3.interpolate(y.domain(), [d.y, 1]),
456 |     yr = d3.interpolate(y.range(), [d.y ? 20 : 0, radius]);
457 |   return function(d, i) {
458 |   return i
459 |     ? function(t) { return arc(d); }
460 |     : function(t) { x.domain(xd(t)); y.domain(yd(t)).range(yr(t)); return arc(d); };
461 |   };
462 | }
463 | 
464 | function maxY(d) {
465 |   return d.children ? Math.max.apply(Math, d.children.map(maxY)) : d.y + d.dy;
466 | }
467 | 
468 | // http://www.w3.org/WAI/ER/WD-AERT/#color-contrast
469 | function brightness(rgb) {
470 |   return rgb.r * .299 + rgb.g * .587 + rgb.b * .114;
471 | }
472 | d3.select("#download").on("click", function () { // on click, turn the link into a base64 data URI of the #chart markup, saved as memorysun.svg
473 | d3.select(this).attr("href", 'data:application/octet-stream;base64,' + btoa(d3.select("#chart").html())).attr("download", "memorysun.svg")})
474 | // Run the zoom transition once for the root node.
475 | click(nodes[0]);
476 |     
477 | </script>
478 | </body>
479 | </html>
480 | 


--------------------------------------------------------------------------------
/collections/speeches/.gitignore:
--------------------------------------------------------------------------------
1 | index
2 | 


--------------------------------------------------------------------------------
/collections/speeches/dict.txt:
--------------------------------------------------------------------------------
  1 | 35 3
  2 | a 4
  3 | able 5
  4 | about 6
  5 | abundance 7
  6 | accept 8
  7 | again 9
  8 | ago 10
  9 | alabama 11
 10 | all 12
 11 | am 13
 12 | america 14
 13 | an 15
 14 | and 16
 15 | any 17
 16 | appomattox 18
 17 | are 19
 18 | as 20
 19 | ask 21
 20 | assault 22
 21 | at 23
 22 | atlantic 24
 23 | back 25
 24 | bare 26
 25 | basic 27
 26 | be 28
 27 | beautiful 29
 28 | because 30
 29 | been 31
 30 | believe 32
 31 | beloved 33
 32 | berlin 34
 33 | berliner 35
 34 | best 36
 35 | between 37
 36 | bin 38
 37 | black 39
 38 | boast 40
 39 | both 41
 40 | boys 42
 41 | brother 43
 42 | brotherhood 44
 43 | brutal 45
 44 | but 46
 45 | by 47
 46 | carlyle’s 48
 47 | cause 49
 48 | century 50
 49 | challenge 51
 50 | chancellor 52
 51 | children 53
 52 | choose 54
 53 | city 55
 54 | civi 56
 55 | clay 57
 56 | climb 58
 57 | colors 59
 58 | come 60
 59 | committed 61
 60 | common 62
 61 | company 63
 62 | concord 64
 63 | conflict 65
 64 | conquest 66
 65 | conviction 67
 66 | convocation 68
 67 | cooperation 69
 68 | country 70
 69 | cries 71
 70 | crisis 72
 71 | crooked 73
 72 | day 74
 73 | debate 75
 74 | decade 76
 75 | democracy 77
 76 | democratic 78
 77 | denial 79
 78 | depression 80
 79 | deserve 81
 80 | despair 82
 81 | destiny 83
 82 | die 84
 83 | difference 85
 84 | dignity 86
 85 | discord 87
 86 | distinguished 88
 87 | do 89
 88 | down 90
 89 | dream 91
 90 | dripping 92
 91 | during 93
 92 | earth 94
 93 | easy 95
 94 | ein 96
 95 | energy 97
 96 | engaged 98
 97 | envisage 99
 98 | equal 100
 99 | establish 101
100 | ever 102
101 | every 103
102 | everybody 104
103 | exalted 105
104 | fail 106
105 | faith 107
106 | fate 108
107 | father’s 109
108 | federal 110
109 | fellow 111
110 | fight 112
111 | flesh 113
112 | fly 114
113 | for 115
114 | forget 116
115 | fought 117
116 | france 118
117 | free 119
118 | freedom 120
119 | from 121
120 | general 122
121 | genuine 123
122 | germany 124
123 | girl 125
124 | glory 126
125 | go 127
126 | goal 128
127 | god 129
128 | god’s 130
129 | good 131
130 | government 132
131 | governor 133
132 | great 134
133 | greatest 135
134 | growth 136
135 | guest 137
136 | hand 138
137 | happen 139
138 | happening 140
139 | hard 141
140 | has 142
141 | have 143
142 | hazard 144
143 | heart 145
144 | here 146
145 | hew 147
146 | highest 148
147 | hill 149
148 | hindu 150
149 | his 151
150 | history 152
151 | hope 153
152 | hostile 154
153 | hymn 155
154 | i 156
155 | ich 157
156 | ideal 158
157 | if 159
158 | in 160
159 | inasmuch 161
160 | independence 162
161 | indian 163
162 | intend 164
163 | interposition 165
164 | into 166
165 | invite 167
166 | is 168
167 | issue 169
168 | it 170
169 | its 171
170 | itself 172
171 | jail 173
172 | jangle 174
173 | jawaharlal 175
174 | join 176
175 | justice 177
176 | kill 178
177 | knowing 179
178 | land 180
179 | last 181
180 | lay 182
181 | let 183
182 | lexington 184
183 | liberty 185
184 | lip 186
185 | little 187
186 | live 188
187 | lives 189
188 | long 190
189 | lord 191
190 | low 192
191 | made 193
192 | majesty 194
193 | man 195
194 | mankind 196
195 | many 197
196 | marked 198
197 | master 199
198 | may 200
199 | mayor 201
200 | me 202
201 | meaning 203
202 | measure 204
203 | meet 205
204 | member 206
205 | men 207
206 | met 208
207 | million 209
208 | mission 210
209 | moments 211
210 | moon 212
211 | more 213
212 | most 214
213 | mountain 215
214 | mountainside 216
215 | muslim 217
216 | my 218
217 | nation 219
218 | national 220
219 | need 221
220 | never 222
221 | new 223
222 | no 224
223 | non 225
224 | not 226
225 | nullify 227
226 | of 228
227 | oldest 229
228 | on 230
229 | once 231
230 | one 232
231 | only 233
232 | opportunity 234
233 | oppress 235
234 | or 236
235 | organize 237
236 | other 238
237 | our 239
238 | ours 240
239 | out 241
240 | outer 242
241 | own 243
242 | pain 244
243 | pandit 245
244 | party 246
245 | peace 247
246 | peaceful 248
247 | people 249
248 | pilgrim’s 250
249 | place 251
250 | plain 252
251 | play 253
252 | point 254
253 | postpone 255
254 | pray 256
255 | prejudice 257
256 | pride 258
257 | prison 259
258 | progress 260
259 | prosperity 261
260 | protest 262
261 | proud 263
262 | proudest 264
263 | purpose 265
264 | racist 266
265 | rarely 267
266 | rather 268
267 | read 269
268 | realize 270
269 | religion 271
270 | republic 272
271 | resolution 273
272 | reveal 274
273 | revolution 275
274 | rice 276
275 | right 277
276 | rights 278
277 | ring 279
278 | romanu 280
279 | rough 281
280 | russia 282
281 | satisfaction 283
282 | say 284
283 | search 285
284 | secret 286
285 | section 287
286 | security 288
287 | see 289
288 | self 290
289 | selma 291
290 | serve 292
291 | shall 293
292 | shape 294
293 | sing 295
294 | single 296
295 | sister 297
296 | skill 298
297 | so 299
298 | some 300
299 | something 301
300 | south 302
301 | space 303
302 | speak 304
303 | spirit 305
304 | stand 306
305 | stone 307
306 | straight 308
307 | strife 309
308 | struggle 310
309 | such 311
310 | suffering 312
311 | sum 313
312 | summon 314
313 | sweet 315
314 | symbolize 316
315 | symphony 317
316 | texas 318
317 | than 319
318 | that 320
319 | the 321
320 | thee 322
321 | their 323
322 | there 324
323 | these 325
324 | they 326
325 | things 327
326 | think 328
327 | this 329
328 | thousand 330
329 | throughout 331
330 | time 332
331 | times 333
332 | to 334
333 | today 335
334 | together 336
335 | told 337
336 | tonight 338
337 | too 339
338 | transform 340
339 | turning 341
340 | two 342
341 | unend 343
342 | unwill 344
343 | up 345
344 | urge 346
345 | us 347
346 | valley 348
347 | value 349
348 | vicious 350
349 | violence 351
350 | visit 352
351 | war 353
352 | was 354
353 | we 355
354 | weapon 356
355 | week 357
356 | welfare 358
357 | well 359
358 | were 360
359 | west 361
360 | what 362
361 | when 363
362 | where 364
363 | which 365
364 | while 366
365 | white 367
366 | who 368
367 | why 369
368 | will 370
369 | willing 371
370 | win 372
371 | with 373
372 | women 374
373 | words 375
374 | work 376
375 | world 377
376 | wrong 378
377 | years 379
378 | yet 380
379 | you 381
380 | your 382
381 | yourself 383
382 | – 384
383 | ’tis 385
384 | “my 386
385 | 


--------------------------------------------------------------------------------
/collections/speeches/doc_names.txt:
--------------------------------------------------------------------------------
1 | /devhome6/mpetri/collections/speeches/lyndon_b_johnson-we_shall_overcome.txt
2 | /devhome6/mpetri/collections/speeches/john_f_kennedy_-the_decision_to_go_to_the_moon.txt
3 | /devhome6/mpetri/collections/speeches/mahatma_gandhi-quit_india.txt
4 | /devhome6/mpetri/collections/speeches/martin_luther_king_jr-i_have_a_dream.txt
5 | /devhome6/mpetri/collections/speeches/john_f_kennedy_-ich_bin_ein_berliner.txt
6 | 


--------------------------------------------------------------------------------
/collections/speeches/text_int_SURF.sdsl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/collections/speeches/text_int_SURF.sdsl


--------------------------------------------------------------------------------
/collections/wikishort/.gitignore:
--------------------------------------------------------------------------------
1 | index
2 | 


--------------------------------------------------------------------------------
/collections/wikishort/text_int_SURF.sdsl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/collections/wikishort/text_int_SURF.sdsl


--------------------------------------------------------------------------------
/collections/wikishort/wikishort-src.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/collections/wikishort/wikishort-src.tar.gz


--------------------------------------------------------------------------------
/config/IDX-D-BM25.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D_BM25
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
4 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
5 | RANK_TYPE=surf::rank_bm25<>
6 | INDEX_TYPE=surf::idx_d<CSA_TYPE,WTD_TYPE,DF_TYPE,RANK_TYPE>
7 | PHRASE_SUPPORT=1
8 | 


--------------------------------------------------------------------------------
/config/IDX-D-LMDS.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D_LMDS
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
4 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
5 | RANK_TYPE=surf::rank_lmds<>
6 | INDEX_TYPE=surf::idx_d<CSA_TYPE,WTD_TYPE,DF_TYPE,RANK_TYPE>
7 | PHRASE_SUPPORT=1
8 | 


--------------------------------------------------------------------------------
/config/IDX-D-SANSLEN.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D_SANSLEN
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
4 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
5 | RANK_TYPE=surf::rank_bm25_simple_est<120,75>
6 | INDEX_TYPE=surf::idx_d<CSA_TYPE,WTD_TYPE,DF_TYPE,RANK_TYPE>
7 | PHRASE_SUPPORT=1
8 | 


--------------------------------------------------------------------------------
/config/IDX-D-TFIDF.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D_TFIDF
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
4 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
5 | RANK_TYPE=surf::rank_tfidf
6 | INDEX_TYPE=surf::idx_d<CSA_TYPE,WTD_TYPE,DF_TYPE,RANK_TYPE>
7 | PHRASE_SUPPORT=1
8 | 


--------------------------------------------------------------------------------
/config/IDX-D.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
4 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
5 | RANK_TYPE=surf::rank_bm25<>
6 | INDEX_TYPE=surf::idx_d<CSA_TYPE,WTD_TYPE,DF_TYPE,RANK_TYPE>
7 | PHRASE_SUPPORT=1
8 | 


--------------------------------------------------------------------------------
/config/IDX-D1R1-BM25.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D1R1_BM25
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTP_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
5 | WTU_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
6 | RANK_TYPE=surf::rank_bm25<>
7 | INDEX_TYPE=surf::idx_d1r1<CSA_TYPE,DF_TYPE,WTP_TYPE,WTU_TYPE,RANK_TYPE>
8 | 


--------------------------------------------------------------------------------
/config/IDX-D1R1-LMDS.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D1R1_LMDS
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTP_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
5 | WTU_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
6 | RANK_TYPE=surf::rank_lmds<>
7 | INDEX_TYPE=surf::idx_d1r1<CSA_TYPE,DF_TYPE,WTP_TYPE,WTU_TYPE,RANK_TYPE>
8 | 


--------------------------------------------------------------------------------
/config/IDX-D1R1-TFIDF.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D1R1_TFIDF
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTP_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
5 | WTU_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
6 | RANK_TYPE=surf::rank_tfidf
7 | INDEX_TYPE=surf::idx_d1r1<CSA_TYPE,DF_TYPE,WTP_TYPE,WTU_TYPE,RANK_TYPE>
8 | 


--------------------------------------------------------------------------------
/config/IDX-D1R1.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D1R1
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTP_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
5 | WTU_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
6 | RANK_TYPE=surf::rank_bm25<>
7 | INDEX_TYPE=surf::idx_d1r1<CSA_TYPE,DF_TYPE,WTP_TYPE,WTU_TYPE,RANK_TYPE>
8 | 


--------------------------------------------------------------------------------
/config/IDX-D1R1MTF.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_D1R1MTF
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTP_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
5 | WTU_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
6 | INDEX_TYPE=surf::idx_d1r1mtf<CSA_TYPE,DF_TYPE,WTP_TYPE,WTU_TYPE>
7 | 


--------------------------------------------------------------------------------
/config/IDX-DR-BM25.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_DR_BM25
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
5 | WTR_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
6 | RANK_TYPE=surf::rank_bm25<>
7 | INDEX_TYPE=surf::idx_dr<CSA_TYPE,DF_TYPE,WTD_TYPE,WTR_TYPE,RANK_TYPE>
8 | PHRASE_SUPPORT=1
9 | 


--------------------------------------------------------------------------------
/config/IDX-DR-LMDS.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_DR_LMDS
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
5 | WTR_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
6 | RANK_TYPE=surf::rank_lmds<>
7 | INDEX_TYPE=surf::idx_dr<CSA_TYPE,DF_TYPE,WTD_TYPE,WTR_TYPE,RANK_TYPE>
8 | PHRASE_SUPPORT=1
9 | 


--------------------------------------------------------------------------------
/config/IDX-DR-SANSLEN.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_DR_SANSLEN
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
5 | WTR_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
6 | RANK_TYPE=surf::rank_bm25_simple_est<120,75>
7 | INDEX_TYPE=surf::idx_dr<CSA_TYPE,DF_TYPE,WTD_TYPE,WTR_TYPE,RANK_TYPE>
8 | PHRASE_SUPPORT=1
9 | 


--------------------------------------------------------------------------------
/config/IDX-DR-TFIDF.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_DR_TFIDF
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
5 | WTR_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
6 | RANK_TYPE=surf::rank_tfidf
7 | INDEX_TYPE=surf::idx_dr<CSA_TYPE,DF_TYPE,WTD_TYPE,WTR_TYPE,RANK_TYPE>
8 | PHRASE_SUPPORT=1
9 | 


--------------------------------------------------------------------------------
/config/IDX-DR.config:
--------------------------------------------------------------------------------
1 | NAME=IDX_DR
2 | CSA_TYPE=sdsl::csa_wt<sdsl::wt_int<sdsl::rrr_vector<63>>,1000000,1000000>
3 | DF_TYPE=surf::df_sada<sdsl::rrr_vector<63>>
4 | WTD_TYPE=sdsl::wt_int<sdsl::bit_vector, sdsl::rank_support_v5<>, sdsl::select_support_scan<1>, sdsl::select_support_scan<0>>
5 | WTR_TYPE=sdsl::wt_int<sdsl::rrr_vector<63>>
6 | RANK_TYPE=surf::rank_bm25<>
7 | INDEX_TYPE=surf::idx_dr<CSA_TYPE,DF_TYPE,WTD_TYPE,WTR_TYPE,RANK_TYPE>
8 | PHRASE_SUPPORT=1
9 | 


--------------------------------------------------------------------------------
/config/INVIDX-E-BM25.config:
--------------------------------------------------------------------------------
1 | NAME=INVIDX_E_BM25
2 | PLIST_TYPE=surf::block_postings_list<128>
3 | RANK_TYPE=surf::rank_bm25<>
4 | INDEX_TYPE=surf::idx_invfile<PLIST_TYPE,RANK_TYPE,true>
5 | 


--------------------------------------------------------------------------------
/config/INVIDX-E-LMDS.config:
--------------------------------------------------------------------------------
1 | NAME=INVIDX_E_LMDS
2 | PLIST_TYPE=surf::block_postings_list<128>
3 | RANK_TYPE=surf::rank_lmds<>
4 | INDEX_TYPE=surf::idx_invfile<PLIST_TYPE,RANK_TYPE,true>
5 | 


--------------------------------------------------------------------------------
/config/INVIDX-E-TFIDF.config:
--------------------------------------------------------------------------------
1 | NAME=INVIDX_E_TFIDF
2 | PLIST_TYPE=surf::block_postings_list<128>
3 | RANK_TYPE=surf::rank_tfidf
4 | INDEX_TYPE=surf::idx_invfile<PLIST_TYPE,RANK_TYPE,true>
5 | 


--------------------------------------------------------------------------------
/config/INVIDX-E.config:
--------------------------------------------------------------------------------
1 | NAME=INVIDX_E
2 | PLIST_TYPE=surf::block_postings_list<128>
3 | RANK_TYPE=surf::rank_bm25<>
4 | INDEX_TYPE=surf::idx_invfile<PLIST_TYPE,RANK_TYPE,true>
5 | 


--------------------------------------------------------------------------------
/config/INVIDX-W.config:
--------------------------------------------------------------------------------
1 | NAME=INVIDX_W
2 | PLIST_TYPE=surf::block_postings_list<128>
3 | RANK_TYPE=surf::rank_bm25<>
4 | INDEX_TYPE=surf::idx_invfile<PLIST_TYPE,RANK_TYPE,false>
5 | 


--------------------------------------------------------------------------------
/experiments/check_equivalence.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Sanity check: for every ranker, start the daemon of each index type on the
 4 | # collection, run the same query file without and with -a (OR / AND output
 5 | # files), and report whether all index types produced identical output files.
 6 | 
 7 | COLLECTION="../collections/wikishort/"
 8 | 
 9 | RANKERS="BM25 LMDS TFIDF"
10 | 
11 | INDEXES="IDX_D IDX_DR IDX_D1R1 INVIDX_E"
12 | 
13 | SURF_PATH="../"
14 | QRYBIN=$SURF_PATH/build/surf_query
15 | QRYFILE="wikishort.qry"
16 | 
17 | # Succeeds iff all files passed as arguments are byte-identical.
18 | # diff(1) compares exactly two files (GNU diff rejects extra operands with
19 | # exit status 2), so every file is compared against the first one instead.
20 | # A missing or unreadable file counts as a mismatch.
21 | all_equal() {
22 |     local first="$1"
23 |     local other
24 |     shift
25 |     for other in "$@"
26 |     do
27 |         cmp -s "$first" "$other" || return 1
28 |     done
29 |     return 0
30 | }
31 | 
32 | for col in $COLLECTION
33 | do
34 |     for rank in $RANKERS
35 |     do
36 |         OUTPUT_FILES_OR=""
37 |         OUTPUT_FILES_AND=""
38 |         for idx in $INDEXES
39 |         do
40 |             DAEMONBIN="$SURF_PATH/build/surf_daemon-${idx}_$rank"
41 |             $DAEMONBIN -c $col > /dev/null 2>&1 &
42 |             $QRYBIN -q $QRYFILE -k 10 -r 1 -R 1> OUT_OR_${idx}_$rank 2>/dev/null
43 |             $QRYBIN -q $QRYFILE -k 10 -r 1 -R -a 2>/dev/null 1> OUT_AND_${idx}_$rank
44 |             OUTPUT_FILES_OR="$OUTPUT_FILES_OR OUT_OR_${idx}_$rank"
45 |             OUTPUT_FILES_AND="$OUTPUT_FILES_AND OUT_AND_${idx}_$rank"
46 |             # shut down daemon before the next index type is started
47 |             $QRYBIN -q $QRYFILE -s > /dev/null 2>&1
48 |         done
49 | 
50 |         # cmp for equality now...
51 |         if all_equal $OUTPUT_FILES_OR
52 |         then
53 |             echo "all good for OR and $rank"
54 |         else
55 |             echo "output for OR NOT EQUAL for $rank $col !!!"
56 |         fi
57 |         if all_equal $OUTPUT_FILES_AND
58 |         then
59 |             echo "all good for AND and $rank"
60 |         else
61 |             echo "output for AND NOT EQUAL for $rank $col !!!"
62 |         fi
63 |     done
64 | done
65 | 
66 | 
67 | # cleanup
68 | #rm -f OUT_*
69 | 


--------------------------------------------------------------------------------
/experiments/eval.R:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | # Reads one ';'-separated profile file and labels every row with the query mode.
 4 | read_profile <- function(path, mode) {
 5 |     cbind(read.csv(file=path, sep=";"), qry_and=mode)
 6 | }
 7 | 
 8 | # Stacks the AND rows on top of the OR rows of one query set and labels the
 9 | # result with the name of that query set.
10 | combine_modes <- function(and_rows, or_rows, set_name) {
11 |     cbind(rbind(and_rows, or_rows), qryfile=set_name)
12 | }
13 | 
14 | # IDX_SAWIT profiles (2005 first, then 2006; AND before OR)
15 | sawit_2005 <- combine_modes(
16 |     read_profile("trec-2005-and-profile-IDX_SAWIT.csv", "RANKED-AND"),
17 |     read_profile("trec-2005-profile-IDX_SAWIT.csv", "RANKED-OR"),
18 |     "trec2005")
19 | sawit_2006 <- combine_modes(
20 |     read_profile("trec-2006-and-profile-IDX_SAWIT.csv", "RANKED-AND"),
21 |     read_profile("trec-2006-profile-IDX_SAWIT.csv", "RANKED-OR"),
22 |     "trec2006")
23 | 
24 | # IDX_SAWIT2 profiles, same order
25 | sawit2_2005 <- combine_modes(
26 |     read_profile("trec-2005-and-profile-IDX_SAWIT2.csv", "RANKED-AND"),
27 |     read_profile("trec-2005-profile-IDX_SAWIT2.csv", "RANKED-OR"),
28 |     "trec2005")
29 | sawit2_2006 <- combine_modes(
30 |     read_profile("trec-2006-and-profile-IDX_SAWIT2.csv", "RANKED-AND"),
31 |     read_profile("trec-2006-profile-IDX_SAWIT2.csv", "RANKED-OR"),
32 |     "trec2006")
33 | 
34 | # one table holding the rows of both indexes
35 | all_rows <- rbind(rbind(sawit_2005, sawit_2006), rbind(sawit2_2005, sawit2_2006))
36 | 
37 | # box plot of qry_time/1000 per k, one panel per query set and query mode
38 | p <- ggplot(all_rows,aes(factor(k),qry_time/1000,fill=index)) +
39 |     geom_boxplot() +
40 |     facet_grid(qryfile ~ qry_and) +
41 |     scale_y_log10(limits=c(0.1, 10000),breaks=c(1,10,100,1000,10000)) +
42 |     annotation_logticks(sides = "lr")
43 | print(p)


--------------------------------------------------------------------------------
/experiments/eval_3.R:
--------------------------------------------------------------------------------
1 | eval_3.R


--------------------------------------------------------------------------------
/experiments/mem_info.csv:
--------------------------------------------------------------------------------
1 | IDX_D;   gov2;     22332391964; 77923692196;2485266971;          0;157532402
2 | IDX_DR;  gov2;     22332391964; 77923692196;2485266971;32641953354;157532402
3 | IDX_D1R1;gov2;     22332391964; 10218453986;2485266971;16665796154;157532402
4 | IDX_D;   cluewebB; 49302242460;140127439444;3305861323;          0;326432770
5 | IDX_DR;  cluewebB; 49302242460;140127439444;3305861323;53089093770;326432770
6 | IDX_D1R1;cluewebB; 49302242460; 30655284746;3305861323;31069499370;326432770
7 | 


--------------------------------------------------------------------------------
/experiments/mem_used.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | CUR_DIR=`pwd`
 3 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in
 4 | cd "${MY_DIR}"
 5 | MY_DIR=`pwd`
 6 | SURF_PATH=/scratch/VR0052/ESA2014/surf
 7 | 
 8 | COLLECTIONS="$SURF_PATH/collections/gov2 $SURF_PATH/collections/cluewebB"
 9 | EXP_DIR="$SURF_PATH/experiments"
10 | PORT=12345
11 | 
12 | INDEXES="IDX_D IDX_DR IDX_D1R1"
13 | 
14 | for col in $COLLECTIONS
15 | do
16 |     echo $col
17 |     for idx in $INDEXES
18 |     do
19 | 	echo $idx
20 |         $SURF_PATH/build_turpin/surf_index-$idx -c $col 
21 |     done
22 | done
23 | 
24 | cd "${CUR_DIR}"
25 | 


--------------------------------------------------------------------------------
/experiments/nodes_evaluated.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs the TREC 2005 and 2006 efficiency queries against the daemon of every
 3 | # listed index for k = 10, 100 and 1000 and appends the query output to one
 4 | # CSV file per query set.
 5 | CUR_DIR=`pwd`
 6 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in
 7 | cd "${MY_DIR}"
 8 | MY_DIR=`pwd`
 9 | #SURF_PATH=$MY_DIR/..
10 | SURF_PATH=/scratch/VR0052/ESA2014/surf
11 | 
12 | COLLECTIONS="$SURF_PATH/collections/gov2 $SURF_PATH/collections/cluewebB"
13 | EXP_DIR="$SURF_PATH/experiments"
14 | PORT=12345
15 | 
16 | INDEXES="IDX_DR IDX_D IDX_D_SANSLEN"
17 | 
18 | RESULT_2005="$EXP_DIR/nodes_evaluated_2005.csv"
19 | RESULT_2006="$EXP_DIR/nodes_evaluated_2006.csv"
20 | HEADER="qryid;collection;index;qrymode;k;qrylen;res_size;qry_time;search_time;nodes_evaluated;nodes_total;postings_evaluated;postings_total;client_time"
21 | 
22 | # The header belongs in the files the rows are appended to. It is only written
23 | # when a result file is missing or empty, so rows of earlier runs are kept and
24 | # the header is never duplicated.
25 | # NOTE(review): assumes surf_query does not print a header line itself - confirm.
26 | for out in "$RESULT_2005" "$RESULT_2006"
27 | do
28 |     [ -s "$out" ] || echo "$HEADER" > "$out"
29 | done
30 | 
31 | for col in $COLLECTIONS
32 | do
33 |     for idx in $INDEXES
34 |     do
35 |         $SURF_PATH/build_turpin/surf_daemon-$idx -c $col -p $PORT &
36 |         for k in 10 100 1000 
37 |         do
38 |             $SURF_PATH/build_turpin/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2005-efficiency-1000.qry -k $k -r 1 -p >> "$RESULT_2005"
39 |             $SURF_PATH/build_turpin/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-1000.qry -k $k -r 1 -p >> "$RESULT_2006"
40 |         done
41 |         # shut down daemon
42 |         $SURF_PATH/build_turpin/surf_query -h localhost:$PORT -q $SURF_PATH/queries/wiki.q -k 1 -s > /dev/null
43 |     done
44 | done
45 | 
46 | cd "${CUR_DIR}"
47 | 


--------------------------------------------------------------------------------
/experiments/phrases_time.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs the TREC 2006 efficiency queries with option -P 10 against the daemon of
 3 | # every listed index for k = 10 and 100 and appends the query output to a CSV file.
 4 | CUR_DIR=`pwd`
 5 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in
 6 | cd "${MY_DIR}"
 7 | MY_DIR=`pwd`
 8 | SURF_PATH=$MY_DIR/..
 9 | #SURF_PATH=/scratch/VR0052/ESA2014/surf
10 | 
11 | #COLLECTIONS="$SURF_PATH/collections/gov2 $SURF_PATH/collections/cluewebB"
12 | COLLECTIONS="/devhome3/sgog/ESA2014/surf/collections/gov2"
13 | EXP_DIR="$SURF_PATH/experiments"
14 | PORT=12345
15 | 
16 | INDEXES="IDX_D"
17 | 
18 | RESULT="$EXP_DIR/phrase_time_2006-2.csv"
19 | HEADER="qryid;collection;index;qrymode;k;qrylen;res_size;qry_time;search_time;nodes_evaluated;nodes_total;postings_evaluated;postings_total;client_time"
20 | 
21 | # The header belongs in the file the rows are appended to. It is only written
22 | # when that file is missing or empty, so rows of earlier runs are kept and the
23 | # header is never duplicated.
24 | # NOTE(review): assumes surf_query does not print a header line itself - confirm.
25 | [ -s "$RESULT" ] || echo "$HEADER" > "$RESULT"
26 | 
27 | for col in $COLLECTIONS
28 | do
29 |     for idx in $INDEXES
30 |     do
31 |         $SURF_PATH/build/surf_daemon-$idx -c $col -p $PORT &
32 |         for k in 10 100
33 |         do
34 |             $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-1000.qry -k $k -r 1 -P 10 -p >> "$RESULT"
35 |         done
36 |         # shut down daemon (surf_query from the same build directory as the daemon)
37 |         $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/wiki.q -k 1 -s > /dev/null
38 |     done
39 | done
40 | 
41 | cd "${CUR_DIR}"
42 | 


--------------------------------------------------------------------------------
/experiments/rank_times.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs the TREC 2005 and 2006 efficiency queries (with -a, k = 100) against the
 3 | # daemon of every index/ranker combination and appends the query output to one
 4 | # CSV file per query set.
 5 | CUR_DIR=`pwd`
 6 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in
 7 | cd "${MY_DIR}"
 8 | MY_DIR=`pwd`
 9 | #SURF_PATH=$MY_DIR/..
10 | SURF_PATH=/scratch/VR0052/ESA2014/surf
11 | 
12 | COLLECTIONS="$SURF_PATH/collections/gov2"
13 | EXP_DIR="$SURF_PATH/experiments"
14 | PORT=12345
15 | 
16 | INDEXES="IDX_DR IDX_D IDX_D1R1 INVIDX_E"
17 | RANKERS="BM25 TFIDF LMDS"
18 | 
19 | RESULT_2005="$EXP_DIR/ranker_times_2005.csv"
20 | RESULT_2006="$EXP_DIR/ranker_times_2006.csv"
21 | HEADER="qryid;collection;ranker;index;qrymode;k;qrylen;res_size;qry_time;search_time;nodes_evaluated;nodes_total;postings_evaluated;postings_total;client_time"
22 | 
23 | # The header belongs in the files the rows are appended to; writing it to
24 | # nodes_evaluated.csv would truncate the output of a different experiment.
25 | # It is only written when a result file is missing or empty, so rows of
26 | # earlier runs are kept and the header is never duplicated.
27 | # NOTE(review): assumes surf_query does not print a header line itself - confirm.
28 | for out in "$RESULT_2005" "$RESULT_2006"
29 | do
30 |     [ -s "$out" ] || echo "$HEADER" > "$out"
31 | done
32 | 
33 | for col in $COLLECTIONS
34 | do
35 |     for idx in $INDEXES
36 |     do
37 |         for rank in $RANKERS
38 |         do
39 |             $SURF_PATH/build/surf_daemon-${idx}_$rank -c $col -p $PORT &
40 |             for k in 100
41 |             do
42 |                 $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2005-efficiency-1000.qry -k $k -r 1 -a -p >> "$RESULT_2005"
43 |                 $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-1000.qry -k $k -r 1 -a -p >> "$RESULT_2006"
44 |             done
45 |             # shut down daemon
46 |             $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/wiki.q -k 1 -s > /dev/null
47 |         done
48 |     done
49 | done
50 | 
51 | cd "${CUR_DIR}"
52 | 


--------------------------------------------------------------------------------
/experiments/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | CUR_DIR=`pwd`
 3 | MY_DIR="$( cd "$( dirname "$0" )" && pwd )" # gets the directory where the script is located in
 4 | cd "${MY_DIR}"
 5 | MY_DIR=`pwd`
 6 | SURF_PATH="$MY_DIR/.."
 7 | 
 8 | COLLECTIONS="$SURF_PATH/collections/gov2/"
 9 | PORT=12345
10 | 
11 | for col in $COLLECTIONS
12 | do
13 |     for idx in $SURF_PATH/build/surf_daemon-IDX_SAWIT2
14 |     do
15 |         IDXNAME=$(echo $idx | sed 's/.*-\(.*\)/\1/g')
16 |         $idx -c $col -p $PORT &
17 |         for k in 10 100 1000 
18 |         do
19 |             $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2005-efficiency-100.qry -k $k -r 1 -p >> trec-2005.csv
20 |             $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-100.qry -k $k -r 1 -p >> trec-2006.csv
21 |             $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2005-efficiency-100.qry -k $k -r 1 -p -a >> trec-2005.csv
22 |             $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/trec2006-efficiency-100.qry -k $k -r 1 -p -a >> trec-2006.csv
23 |         done
24 |         # shut down daemon
25 |         $SURF_PATH/build/surf_query -h localhost:$PORT -q $SURF_PATH/queries/wiki.q -k 1 -s > /dev/null
26 |     done
27 | done
28 | 
29 | cd "${CUR_DIR}"
30 | 


--------------------------------------------------------------------------------
/experiments/sbatch_mem_used.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # use sbatch to launch script
 4 | #
 5 | # NOTE: the #SBATCH directives must stay above the first executable command;
 6 | # sbatch stops processing directives at the first non-comment, non-blank line.
 7 | 
 8 | #SBATCH -p turpin
 9 | #SBATCH --job-name=ex_wtd
10 | #SBATCH --account="VR0280"
11 | #SBATCH --nodes=1
12 | #SBATCH --ntasks=1
13 | #SBATCH --time=10-12:00:00
14 | #SBATCH --mem=512GB
15 | #SBATCH --mail-user simon.gog@unimelb.edu.au
16 | #SBATCH --mail-type=BEGIN
17 | #SBATCH --mail-type=END
18 | #SBATCH --mail-type=FAIL
19 | 
20 | BASE_DIR=/scratch/VR0052/ESA2014/surf/experiments
21 | 
22 | module load gcc
23 | module load cmake
24 | 
25 | $BASE_DIR/mem_used.sh
26 | 
27 | ## --exclusive
28 | 


--------------------------------------------------------------------------------
/experiments/sbatch_nodes_evaluated.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # use sbatch to launch script
 4 | #
 5 | # NOTE: the #SBATCH directives must stay above the first executable command;
 6 | # sbatch stops processing directives at the first non-comment, non-blank line.
 7 | 
 8 | #SBATCH -p turpin
 9 | #SBATCH --job-name=nodes_evaluated.sh
10 | #SBATCH --account="VR0280"
11 | #SBATCH --nodes=1
12 | #SBATCH --ntasks=1
13 | #SBATCH --time=10-12:00:00
14 | #SBATCH --mem=300GB
15 | #SBATCH --mail-user simon.gog@unimelb.edu.au
16 | #SBATCH --mail-type=BEGIN
17 | #SBATCH --mail-type=END
18 | #SBATCH --mail-type=FAIL
19 | 
20 | BASE_DIR=/scratch/VR0052/ESA2014/surf/experiments
21 | 
22 | module load gcc
23 | module load cmake
24 | 
25 | $BASE_DIR/nodes_evaluated.sh
26 | 
27 | 


--------------------------------------------------------------------------------
/experiments/time_per_wtnode.R:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | # Reads one ';'-separated profile file and labels every row with the query mode.
 4 | read_profile <- function(path, mode) {
 5 |     cbind(read.csv(file=path, sep=";"), qry_and=mode)
 6 | }
 7 | 
 8 | # Stacks the AND rows on top of the OR rows of one query set and labels the
 9 | # result with the name of that query set.
10 | combine_modes <- function(and_rows, or_rows, set_name) {
11 |     cbind(rbind(and_rows, or_rows), qryfile=set_name)
12 | }
13 | 
14 | # IDX_SAWIT profiles (2005 first, then 2006; AND before OR)
15 | sawit_2005 <- combine_modes(
16 |     read_profile("trec-2005-and-profile-IDX_SAWIT.csv", "RANKED-AND"),
17 |     read_profile("trec-2005-profile-IDX_SAWIT.csv", "RANKED-OR"),
18 |     "trec2005")
19 | sawit_2006 <- combine_modes(
20 |     read_profile("trec-2006-and-profile-IDX_SAWIT.csv", "RANKED-AND"),
21 |     read_profile("trec-2006-profile-IDX_SAWIT.csv", "RANKED-OR"),
22 |     "trec2006")
23 | 
24 | # IDX_SAWIT2 profiles, same order
25 | sawit2_2005 <- combine_modes(
26 |     read_profile("trec-2005-and-profile-IDX_SAWIT2.csv", "RANKED-AND"),
27 |     read_profile("trec-2005-profile-IDX_SAWIT2.csv", "RANKED-OR"),
28 |     "trec2005")
29 | sawit2_2006 <- combine_modes(
30 |     read_profile("trec-2006-and-profile-IDX_SAWIT2.csv", "RANKED-AND"),
31 |     read_profile("trec-2006-profile-IDX_SAWIT2.csv", "RANKED-OR"),
32 |     "trec2006")
33 | 
34 | # one table holding the rows of both indexes
35 | all_rows <- rbind(rbind(sawit_2005, sawit_2006), rbind(sawit2_2005, sawit2_2006))
36 | 
37 | # scatter plot of qry_time over wt_searched, one panel per query set and query mode
38 | p <- ggplot(all_rows,aes(wt_searched,qry_time,colour=index)) +
39 |     geom_point() +
40 |     facet_grid(qryfile ~ qry_and)
41 | #p <- p + scale_y_log10(limits=c(0.1, 10000))
42 | print(p)


--------------------------------------------------------------------------------
/experiments/wikishort.qry:
--------------------------------------------------------------------------------
1 | 1;of air
2 | 2;until death
3 | 3;history the first
4 | 4;was one born
5 | 


--------------------------------------------------------------------------------
/extras/clueweb-collection.indricfg:
--------------------------------------------------------------------------------
 1 | <parameters>
 2 |     <memory>100G</memory>
 3 |     <storeDocs>true</storeDocs>
 4 |     <index>cluewebB</index>
 5 |     <corpus>
 6 |         <path>/collections/clueweb/CLUEWEB09_1/ClueWeb09_English_1</path>
 7 |         <class>warc</class>
 8 |     </corpus>
 9 |     <stemmer><name>krovetz</name></stemmer>
10 | </parameters>
11 | 


--------------------------------------------------------------------------------
/extras/gov2-collection.indricfg:
--------------------------------------------------------------------------------
 1 | <parameters>
 2 |     <memory>100G</memory>
 3 |     <storeDocs>true</storeDocs>
 4 |     <index>gov2</index>
 5 |     <corpus>
 6 |         <path>/collections/TREC/datasets/gov2/gov2-corpus</path>
 7 |         <class>trecweb</class>
 8 |     </corpus>
 9 |     <stemmer><name>krovetz</name></stemmer>
10 | </parameters>
11 | 


--------------------------------------------------------------------------------
/extras/speeches-collection.indricfg:
--------------------------------------------------------------------------------
 1 | <parameters>
 2 |     <memory>100G</memory>
 3 |     <storeDocs>true</storeDocs>
 4 |     <index>speeches</index>
 5 |     <corpus>
 6 |         <path>/devhome6/mpetri/collections/speeches/</path>
 7 |         <class>txt</class>
 8 |     </corpus>
 9 |     <stemmer><name>krovetz</name></stemmer>
10 | </parameters>
11 | 


--------------------------------------------------------------------------------
/extras/trec8-collection.indricfg:
--------------------------------------------------------------------------------
 1 | <parameters>
 2 |     <memory>100G</memory>
 3 |     <storeDocs>true</storeDocs>
 4 |     <index>trec8</index>
 5 |     <corpus>
 6 |         <path>/devhome6/mpetri/collections/trec8/</path>
 7 |         <class>trectext</class>
 8 |     </corpus>
 9 |     <stemmer><name>krovetz</name></stemmer>
10 | </parameters>
11 | 


--------------------------------------------------------------------------------
/extras/wikishort-collection.indricfg:
--------------------------------------------------------------------------------
 1 | <parameters>
 2 |     <memory>100G</memory>
 3 |     <storeDocs>true</storeDocs>
 4 |     <index>wikishort</index>
 5 |     <corpus>
 6 |         <path>/devhome6/mpetri/collections/wikishort/</path>
 7 |         <class>txt</class>
 8 |     </corpus>
 9 |     <stemmer><name>krovetz</name></stemmer>
10 | </parameters>
11 | 


--------------------------------------------------------------------------------
/extras/wt10g-collection.indricfg:
--------------------------------------------------------------------------------
 1 | <parameters>
 2 |     <memory>100G</memory>
 3 |     <storeDocs>true</storeDocs>
 4 |     <index>wt10g</index>
 5 |     <corpus>
 6 |         <path>/collections/TREC/datasets/wt10g/</path>
 7 |         <class>trecweb</class>
 8 |     </corpus>
 9 |     <stemmer><name>krovetz</name></stemmer>
10 | </parameters>
11 | 


--------------------------------------------------------------------------------
/include/surf/.gitignore:
--------------------------------------------------------------------------------
1 | !*.hpp
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/include/surf/comm.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_COMM_H
 2 | #define SURF_COMM_H
 3 | 
 4 | #define REQ_PARSE_ERROR		0
 5 | #define REQ_RESPONE_OK		1
 6 | 
 7 | #define REQ_TYPE_QRY_OR		0
 8 | #define REQ_TYPE_QRY_AND	1
 9 | #define REQ_TYPE_QUIT		2
10 | 
11 | #define REQ_MODE_PROFILE	0
12 | #define REQ_MODE_TIME		1
13 | 
14 | #define MAX_QRY_LEN		 1024
15 | 
16 | struct surf_time_resp {
17 | 	uint8_t status;
18 |     uint64_t req_id;
19 |     uint64_t qry_id;
20 |     uint64_t qry_len;
21 |     uint64_t k;
22 |     uint64_t result_size;
23 |     uint64_t qry_time;
24 |     uint64_t search_time;
25 |     uint64_t wt_search_space;
26 |     uint64_t wt_nodes;
27 |     uint64_t postings_evaluated;
28 |     uint64_t postings_total;
29 |     char index[256];
30 |     char collection[256];
31 |     char ranker[256];
32 | };
33 | 
34 | struct surf_qry_request {
35 | 	uint8_t type;
36 | 	uint8_t mode;
37 |     uint8_t phrases;
38 |     double phrase_threshold;
39 | 	uint64_t id;
40 | 	uint64_t k;
41 |     uint8_t output_results;
42 |     uint8_t int_qry;
43 | 	char qry_str[MAX_QRY_LEN] = {0};
44 | };
45 | 
46 | struct surf_results {
47 |     uint64_t size;
48 |     double data[0];
49 | };
50 | 
51 | 
52 | #endif


--------------------------------------------------------------------------------
/include/surf/config.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_CONFIG_HPP
 2 | #define SURF_CONFIG_HPP
 3 | 
 4 | #include "sdsl/config.hpp"
 5 | #include <string>
 6 | #include <vector>
 7 | 
 8 | namespace surf{
 9 | 
10 | const std::string TEXT_FILENAME = "text_int_SURF.sdsl";
11 | const std::string DICT_FILENAME = "dict.txt";
12 | const std::string URL2ID_FILENAME = "url2id.txt";
13 | const std::string DOCNAMES_FILENAME = "doc_names.txt";
14 | const std::string SPACEUSAGE_FILENAME = "space_usage";
15 | 
16 | const std::string KEY_DOCWEIGHT = "docweights";
17 | const std::string KEY_DARRAY = "darray";
18 | const std::string KEY_U = "U";
19 | const std::string KEY_WTU = "wtu";
20 | const std::string KEY_UMARK = "Umark";
21 | const std::string KEY_URANK = "Urank";
22 | const std::string KEY_DOCPERM = "docperm";
23 | const std::string KEY_SADADF = "sadadf";
24 | const std::string KEY_WTD = "wtd";
25 | const std::string KEY_C = "C";
26 | const std::string KEY_WTC = "wtc";
27 | const std::string KEY_TMPCST = "tempcst";
28 | const std::string KEY_TMPDUP = "tmpdup";
29 | const std::string KEY_WTDUP  = "wtdup";
30 | const std::string KEY_WTDUP2  = "wtdup2";
31 | const std::string KEY_WTR  = "wtr";
32 | const std::string KEY_WTDP  = "wtdp";
33 | const std::string KEY_DUP  = "dup";
34 | const std::string KEY_R  = "R";                // =R1 in the paper
35 | const std::string KEY_DUPMARK  = "DUPmark";
36 | const std::string KEY_DUPRANK  = "DUPrank";
37 | const std::string KEY_DUP2  = "dup2";          // =R in the paper
38 | const std::string KEY_DOCCNT  = "doccnt";
39 | const std::string KEY_COLLEN  = "collen";
40 | const std::string KEY_DOCBORDER = "docborder";
41 | const std::string KEY_DOC_LENGTHS = "doclengths";
42 | const std::string KEY_INVFILE_TERM_RANGES = "invfile_term_ranges";
43 | const std::string KEY_INVFILE_PLISTS = "invfile_postings_lists";
44 | const std::string KEY_INVFILE_DOCPERM = "invfile_docperm";
45 | const std::string KEY_INVFILE_IDOCPERM = "invfile_inv_docperm";
46 | const std::string KEY_F_T = "Ft";
47 | const std::string KEY_H = "H";
48 | const std::string KEY_CSA = "csa";
49 | const std::string KEY_MAXTF = "maxtf";
50 | 
51 | std::vector<std::string> storage_keys = {KEY_DOCCNT,
52 | 										 KEY_DARRAY,
53 | 										 KEY_DOCPERM,
54 | 										 KEY_SADADF,
55 | 										 KEY_WTD,
56 | 										 KEY_C,
57 | 										 KEY_WTC,
58 | 										 KEY_TMPCST,
59 | 										 KEY_TMPDUP,
60 | 										 KEY_DUP,
61 | 										 KEY_DUP2,
62 |                                          KEY_R,
63 | 										 KEY_WTDUP,
64 | 										 KEY_WTDUP2,
65 |                                          KEY_WTR,
66 |                                          KEY_MAXTF,
67 | 										 KEY_DOCCNT,
68 | 										 KEY_DOC_LENGTHS,
69 |                                          KEY_COLLEN,
70 | 										 KEY_INVFILE_TERM_RANGES,
71 | 										 KEY_INVFILE_PLISTS,
72 |                                          KEY_H,
73 |                                          KEY_U,
74 |                                          KEY_WTU,
75 |                                          KEY_UMARK,
76 |                                          KEY_URANK,
77 |                                          KEY_CSA,
78 |                                          sdsl::conf::KEY_TEXT,
79 |                                          sdsl::conf::KEY_TEXT_INT,
80 |                                          sdsl::conf::KEY_SA,
81 |                                          sdsl::conf::KEY_LCP,
82 |                                          sdsl::conf::KEY_BWT,
83 |                                          sdsl::conf::KEY_BWT_INT,
84 |                                          sdsl::conf::KEY_PSI
85 |                };
86 | 
87 | } // end namespace
88 | #endif
89 | 


--------------------------------------------------------------------------------
/include/surf/construct_DUP2.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_CONSTRUCT_DUP2_HPP
 2 | #define SURF_CONSTRUCT_DUP2_HPP
 3 | 
 4 | #include <sdsl/int_vector.hpp>
 5 | 
 6 | namespace surf{
 7 | 
 8 | // generate the DUP2 array (= R in the paper) and
 9 | // the KEY_DUPMARK bitvector
10 | template<typename t_df>
11 | void construct_dup2(sdsl::cache_config& cc)
12 | {
13 |     using namespace sdsl;
14 |     using namespace std;
15 | 
16 |     string dup2_file = cache_file_name(surf::KEY_DUP2,cc);
17 |     if (!cache_file_exists(surf::KEY_DUP2,cc)){
18 |         cout<<"......dup2 does not exist. Generate it..."<<endl;
19 |         {
20 |             t_df df;
21 |             construct(df, "", cc, 0); // make sure that surf::KEY_DUP was generated 
22 |         }
23 |         int_vector_buffer<> dup(cache_file_name(surf::KEY_DUP, cc));
24 |         cout<<".........dup.size()="<<dup.size()<<endl;
25 |         cout<<".........dup.width()="<<(int)dup.width()<<endl;
26 |         {
27 |             bit_vector dup_mark(dup.size(), 1);
28 |             store_to_cache(dup_mark, surf::KEY_DUPMARK, cc);
29 |         }
30 |         int_vector_buffer<1> dup_mark(cache_file_name(surf::KEY_DUPMARK, cc));
31 |         int_vector_buffer<> dup2(dup2_file, std::ios::out, 1024*1024,
32 |                                  dup.width());
33 |         cout<<".........load df"<<endl;
34 |         t_df df;
35 |         load_from_cache(df, surf::KEY_SADADF, cc, true);
36 |         cout<<".........load cst"<<endl;
37 |         using cst_type =  typename t_df::cst_type;
38 |         cst_type cst;
39 |         load_from_file(cst, cache_file_name<cst_type>(surf::KEY_TMPCST, cc));
40 |         cout<<".........cst.size()="<<cst.size()<<endl;
41 |         auto root = cst.root();
42 |         auto left_most_leaf = cst.select_leaf(1);
43 |         uint64_t next_idx = 0;
44 |         for (auto& v : cst.children(root)){
45 |             if ( v == left_most_leaf )
46 |                 continue;
47 |             auto lb = cst.lb(v);
48 |             auto rb = cst.rb(v);
49 |             auto df_info = df(lb, rb);
50 |             std::vector<uint64_t> buf;
51 |             for (uint64_t i = std::get<1>(df_info); i <= std::get<2>(df_info); ++i) {
52 |                 buf.push_back(dup[i]); 
53 |             }
54 |             for (uint64_t i = next_idx; i < std::get<1>(df_info); ++i){
55 |                 dup_mark[i] = 0;
56 |             }
57 |             next_idx = std::get<2>(df_info)+1;
58 |             for (size_t i=0; i < buf.size(); ++i){
59 |                 dup2.push_back(buf[i]);
60 |             }
61 |         }
62 |         for (uint64_t i = next_idx; i < dup_mark.size(); ++i){
63 |             dup_mark[i]=0;
64 |         }
65 |     }
66 | }
67 | 
68 | }// end namespace
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/include/surf/construct_U.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_CONSTRUCT_U_HPP
 2 | #define SURF_CONSTRUCT_U_HPP
 3 | 
 4 | #include <sdsl/int_vector.hpp>
 5 | #include <type_traits>
 6 | 
 7 | namespace surf{
 8 | 
 9 | // generate the U array (= D^1 in the paper) and
10 | // the KEY_UMARK bitvector
11 | template<typename t_df>
12 | void construct_u(sdsl::cache_config& cc)
13 | {
14 |     using namespace sdsl;
15 |     using namespace std;
16 |     static_assert(std::is_same<typename t_df::cst_type::index_category, sdsl::cst_tag>::value, "CST class expected");
17 | 
18 |     string u_file = cache_file_name(surf::KEY_U,cc);
19 |     if (!cache_file_exists(surf::KEY_U,cc)){
20 |         cout<<"......U does not exist. Generate it..."<<endl;
21 |         {
22 |             t_df df;
23 |             construct(df, "", cc, 0); // make sure that the cst was generated
24 |         }
25 |         int_vector_buffer<> D_array(cache_file_name(surf::KEY_DARRAY, cc));
26 |         cout<<".........D.size()="<<D_array.size()<<endl;
27 |         cout<<".........D.width()="<<(int)D_array.width()<<endl;
28 |         cout<<".........load cst"<<endl;
29 |         using t_cst = typename t_df::cst_type;
30 |         t_cst cst;
31 |         load_from_file(cst, cache_file_name<t_cst>(surf::KEY_TMPCST, cc));
32 |         cout<<".........cst.size()="<<cst.size()<<endl;
33 |         string u_file = cache_file_name(surf::KEY_U, cc);
34 |         int_vector_buffer<> U(u_file, std::ios::out,
35 |                                    1024*1024, D_array.width());
36 |         string umark_file = cache_file_name(surf::KEY_UMARK, cc);
37 |         int_vector_buffer<1> Umark(umark_file, std::ios::out);
38 | 
39 |         uint64_t doc_cnt = 0;
40 |         load_from_cache(doc_cnt, KEY_DOCCNT, cc);
41 |         cout << ".........doc_cnt = " << doc_cnt << endl;
42 | 
43 |         std::vector<int64_t> last_occ(doc_cnt+1, -1);
44 | 
45 |         auto root = cst.root();
46 |         for (auto& v : cst.children(root)){
47 |             auto lb = cst.lb(v);
48 |             auto rb = cst.rb(v);
49 |             std::vector<uint64_t> buf;
50 |             for (auto i = lb; i<=rb; ++i){
51 |                 auto x = D_array[i];
52 |                 if ( last_occ[x] < (int64_t)lb ){
53 |                     buf.push_back(x);
54 |                 }
55 |                 last_occ[x] = i;
56 |             }
57 |             std::sort(buf.begin(), buf.end());
58 |             for (size_t i=0; i<buf.size();++i){
59 |                 U.push_back(buf[i]);
60 |                 Umark.push_back(1);
61 |             }
62 |             for (size_t i=0; i < rb-lb+1-buf.size(); ++i){
63 |                 Umark.push_back(0);
64 |             }
65 |         }
66 |     }
67 |     cout << "U and Umark generated" << endl;
68 | }
69 | 
70 | }// end namespace
71 | 
72 | #endif
73 | 


--------------------------------------------------------------------------------
/include/surf/construct_col_len.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_CONSTRUCT_COL_LEN_HPP
 2 | #define SURF_CONSTRUCT_COL_LEN_HPP
 3 | 
 4 | #include <sdsl/int_vector.hpp>
 5 | 
 6 | namespace surf{
 7 | 
 8 | template<uint8_t t_width>
 9 | void construct_col_len(sdsl::cache_config& cc)
10 | {
11 |     using namespace sdsl;
12 |     using namespace std;
13 |     static_assert(t_width == 0 or t_width == 8 , 
14 |         "construct_col_len: width must be `0` for integer alphabet and `8` for byte alphabet");
15 | 
16 |     if ( !cache_file_exists(KEY_COLLEN, cc) ){
17 |         const char* KEY_TEXT  = key_text_trait<t_width>::KEY_TEXT;
18 |         std::string text_file = cache_file_name(KEY_TEXT, cc);
19 |         if (!cache_file_exists(KEY_TEXT, cc)) {
20 |             std::cerr << "ERROR: construct_col_len: " << text_file
21 |                       << " does not exist. Abort." << std::endl;
22 |             return;
23 |         }
24 |         uint64_t n = 0;
25 |         int_vector_buffer<t_width> text(text_file);
26 |         n = text.size();
27 |         store_to_cache(n, KEY_COLLEN, cc);
28 |     }
29 | }
30 | 
31 | }// end namespace
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/include/surf/construct_darray.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_DARRAY_HPP
 2 | #define SURF_DARRAY_HPP
 3 | 
 4 | #include "config.hpp"
 5 | #include "construct_doc_perm.hpp"
 6 | #include "construct_doc_border.hpp"
 7 | #include <sdsl/suffix_arrays.hpp>
 8 | #include <algorithm>
 9 | 
10 | namespace surf{
11 | 
12 | template<uint8_t t_width>
13 | void construct_darray(sdsl::cache_config& cc)
14 | {
15 |     using namespace sdsl;
16 |     using namespace std;
17 |     if ( !cache_file_exists(KEY_DARRAY, cc) ) {
18 |         bit_vector doc_border;
19 |         construct_doc_border<t_width>(cc);
20 |         load_from_cache(doc_border, KEY_DOCBORDER, cc);
21 |         
22 |         int_vector_buffer<> sa(cache_file_name(conf::KEY_SA, cc));
23 | 
24 |         rank_support_v<> doc_border_rank(&doc_border);
25 |         uint64_t doc_cnt = doc_border_rank(doc_border.size());
26 | 
27 |         construct_doc_perm<t_width>(cc);
28 |         doc_perm dp;
29 |         load_from_cache(dp, KEY_DOCPERM,cc);
30 | 
31 |         int_vector<> darray(sa.size(), 0, bits::hi(doc_cnt)+1);
32 |         for (uint64_t i=0; i<sa.size(); ++i){
33 |             darray[i] = dp.id2len[doc_border_rank(sa[i])];
34 |         }
35 |         store_to_cache(darray, KEY_DARRAY, cc);
36 |     }
37 | }
38 | 
39 | }// end namespace
40 | 
41 | #endif
42 | 


--------------------------------------------------------------------------------
/include/surf/construct_doc_border.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_CONSTRUCT_DOC_BORDER_HPP
 2 | #define SURF_CONSTRUCT_DOC_BORDER_HPP
 3 | 
 4 | #include <sdsl/int_vector.hpp>
 5 | #include <algorithm>
 6 | 
 7 | namespace surf{
 8 | 
 9 | template<uint8_t t_width>
10 | void construct_doc_border(sdsl::cache_config& cc)
11 | {
12 |     using namespace sdsl;
13 |     using namespace std;
14 |     static_assert(t_width == 0 or t_width == 8 , 
15 |             "construct_doc_border: width must be `0` for integer alphabet and `8` for byte alphabet");
16 | 
17 |     if ( !cache_file_exists(KEY_DOCBORDER, cc) ) {
18 |         const char* KEY_TEXT  = key_text_trait<t_width>::KEY_TEXT;
19 |         std::string text_file = cache_file_name(KEY_TEXT, cc);
20 |         if (!cache_file_exists(KEY_TEXT, cc)) {
21 |             std::cerr << "ERROR: construct_doc_border: " << text_file
22 |                       << " does not exist. Abort." << std::endl;
23 |             return;
24 |         }
25 |         int_vector_buffer<t_width> text(text_file);
26 |         bit_vector doc_border(text.size(), 0);
27 |         for (uint64_t i=0; i < text.size(); ++i){
28 |             if ( 1 == text[i] ){
29 |                 doc_border[i] = 1;
30 |             }
31 |         }
32 |         store_to_cache(doc_border, KEY_DOCBORDER, cc);
33 |     }
34 | }
35 | 
36 | }// end namespace
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/include/surf/construct_doc_cnt.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_CONSTRUCT_DOC_CNT_HPP
 2 | #define SURF_CONSTRUCT_DOC_CNT_HPP
 3 | 
 4 | #include <sdsl/int_vector.hpp>
 5 | #include <algorithm>
 6 | 
 7 | namespace surf{
 8 | 
 9 | template<uint8_t t_width>
10 | void construct_doc_cnt(sdsl::cache_config& cc)
11 | {
12 |     using namespace sdsl;
13 |     using namespace std;
14 |     static_assert(t_width == 0 or t_width == 8 , 
15 |                 "construct_doc_cnt: width must be `0` for integer alphabet and `8` for byte alphabet");
16 | 
17 |     if ( !cache_file_exists(KEY_DOCCNT, cc) ){
18 |                 const char* KEY_TEXT  = key_text_trait<t_width>::KEY_TEXT;
19 |         std::string text_file = cache_file_name(KEY_TEXT, cc);
20 |         if (!cache_file_exists(KEY_TEXT, cc)) {
21 |             std::cerr << "ERROR: construct_doc_cnt: " << text_file
22 |                       << " does not exist. Abort." << std::endl;
23 |             return;
24 |         }
25 |         uint64_t doc_cnt = 0;
26 |         int_vector_buffer<t_width> text(text_file);
27 |         doc_cnt = count_if(text.begin(), text.end(),
28 |                      [](decltype(*(text.begin())) y){
29 |                         return y==1;       
30 |                      });
31 |         store_to_cache(doc_cnt, KEY_DOCCNT, cc);
32 |     }
33 | }
34 | 
35 | }// end namespace
36 | 
37 | #endif
38 | 


--------------------------------------------------------------------------------
/include/surf/construct_doc_lengths.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_CONSTRUCT_DOC_LENGTHS_HPP
 2 | #define SURF_CONSTRUCT_DOC_LENGTHS_HPP
 3 | 
 4 | #include <sdsl/int_vector.hpp>
 5 | #include <algorithm>
 6 | 
 7 | namespace surf{
 8 | 
 9 | template<uint8_t t_width>
10 | void construct_doc_lengths(sdsl::cache_config& cconfig)
11 | {
12 |     using namespace sdsl;
13 |     using namespace std;
14 |     static_assert(t_width == 0 or t_width == 8 , 
15 |             "construct_doc_border: width must be `0` for integer alphabet and `8` for byte alphabet");
16 |     const char* KEY_TEXT  = key_text_trait<t_width>::KEY_TEXT;
17 |     std::string text_file = cache_file_name(KEY_TEXT, cconfig);
18 |     if (!cache_file_exists(KEY_TEXT, cconfig)) {
19 |         std::cerr << "ERROR: construct_doc_cnt: " << text_file
20 |                   << " does not exist. Abort." << std::endl;
21 |         return;
22 |     }
23 |     int_vector_buffer<t_width> text(text_file);
24 |     std::vector<uint64_t> doc_lengths;
25 |     for (uint64_t i=0, len=0; i < text.size(); ++i){
26 |         ++len;
27 |         if ( 1 == text[i] ){
28 |             doc_lengths.push_back(len);
29 |             len = 0;
30 |         } 
31 |     }
32 |     sdsl::int_vector<> sdsl_doc_len(doc_lengths.size());
33 |     for(size_t i=0;i<doc_lengths.size();i++) {
34 |         sdsl_doc_len[i] = doc_lengths[i];
35 |     }
36 |     sdsl::util::bit_compress(sdsl_doc_len);
37 |     store_to_cache(sdsl_doc_len, KEY_DOC_LENGTHS, cconfig);
38 | }
39 | 
40 | }// end namespace
41 | 
42 | #endif
43 | 


--------------------------------------------------------------------------------
/include/surf/construct_doc_perm.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_CONSTRUCT_DOC_PERM_HPP
 2 | #define SURF_CONSTRUCT_DOC_PERM_HPP
 3 | 
 4 | #include "doc_perm.hpp"
 5 | #include <sdsl/int_vector.hpp>
 6 | #include <algorithm>
 7 | #include <utility>
 8 | 
 9 | namespace surf{
10 | 
11 | template<uint8_t t_width>
12 | void construct_doc_perm(sdsl::cache_config& cc)
13 | {
14 |     using namespace sdsl;
15 |     using namespace std;
16 |     static_assert(t_width == 0 or t_width == 8 , 
17 |             "construct_doc_perm: width must be `0` for integer alphabet and `8` for byte alphabet");
18 | 
19 |     if ( !cache_file_exists(KEY_DOCPERM, cc) ) {
20 |         const char* KEY_TEXT  = key_text_trait<t_width>::KEY_TEXT;
21 |         std::string text_file = cache_file_name(KEY_TEXT, cc);
22 |         if (!cache_file_exists(KEY_TEXT, cc)) {
23 |             std::cerr << "ERROR: construct_doc_perm: " << text_file
24 |                       << " does not exist. Abort." << std::endl;
25 |             return;
26 |         }
27 |         int_vector_buffer<t_width> text(text_file);
28 | 
29 |         std::cout<<"constructing doc_perm start"<<std::endl;
30 |         typedef std::pair<uint64_t, uint64_t> tPII;
31 |         std::vector<tPII> len_id; 
32 |         for (uint64_t i=0, doc_len=0,id=0; i < text.size(); ++i){
33 |             ++doc_len;
34 |             if ( 1 == text[i] ){
35 |                 len_id.emplace_back(doc_len, id);
36 |                 ++id;
37 |                 doc_len = 0;
38 |             }
39 |         }
40 |         std::cout<<"now sorting..."<<std::endl;
41 |         std::sort(len_id.begin(),len_id.end());
42 |         std::cout<<"end sorting"<<std::endl;
43 |         doc_perm dp;
44 |         dp.id2len = int_vector<>(len_id.size(), 0, sdsl::bits::hi(len_id.size()-1)+1);
45 |         dp.len2id = dp.id2len;
46 |         for (size_t i=0; i<len_id.size(); ++i){
47 |             dp.id2len[len_id[i].second] = i;
48 |         }
49 |         std::cout << "inv perm..." << std::endl;
50 |         for (size_t i=0; i<len_id.size(); ++i){
51 |             dp.len2id[dp.id2len[i]] = i;
52 |         }
53 |         std::cout<<"constructing doc_perm end"<<std::endl;
54 |         store_to_cache(dp, KEY_DOCPERM, cc);
55 |     }
56 | }
57 | 
58 | }// end namespace
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------
/include/surf/construct_invidx.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SURF_CONSTRUCT_INVIDX_HPP
  2 | #define SURF_CONSTRUCT_INVIDX_HPP
  3 | 
  4 | #include "surf/config.hpp"
  5 | #include "sdsl/config.hpp"
  6 | #include "surf/doc_perm.hpp"
  7 | #include "construct_doc_cnt.hpp"
  8 | #include "surf/construct_darray.hpp"
  9 | #include "surf/construct_doc_border.hpp"
 10 | #include "sdsl/int_vector.hpp"
 11 | 
 12 | namespace surf{
 13 | 
 14 | 
 15 | void construct_term_ranges(sdsl::int_vector<>& ids, sdsl::int_vector<>& sp, 
 16 |                             sdsl::int_vector<>& ep,sdsl::cache_config& cconfig)
 17 | {
 18 |     if (!cache_file_exists(sdsl::conf::KEY_SA, cconfig)) {
 19 |         sdsl::construct_sa<sdsl::int_alphabet_tag::WIDTH>(cconfig);
 20 |     }
 21 |     register_cache_file(sdsl::conf::KEY_SA, cconfig);
 22 | 
 23 |     sdsl::int_vector_buffer<> sa(cache_file_name(sdsl::conf::KEY_SA,cconfig));
 24 |     sdsl::int_vector<> T;
 25 |     load_from_cache(T,sdsl::conf::KEY_TEXT_INT,cconfig);
 26 |     size_t range_start = 0;
 27 |     std::vector<std::tuple<size_t,size_t,size_t>> ranges;
 28 |     std::cout << "determine term ranges"<< std::endl;
 29 |     for(size_t i=1;i<T.size();i++) {
 30 |         if(T[sa[i]] != T[sa[i-1]]) {
 31 |             ranges.emplace_back(T[sa[i-1]],range_start,i-1);
 32 |             range_start = i;
 33 |         }
 34 |     }
 35 |     ranges.emplace_back(T[sa[T.size()-1]],range_start,T.size()-1);
 36 |     sp.resize(ranges.size());
 37 |     ep.resize(ranges.size());
 38 |     ids.resize(ranges.size());
 39 |     size_t num_sym = 0;
 40 |     for(const auto& range : ranges) {
 41 |         ids[num_sym] = std::get<0>(range);
 42 |         sp[num_sym] = std::get<1>(range);
 43 |         ep[num_sym++] = std::get<2>(range);
 44 |     }
 45 | }
 46 | 
 47 | void construct_invidx_doc_permuations(sdsl::int_vector<>& id_mapping,sdsl::cache_config& cconfig)
 48 | {
 49 |     surf::construct_doc_cnt<sdsl::int_alphabet_tag::WIDTH>(cconfig);
 50 |     uint64_t doc_cnt = 0;
 51 |     load_from_cache(doc_cnt, surf::KEY_DOCCNT, cconfig);
 52 |     sdsl::int_vector<> doc_mapping(doc_cnt);
 53 |     {
 54 |         auto url_file = cconfig.dir + "/../" + surf::URL2ID_FILENAME;
 55 |         std::ifstream ufs(url_file);
 56 |         if(ufs.is_open()) {
 57 |             /* load current/indri order */
 58 |             std::unordered_map<std::string,uint64_t> id_mapping;
 59 |             auto docnames_file = cconfig.dir + "/../" + surf::DOCNAMES_FILENAME;
 60 |             std::ifstream dfs(docnames_file);
 61 |             std::string name_mapping;
 62 |             size_t j=0;
 63 |             while( std::getline(dfs,name_mapping) ) {
 64 |                 id_mapping[name_mapping] = j;
 65 |                 j++;
 66 |             }
 67 |             /* load url sorted order */
 68 |             std::string url_mapping;
 69 |             j=0;
 70 |             while( std::getline(ufs,url_mapping) ) {
 71 |                 auto doc_name = url_mapping.substr(url_mapping.find(' ')+1);
 72 |                 auto itr = id_mapping.find(doc_name);
 73 |                 if(itr != id_mapping.end()) {
 74 |                     doc_mapping[itr->second] = j;
 75 |                 } else {
 76 |                     std::cerr << "could not find mapping for '" << doc_name << "'" << std::endl;
 77 |                 }
 78 |                 j++;
 79 |             }
 80 |         } else {
 81 |             // identity permutation
 82 |             for(size_t i=0;i<doc_mapping.size();i++) doc_mapping[i] = i;
 83 |         }
 84 |     }
 85 |     // create the inverse permutation
 86 |     id_mapping.resize(doc_mapping.size());
 87 |     for(size_t i=0;i<doc_mapping.size();i++) {
 88 |         id_mapping[doc_mapping[i]] = i;
 89 |     }
 90 | 
 91 |     // store the forward to disk
 92 |     store_to_cache(doc_mapping, KEY_INVFILE_DOCPERM, cconfig);
 93 | }
 94 | 
 95 | void construct_F_t(sdsl::int_vector<>& F_t,sdsl::cache_config& cconfig)
 96 | {
 97 |     // load term ranges 
 98 |     sdsl::int_vector<> ids; sdsl::int_vector<> sp; sdsl::int_vector<> ep;
 99 |     if( cache_file_exists(surf::KEY_INVFILE_TERM_RANGES,cconfig) ) {
100 |         std::ifstream ifs(cache_file_name(surf::KEY_INVFILE_TERM_RANGES,cconfig));
101 |         ids.load(ifs);
102 |         sp.load(ifs);
103 |         ep.load(ifs);
104 |     } else {
105 |         construct_term_ranges(ids,sp,ep,cconfig);
106 |         std::ofstream ofs(cache_file_name(surf::KEY_INVFILE_TERM_RANGES,cconfig));
107 |         serialize(ids,ofs);
108 |         serialize(sp,ofs);
109 |         serialize(ep,ofs);
110 |     }
111 | 
112 |     F_t.resize(ids.size());
113 |     for(size_t i=0;i<ids.size();i++) {
114 |         F_t[i] = ep[i] - sp[i] + 1;
115 |     }
116 | }
117 | 
118 | template<class t_pl,class t_rank>
119 | void construct_postings_lists(std::vector<t_pl>& postings_lists,sdsl::cache_config& cconfig)
120 | {
121 |     using namespace sdsl;
122 |     using namespace std;
123 | 
124 |     // load term ranges 
125 |     sdsl::int_vector<> ids; sdsl::int_vector<> sp; sdsl::int_vector<> ep;
126 |     if( cache_file_exists(surf::KEY_INVFILE_TERM_RANGES,cconfig) ) {
127 |         std::ifstream ifs(cache_file_name(surf::KEY_INVFILE_TERM_RANGES,cconfig));
128 |         ids.load(ifs);
129 |         sp.load(ifs);
130 |         ep.load(ifs);
131 |     } else {
132 |         construct_term_ranges(ids,sp,ep,cconfig);
133 |         std::ofstream ofs(cache_file_name(surf::KEY_INVFILE_TERM_RANGES,cconfig));
134 |         serialize(ids,ofs);
135 |         serialize(sp,ofs);
136 |         serialize(ep,ofs);
137 |     }
138 | 
139 | 
140 |     if (!cache_file_exists(surf::KEY_DOCBORDER, cconfig)){
141 |         construct_doc_border<sdsl::int_alphabet_tag::WIDTH>(cconfig);
142 |     }
143 |     if (!cache_file_exists(surf::KEY_DARRAY, cconfig)){
144 |         construct_darray<sdsl::int_alphabet_tag::WIDTH>(cconfig);
145 |     }
146 | 
147 |     // load or construct D array
148 |     std::cout << "stream D"<< std::endl;
149 |     int_vector_buffer<> D(cache_file_name(surf::KEY_DARRAY,cconfig));
150 | 
151 |     // load or construct rank function
152 |     std::cout << "load rank"<< std::endl;
153 |     t_rank ranker(cconfig);
154 | 
155 |     // load mapping if it exists
156 |     std::cout << "load docid mapping" << std::endl;
157 |     sdsl::int_vector<> doc_mapping;
158 |     doc_perm dp;
159 |     load_from_cache(dp, KEY_DOCPERM, cconfig);
160 |     load_from_cache(doc_mapping, KEY_INVFILE_DOCPERM, cconfig);
161 | 
162 |     // construct plist for each range
163 |     std::cout << "create postings lists"<< endl;
164 |     size_t max_id = ids[ids.size()-1];
165 |     postings_lists.resize(max_id+1);
166 |     for(size_t i=2;i<ids.size();i++) { // skip \0 and \1
167 |         size_t range_size = ep[i] - sp[i] + 1;
168 |         int_vector<> tmpD(range_size);
169 |         for(size_t j=sp[i];j<=ep[i];j++) tmpD[j-sp[i]] = doc_mapping[dp.len2id[D[j]]];
170 |         if(range_size>1000) std::cout << "(" << i << ") |<" << sp[i] << "," << ep[i] << ">| = " << range_size << std::endl;
171 |         postings_lists[ids[i]] = t_pl(ranker,tmpD,0,range_size-1);
172 |     }
173 | }
174 | 
175 | }// end namespace
176 | 
177 | #endif
178 | 


--------------------------------------------------------------------------------
/include/surf/doc_perm.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef SURF_DOC_PERM_HPP
 2 | #define SURF_DOC_PERM_HPP
 3 | 
 4 | #include <sdsl/int_vector.hpp>
 5 | #include <string>
 6 | 
 7 | namespace surf{
 8 | 
 9 | struct doc_perm{
10 |     typedef typename sdsl::int_vector<>::size_type size_type;
11 |     sdsl::int_vector<> id2len; // doc id to length ordered id
12 |     sdsl::int_vector<> len2id; // length ordered id to doc id
13 | 
14 |     inline size_type serialize(std::ostream& out, sdsl::structure_tree_node* v = NULL, std::string name = "") const {
15 |         using namespace sdsl;
16 |         structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
17 |         size_type written_bytes = 0;
18 |         written_bytes += id2len.serialize(out, child, "id2len");
19 |         written_bytes += len2id.serialize(out, child, "len2id");
20 |         structure_tree::add_size(child, written_bytes);
21 |         return written_bytes;
22 |     }
23 | 
24 |     inline void load(std::istream &in){
25 |         id2len.load(in);
26 |         len2id.load(in);
27 |     }
28 | };
29 | 
30 | }
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/include/surf/idx_d.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SURF_IDX_D_HPP
  2 | #define SURF_IDX_D_HPP
  3 | 
  4 | #include "sdsl/suffix_trees.hpp"
  5 | #include "surf/df_sada.hpp"
  6 | #include "surf/rank_functions.hpp"
  7 | #include "surf/construct_col_len.hpp"
  8 | #include <algorithm>
  9 | #include <limits>
 10 | #include <queue>
 11 | 
 12 | namespace surf{
 13 | 
 14 | using range_type = sdsl::range_type;
 15 | 
 16 | struct term_info{
 17 |     std::vector<uint64_t> t; // term_id
 18 |     uint64_t f_qt; // term_frequency
 19 |     uint64_t sp_Dt; // start of interval for term t in the suffix array
 20 |     uint64_t ep_Dt; // end of interval for term t in the suffix array
 21 |     uint64_t f_Dt;  // number of distinct document the term occurs in 
 22 | 
 23 |     term_info() = default;
 24 |     term_info(const std::vector<uint64_t>& t, uint64_t f_qt, uint64_t sp_Dt, uint64_t ep_Dt, uint64_t f_Dt) : 
 25 |         t(t), f_qt(f_qt), sp_Dt(sp_Dt), ep_Dt(ep_Dt), f_Dt(f_Dt) {
 26 |         
 27 |     }
 28 | 
 29 |     term_info(term_info&&) = default;
 30 |     term_info(const term_info&) = default;
 31 |     term_info& operator=(term_info&&) = default;
 32 |     term_info& operator=(const term_info&) = default;
 33 | 
 34 |     uint64_t F_Dt() const{
 35 |         return ep_Dt-sp_Dt+1;
 36 |     }
 37 | };
 38 | 
 39 | template<typename t_wt_node>
 40 | struct s_state_t{
 41 |     double score;
 42 |     t_wt_node v;
 43 |     std::vector<term_info*> t_ptrs; // pointers to term_info array
 44 |     std::vector<range_type> r; // ranges
 45 | 
 46 |     s_state_t() = default;
 47 | 
 48 |     s_state_t(double score, const t_wt_node& v, 
 49 |               const std::vector<term_info*>& t_ptrs,
 50 |               const std::vector<range_type>& r):
 51 |         score(score), v(v), t_ptrs(t_ptrs),
 52 |         r(r){}
 53 | 
 54 |     s_state_t(s_state_t&&) = default;
 55 |     s_state_t(const s_state_t&) = default;
 56 | 
 57 |     s_state_t& operator=(s_state_t&&) = default;
 58 |     s_state_t& operator=(const s_state_t&) = default;
 59 | 
 60 |     bool operator<(const s_state_t& s)const{
 61 |         if ( score != s.score ){
 62 |             return score < s.score;
 63 |         }
 64 |         return v < s.v;
 65 |     }
 66 | };
 67 | 
 68 | /*! Class idx_d consists of a 
 69 |  *   - CSA over the collection concatenation
 70 |  *   - document frequency structure
 71 |  *   - a WT over the D array
 72 |  */
 73 | template<typename t_csa,
 74 |          typename t_wtd,
 75 |          typename t_df,
 76 |          typename t_ranker=rank_bm25<>>
 77 | class idx_d{
 78 | public:
 79 |     using size_type = sdsl::int_vector<>::size_type;
 80 |     typedef t_csa    csa_type;
 81 |     typedef t_wtd    wtd_type;
 82 |     typedef typename wtd_type::node_type node_type;
 83 |     typedef t_df     df_type;
 84 |     typedef t_ranker ranker_type;
 85 | public:
 86 |     csa_type    m_csa;
 87 |     wtd_type    m_wtd;
 88 |     df_type     m_df;
 89 |     doc_perm    m_docperm;
 90 |     ranker_type m_ranker;
 91 | 
 92 |     using state_type = s_state_t<typename t_wtd::node_type>;
 93 | public:
 94 | 
 95 |     result search(const std::vector<query_token>& qry,size_t k,bool ranked_and = false,bool profile = false) const {
 96 |         typedef std::priority_queue<state_type> pq_type;
 97 |         typedef std::priority_queue<double, std::vector<double>, std::greater<double>> pq_min_type;
 98 |         std::vector<term_info> terms;
 99 |         std::vector<term_info*> term_ptrs;
100 |         std::vector<range_type> ranges;
101 |         result res;
102 | 
103 |         if(profile) {
104 |             res.wt_nodes = 2*m_wtd.sigma-1;
105 |         }
106 | 
107 |         for (size_t i=0; i<qry.size(); ++i){
108 |             size_type sp=1, ep=0;
109 |             if ( backward_search(m_csa, 0, m_csa.size()-1, 
110 |                                  qry[i].token_ids.begin(),
111 |                                  qry[i].token_ids.end(),
112 |                                  sp, ep) > 0 ) {
113 |                 auto f_Dt = std::get<0>(m_df(sp,ep)); // document frequency
114 |                 terms.emplace_back(qry[i].token_ids, qry[i].f_qt, sp, ep,  f_Dt);
115 |                 ranges.emplace_back(sp, ep);
116 |             }
117 |         }
118 |         term_ptrs.resize(terms.size()); 
119 |         for (size_type i=0; i<terms.size(); ++i){
120 |             term_ptrs[i] = &terms[i];
121 |         }
122 |         double initial_term_num = terms.size();
123 | 
124 |         auto push_node = [this,&initial_term_num, &res,&profile,&ranked_and]
125 |                          (pq_type& pq, const std::vector<term_info*>& t_ptrs,node_type& v,
126 |                           std::vector<range_type>& r,
127 |                           pq_min_type& pq_min, const size_t& k){
128 |             auto min_idx = m_wtd.sym(v) << (m_wtd.max_level - v.level);  
129 |             auto min_doc_len = m_ranker.doc_length(m_docperm.len2id[min_idx]);
130 |             state_type t; // new state
131 |             t.v = v;
132 | 
133 |             t.score = initial_term_num * m_ranker.calc_doc_weight(min_doc_len);
134 | 
135 |             bool eval = false;
136 |             bool is_leaf = m_wtd.is_leaf(v);
137 |             for (size_t i = 0; i < r.size(); ++i){
138 |                 if ( !empty(r[i]) ){
139 |                     eval = true;
140 |                     t.r.push_back(r[i]);
141 |                     t.t_ptrs.push_back(t_ptrs[i]);
142 | 
143 |                     auto score = m_ranker.calculate_docscore(
144 |                                  t.t_ptrs.back()->f_qt,
145 |                                  size(t.r.back()),
146 |                                  t.t_ptrs.back()->f_Dt,
147 |                                  t.t_ptrs.back()->F_Dt(),
148 |                                  min_doc_len,
149 |                                  is_leaf
150 |                                );
151 |                     t.score += score;
152 |                 } else if ( ranked_and ){
153 |                     return;
154 |                 }
155 |             }
156 |             if (!eval){
157 |                 return;
158 |             }
159 |             if ( pq_min.size() < k ){ // not yet k leaves in score queue
160 |                 pq.emplace(t);
161 |                 if (profile) res.wt_search_space++;
162 |                 if ( m_wtd.is_leaf(t.v) )
163 |                     pq_min.push(t.score);
164 |             } else { // more than k leaves in score queue
165 |                 if ( t.score > pq_min.top() ){
166 |                     pq.emplace(t);
167 |                     if (profile) res.wt_search_space++;
168 |                     if ( m_wtd.is_leaf(t.v) ){
169 |                         pq_min.pop();
170 |                         pq_min.push(t.score);
171 |                     }
172 |                 } 
173 |             }
174 |         };
175 | 
176 |         constexpr double max_score = std::numeric_limits<double>::max();
177 |         
178 |         pq_min_type pq_min;
179 |         pq_type pq;
180 |         pq.emplace(max_score, m_wtd.root(), term_ptrs, ranges);
181 |         if(profile) res.wt_search_space++;
182 | 
183 |         while ( !pq.empty() and res.list.size() < k ) {
184 |             state_type s = pq.top();
185 |             pq.pop();
186 |             if ( m_wtd.is_leaf(s.v) ){
187 |                 res.list.emplace_back(m_docperm.len2id[m_wtd.sym(s.v)], s.score);
188 |             } else {
189 | //fast_expand:               
190 |                 auto exp_v = m_wtd.expand(s.v);
191 |                 bool left_empty = m_wtd.empty(std::get<0>(exp_v));
192 |                 bool right_empty = m_wtd.empty(std::get<1>(exp_v));
193 |                 auto exp_r = m_wtd.expand(s.v, std::move(s.r));
194 |                 if ( std::get<1>(exp_r).size() == 0 and std::get<0>(exp_r).size() > 0 and !m_wtd.is_leaf(std::get<0>(exp_v) )){
195 |                     std::cout<<"easy"<<std::endl;
196 |                 } 
197 | 
198 |                 if ( !left_empty ) {
199 |                     push_node(pq, s.t_ptrs, std::get<0>(exp_v), std::get<0>(exp_r), pq_min, k);
200 |                 } else{
201 |                     //std::cout<<"left_empty"<<std::endl;
202 |                 }
203 |                 if ( !right_empty ) {
204 |                     push_node(pq, s.t_ptrs, std::get<1>(exp_v), std::get<1>(exp_r), pq_min, k);
205 |                 } else{
206 |                     //std::cout<<"right_empty"<<std::endl;
207 |                 }
208 |             }
209 |         }
210 |         return res;
211 |     }
212 | 
213 |     void load(sdsl::cache_config& cc){
214 |         load_from_cache(m_csa, surf::KEY_CSA, cc, true);
215 |         load_from_cache(m_wtd, surf::KEY_WTD, cc, true);
216 |         load_from_cache(m_df, surf::KEY_SADADF, cc, true);
217 |         load_from_cache(m_docperm, surf::KEY_DOCPERM, cc); 
218 |         m_ranker = ranker_type(cc);
219 |     }
220 | 
221 |     size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const {
222 |         structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
223 |         size_type written_bytes = 0;
224 |         written_bytes += m_csa.serialize(out, child, "CSA");
225 |         written_bytes += m_wtd.serialize(out, child, "WTD");
226 |         written_bytes += m_df.serialize(out, child, "DF");
227 |         written_bytes += m_docperm.serialize(out, child, "DOCPERM");
228 |         structure_tree::add_size(child, written_bytes);
229 |         return written_bytes;
230 |     }
231 | 
232 |     void mem_info(){
233 |         std::cout << sdsl::size_in_bytes(m_csa) << ";"; // CSA
234 |         std::cout << sdsl::size_in_bytes(m_wtd) << ";"; // WTD^\ell 
235 |         std::cout << sdsl::size_in_bytes(m_df) << ";";  // DF
236 |         std::cout << 0 << ";"; // WTR^\ell
237 |         std::cout << sdsl::size_in_bytes(m_docperm) << std::endl;  // DOCPERM
238 | 
239 |     }
240 | 
241 | };
242 | 
243 | template<typename t_csa,
244 |          typename t_wtd,
245 |          typename t_df,
246 |          typename t_ranker
247 |         >
248 | void construct(idx_d<t_csa,t_wtd,t_df,t_ranker>& idx,
249 |                const std::string&,
250 |                sdsl::cache_config& cc, uint8_t num_bytes)
251 | {    
252 |     using namespace sdsl;
253 |     using namespace std;
254 | 
255 |     construct_col_len<t_df::alphabet_category::WIDTH>(cc);
256 | 
257 |     cout<<"...CSA"<<endl;
258 |     if ( !cache_file_exists<t_csa>(surf::KEY_CSA, cc) )
259 |     {
260 |         t_csa csa;
261 |         construct(csa, "", cc, 0);
262 |         store_to_cache(csa, surf::KEY_CSA, cc, true);
263 |     }
264 |     cout<<"...WTD"<<endl;
265 |     if (!cache_file_exists<t_wtd>(surf::KEY_WTD, cc) ){
266 |         construct_doc_perm<t_csa::alphabet_type::int_width>(cc);
267 |         construct_darray<t_csa::alphabet_type::int_width>(cc);
268 |         t_wtd wtd;
269 |         construct(wtd, cache_file_name(surf::KEY_DARRAY, cc), cc);
270 |         cout << "wtd.size() = " << wtd.size() << endl;
271 |         cout << "wtd.sigma = " << wtd.sigma << endl;
272 |         store_to_cache(wtd, surf::KEY_WTD, cc, true);
273 |     }
274 |     cout<<"...DF"<<endl;
275 |     if (!cache_file_exists<t_df>(surf::KEY_SADADF, cc))
276 |     {
277 |         t_df df;
278 |         construct(df, "", cc, 0);
279 |         store_to_cache(df, surf::KEY_SADADF, cc, true);
280 |     }
281 | }
282 | 
283 | } // end namespace surf
284 | 
285 | #endif
286 | 


--------------------------------------------------------------------------------
/include/surf/idx_dr.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SURF_IDX_DR_HPP
  2 | #define SURF_IDX_DR_HPP
  3 | 
  4 | #include "sdsl/suffix_trees.hpp"
  5 | #include "surf/df_sada.hpp"
  6 | #include "surf/rank_functions.hpp"
  7 | #include "surf/idx_d.hpp"
  8 | #include "surf/construct_col_len.hpp"
  9 | #include "surf/construct_DUP2.hpp"
 10 | #include <algorithm>
 11 | #include <limits>
 12 | #include <queue>
 13 | 
 14 | namespace surf{
 15 | 
 16 | using range_type = sdsl::range_type;
 17 | 
 18 | 
 19 | template<typename t_wtd_node, typename t_wtr_node>
 20 | struct s_state2_t{
 21 |     double score;
 22 |     t_wtd_node v; // node in document array wavelet tree
 23 |     std::vector<term_info*> t_ptrs; // pointers to term_info array
 24 |     std::vector<range_type> r_v; // ranges in v
 25 |     t_wtr_node w; // node in repetition array wavelet tree
 26 |     std::vector<range_type> r_w; // ranges in w
 27 | 
 28 |     s_state2_t() = default;
 29 | 
 30 |     s_state2_t(double score, const t_wtd_node& v, 
 31 |               const std::vector<term_info*>& t_ptrs,
 32 |               const std::vector<range_type>& r_v,
 33 |               const t_wtr_node& w,
 34 |               const std::vector<range_type>& r_w):
 35 |         score(score), v(v), t_ptrs(t_ptrs),
 36 |         r_v(r_v),w(w),r_w(r_w)
 37 |     {}
 38 | 
 39 |     s_state2_t(s_state2_t&&) = default;
 40 |     s_state2_t(const s_state2_t&) = default;
 41 | 
 42 |     s_state2_t& operator=(s_state2_t&&) = default;
 43 |     s_state2_t& operator=(const s_state2_t&) = default;
 44 | 
 45 |     bool operator<(const s_state2_t& s)const{
 46 |         if ( score != s.score ){
 47 |             return score < s.score;
 48 |         }
 49 |         return v < s.v;
 50 |     }
 51 | };
 52 | 
 53 | 
 54 | template<typename t_wtd_node, typename t_wtr_node>
 55 | inline std::ostream& operator<<(std::ostream& os, const s_state2_t<t_wtd_node, t_wtr_node>& state)
 56 | {
 57 |     os << state.v.level << "-("<<state.score<<") ";
 58 |     for(auto x : state.r_v){ os<<"["<<x.first<<","<<x.second<<"]:"<<x.second-x.first+1<<" "; }
 59 |     os << "|";
 60 |     for(auto x : state.r_w){ os<<"["<<x.first<<","<<x.second<<"]:"<<x.second-x.first+1<<" "; }
 61 |     return os;
 62 | }
 63 | 
 64 | 
 65 | 
 66 | /*! Class idx_dr consists of a 
 67 |  *   - CSA over the collection concatenation
 68 |  *   - document frequency structure
 69 |  *   - a WT over the D array
 70 |  *   - a WT over the repetition array
 71 |  */
 72 | template<typename t_csa,
 73 |          typename t_df,
 74 |          typename t_wtd,
 75 |          typename t_wtr,
 76 |          typename t_ranker=rank_bm25<>,
 77 |          typename t_rbv=sdsl::rrr_vector<63>,
 78 |          typename t_rrank=typename t_rbv::rank_1_type>
 79 | class idx_dr{
 80 | public:
 81 |     using size_type = sdsl::int_vector<>::size_type;
 82 |     typedef t_csa                        csa_type;
 83 |     typedef t_wtd                        wtd_type;
 84 |     typedef typename wtd_type::node_type node_type;
 85 |     typedef t_df                         df_type;
 86 |     typedef t_wtr                        wtr_type;
 87 |     typedef typename wtr_type::node_type node2_type;
 88 |     typedef t_rbv                        rbv_type;
 89 |     typedef t_rrank                      rrank_type;
 90 |     typedef t_ranker                     ranker_type;
 91 | public:
 92 |     csa_type    m_csa;
 93 |     df_type     m_df;
 94 |     wtr_type    m_wtr; 
 95 |     t_wtd       m_wtd;
 96 |     rbv_type    m_rbv;
 97 |     rrank_type  m_rrank;
 98 |     doc_perm    m_docperm;
 99 |     ranker_type m_ranker;
100 | 
101 |     using state_type = s_state2_t<node_type, node2_type>;
102 | public:
103 | 
104 |     result search(const std::vector<query_token>& qry,size_t k,bool ranked_and = false,bool profile = false) const {
105 |         typedef std::priority_queue<state_type> pq_type;
106 |         std::vector<term_info> terms;
107 |         std::vector<term_info*> term_ptrs;
108 |         std::vector<range_type> v_ranges; // ranges in wtd
109 |         std::vector<range_type> w_ranges; // ranges in wtdup
110 |         result res;
111 | 
112 |         for (size_t i=0; i<qry.size(); ++i){
113 |             size_type sp=1, ep=0;
114 |             if ( backward_search(m_csa, 0, m_csa.size()-1, 
115 |                                 qry[i].token_ids.begin(),
116 |                                 qry[i].token_ids.end(),
117 |                                 sp, ep) > 0 ) {
118 |                 auto df_info = m_df(sp,ep);
119 | //std::cout<<"[sp,ep]=["<<sp<<","<<ep<<"]"<<std::endl;
120 |                 auto f_Dt = std::get<0>(df_info); // document frequency
121 |                 terms.emplace_back(qry[i].token_ids, qry[i].f_qt, sp, ep,  f_Dt);
122 | //for(size_t k=sp; k<=ep; ++k){ std::cout<<".."<<m_wtd[k]<<std::endl; }
123 | //std::cout<<std::endl;
124 |                 v_ranges.emplace_back(sp, ep);
125 |                 w_ranges.emplace_back(m_rrank(std::get<1>(df_info)),
126 |                                       m_rrank(std::get<2>(df_info)+1)-1);
127 | 
128 |             }
129 |         }
130 |         term_ptrs.resize(terms.size()); 
131 |         for (size_type i=0; i<terms.size(); ++i){
132 |             term_ptrs[i] = &terms[i];
133 |         }
134 |         double initial_term_num = terms.size();
135 | 
136 |         auto push_node = [this,&initial_term_num,&res,&profile,&ranked_and](pq_type& pq, state_type& s,node_type& v,std::vector<range_type>& r_v,
137 |                                 node2_type& w, std::vector<range_type>& r_w){
138 |             auto min_idx = m_wtd.sym(v) << (m_wtd.max_level - v.level);  
139 |             auto min_doc_len = m_ranker.doc_length(m_docperm.len2id[min_idx]);
140 |             state_type t; // new state
141 |             t.v = v;
142 |             t.w = w;
143 |             t.score = initial_term_num * m_ranker.calc_doc_weight(min_doc_len);
144 |             bool eval = false;
145 |             bool is_leaf = m_wtd.is_leaf(v);
146 |             for (size_t i = 0; i < r_v.size(); ++i){
147 |                 if ( !empty(r_v[i]) ){
148 |                     eval = true;
149 |                     t.r_v.push_back(r_v[i]);
150 |                     t.r_w.push_back(r_w[i]);
151 |                     t.t_ptrs.push_back(s.t_ptrs[i]);
152 |                     auto score = m_ranker.calculate_docscore(
153 |                                  t.t_ptrs.back()->f_qt,
154 |                                  size(t.r_w.back())+1,
155 |                                  t.t_ptrs.back()->f_Dt,
156 |                                  t.t_ptrs.back()->F_Dt(),
157 |                                  min_doc_len,
158 |                                  is_leaf
159 |                                );
160 |                     t.score += score;
161 |                 } else if ( ranked_and ) {
162 |                     return;
163 |                 }
164 |             }
165 |             if (eval){ 
166 | //                std::cout << t << std::endl;
167 |                 if (profile) res.wt_search_space++;
168 |                 pq.emplace(t);       
169 |             }
170 |         };
171 | 
172 |         constexpr double max_score = std::numeric_limits<double>::max();
173 |         
174 |         pq_type pq;
175 |         size_type search_space=0;
176 |         pq.emplace(max_score, m_wtd.root(), term_ptrs, v_ranges, m_wtr.root(), w_ranges);
177 | //        std::cout << "\n" << pq.top() << std::endl;
178 |         if(profile) res.wt_search_space++;
179 | 
180 |         while ( !pq.empty() and res.list.size() < k ) {
181 |             state_type s = pq.top();
182 |             pq.pop();
183 |             if ( m_wtd.is_leaf(s.v) ){
184 |                 res.list.emplace_back(m_docperm.len2id[m_wtd.sym(s.v)], s.score);
185 |             } else {
186 |                 auto exp_v = m_wtd.expand(s.v);
187 |                 auto exp_r_v = m_wtd.expand(s.v, s.r_v);
188 |                 auto exp_w = m_wtr.expand(s.w);
189 |                 auto exp_r_w = m_wtr.expand(s.w, s.r_w);
190 | 
191 |                 if ( !m_wtd.empty(std::get<0>(exp_v)) ) {
192 |                     push_node(pq, s, std::get<0>(exp_v), std::get<0>(exp_r_v), 
193 |                                      std::get<0>(exp_w), std::get<0>(exp_r_w));
194 |                 }
195 |                 if ( !m_wtd.empty(std::get<1>(exp_v)) ) {
196 |                     push_node(pq, s, std::get<1>(exp_v), std::get<1>(exp_r_v),
197 |                                      std::get<1>(exp_w), std::get<1>(exp_r_w));
198 |                 }
199 |             }
200 |         }
201 |         return res;
202 |     }
203 | 
204 |     void load(sdsl::cache_config& cc){
205 |         load_from_cache(m_csa, surf::KEY_CSA, cc, true);
206 |         load_from_cache(m_df, surf::KEY_SADADF, cc, true);
207 |         load_from_cache(m_wtr, surf::KEY_WTDUP2, cc, true);
208 |         std::cerr<<"m_wtr.size()="<<m_wtr.size()<<std::endl;
209 |         std::cerr<<"m_wtr.sigma()="<<m_wtr.sigma<<std::endl;
210 |         load_from_cache(m_wtd, surf::KEY_WTD, cc, true);
211 |         std::cerr<<"m_wtd.size()="<<m_wtd.size()<<std::endl;
212 |         std::cerr<<"m_wtd.sigma()="<<m_wtd.sigma<<std::endl;
213 |         load_from_cache(m_rbv, surf::KEY_DUPMARK, cc, true);
214 |         std::cerr<<"m_rbv.size()="<<m_rbv.size()<<std::endl;
215 |         load_from_cache(m_rrank, surf::KEY_DUPRANK, cc, true);
216 |         m_rrank.set_vector(&m_rbv);
217 |         std::cerr<<"m_rrank(m_rbv.size())="<<m_rrank(m_rbv.size())<<std::endl;
218 |         load_from_cache(m_docperm, surf::KEY_DOCPERM, cc); 
219 |         m_ranker = ranker_type(cc);
220 |     }
221 | 
222 |     size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const {
223 |         structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
224 |         size_type written_bytes = 0;
225 |         written_bytes += m_csa.serialize(out, child, "CSA");
226 |         written_bytes += m_df.serialize(out, child, "DF");
227 |         written_bytes += m_wtd.serialize(out, child, "WTD");
228 |         written_bytes += m_wtr.serialize(out, child, "WTR");
229 |         written_bytes += m_rbv.serialize(out, child, "R_BV");
230 |         written_bytes += m_rrank.serialize(out, child, "R_RANK");
231 |         written_bytes += m_docperm.serialize(out, child, "DOCPERM");
232 |         structure_tree::add_size(child, written_bytes);
233 |         return written_bytes;
234 |     }
235 | 
236 |     void mem_info(){
237 |         std::cout << sdsl::size_in_bytes(m_csa) << ";"; // CSA
238 |         std::cout << sdsl::size_in_bytes(m_wtd) << ";"; // WTD^\ell 
239 |         std::cout << sdsl::size_in_bytes(m_df) << ";";  // DF
240 |         std::cout << sdsl::size_in_bytes(m_wtr) 
241 |                    + sdsl::size_in_bytes(m_rbv) 
242 |                    + sdsl::size_in_bytes(m_rrank) 
243 |                   << ";"; // WTR^\ell
244 |         std::cout << sdsl::size_in_bytes(m_docperm) << std::endl;  // DOCPERM
245 |     }
246 | 
247 | };
248 | 
249 | template<typename t_csa,
250 |          typename t_df,
251 |          typename t_wtd,
252 |          typename t_wtr,
253 |          typename t_ranker,
254 |          typename t_rbv,
255 |          typename t_rrank>
256 | void construct(idx_dr<t_csa,t_df,t_wtd,t_wtr, t_ranker, t_rbv, t_rrank>& idx,
257 |                const std::string&,
258 |                sdsl::cache_config& cc, uint8_t num_bytes)
259 | {    
260 |     using namespace sdsl;
261 |     using namespace std;
262 | 
263 |     construct_col_len<t_df::alphabet_category::WIDTH>(cc);
264 | 
265 |     cout<<"...CSA"<<endl;
266 |     if ( !cache_file_exists<t_csa>(surf::KEY_CSA, cc) )
267 |     {
268 |         t_csa csa;
269 |         construct(csa, "", cc, 0);
270 |         store_to_cache(csa, surf::KEY_CSA, cc, true);
271 |     }
272 |     cout<<"...WTD"<<endl;
273 |     if (!cache_file_exists<t_wtd>(surf::KEY_WTD, cc) ){
274 |         construct_doc_perm<t_csa::alphabet_type::int_width>(cc);
275 |         construct_darray<t_csa::alphabet_type::int_width>(cc);
276 |         t_wtd wtd;
277 |         construct(wtd, cache_file_name(surf::KEY_DARRAY, cc), cc);
278 |         cout << "wtd.size() = " << wtd.size() << endl;
279 |         cout << "wtd.sigma = " << wtd.sigma << endl;
280 |         store_to_cache(wtd, surf::KEY_WTD, cc, true);
281 |     }
282 |     cout<<"...DF"<<endl;
283 |     if (!cache_file_exists<t_df>(surf::KEY_SADADF, cc))
284 |     {
285 |         t_df df;
286 |         construct(df, "", cc, 0);
287 |         store_to_cache(df, surf::KEY_SADADF, cc, true);
288 |     }
289 |     cout<<"...WTR"<<endl;
290 |     if (!cache_file_exists<t_wtr>(surf::KEY_WTDUP2,cc)){
291 |         construct_dup2<t_df>(cc); // construct DUP2 and DUPMARK
292 |         t_wtr wtr;
293 |         construct(wtr, cache_file_name(surf::KEY_DUP2, cc), cc);
294 |         store_to_cache(wtr, surf::KEY_WTDUP2, cc, true);
295 |         cout << "wtr.size() = " << wtr.size() << endl;
296 |         cout << "wtr.sigma = " << wtr.sigma << endl;
297 |     }
298 |     cout<<"...R_BV"<<endl;
299 |     if (!cache_file_exists<t_rbv>(surf::KEY_DUPMARK, cc) ){
300 |         bit_vector bv;
301 |         load_from_cache(bv, surf::KEY_DUPMARK, cc);
302 |         t_rbv rbv(bv);
303 |         store_to_cache(rbv, surf::KEY_DUPMARK, cc, true);
304 |         t_rrank rrank(&rbv);
305 |         store_to_cache(rrank, surf::KEY_DUPRANK, cc, true);
306 |     }
307 | }
308 | 
309 | } // end namespace surf
310 | 
311 | #endif
312 | 


--------------------------------------------------------------------------------
/include/surf/indexes.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef SURF_INDEXES_HPP
 3 | #define SURF_INDEXES_HPP
 4 | 
 5 | #include "idx_invfile.hpp"
 6 | #include "idx_d.hpp"
 7 | #include "idx_dr.hpp"
 8 | #include "idx_d1r1.hpp"
 9 | #include "idx_d1r1mtf.hpp"
10 | 
11 | #endif
12 | 


--------------------------------------------------------------------------------
/include/surf/phrase_parser.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SURF_PHRASE_PARSER_HPP
  2 | #define SURF_PHRASE_PARSER_HPP
  3 | 
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <sstream>
  7 | #include <unordered_map>
  8 | #include <ratio>
  9 | #include <chrono>
 10 | 
 11 | #include "surf/config.hpp"
 12 | #include "surf/query.hpp"
 13 | 
 14 | namespace surf{
 15 | 
 16 | struct phrase_parser {
 17 |     phrase_parser() = delete;
 18 | 
 19 |     template<class t_csa>
 20 |     static query_t phrase_segmentation(t_csa& csa,
 21 |     						const std::vector<uint64_t>& query_ids,
 22 |     						const std::unordered_map<uint64_t,std::string>& reverse_mapping,
 23 |                             double threshold)
 24 |     {
 25 |     	//compute single term probabilities
 26 |     	std::vector<double> P_single;
 27 |     	for(size_t i=0;i<query_ids.size();i++) {
 28 |     		auto cnt = sdsl::count(csa,query_ids.begin()+i,query_ids.begin()+i+1);
 29 |     		double prob = (double)cnt / (double)csa.size();
 30 |     		P_single.push_back(prob);
 31 |     	}
 32 | 
 33 |     	//compute all probabilities
 34 |     	std::vector<std::vector<uint64_t>> phrases;
 35 |     	size_t start = 0;
 36 |     	size_t stop = query_ids.size();
 37 |     	while(start < stop) {
 38 |     		bool phrase_found = false;
 39 |     		bool phrase_added = false;
 40 |     		for(size_t i=start+1;i<stop;i++) {
 41 |                 // if we start at a very frequent word, a phrase can't start 
 42 |                 // there.
 43 |                 auto single_cnt = sdsl::count(csa,query_ids.begin()+start,query_ids.begin()+start+1);
 44 |                 if( single_cnt * 100 > csa.size() ) {
 45 |                     break;
 46 |                 }
 47 | 
 48 |     			auto cnt = sdsl::count(csa,query_ids.begin()+start,query_ids.begin()+i+1);
 49 |     			double prob = (double)cnt / (double)csa.size();
 50 | 
 51 |     			// single
 52 |     			double single = P_single[i];
 53 |     			for(size_t l=start;l<=i;l++) single *= P_single[l];
 54 | 
 55 |     			// calc ratio
 56 |     			double assoc_ratio = log(prob)-log(single);
 57 | 
 58 |                 // debug
 59 |                 /*
 60 |                 std::cout << "SCORE(";            
 61 |                 for(size_t l=start;l<=i;l++) {
 62 |                     auto id = query_ids[l];
 63 |                     auto stritr = reverse_mapping.find(id);
 64 |                     std::cout << stritr->second << " ";
 65 |                 }
 66 |                 std::cout << ") -> " << assoc_ratio << std::endl;
 67 |                 */
 68 | 
 69 |     			if(assoc_ratio < threshold) {
 70 |     				// not a phrase. if the prev one was a phrase we use it
 71 |     				if(phrase_found) {
 72 |     					std::vector<uint64_t> phrase;
 73 |     					for(size_t j=start;j<i;j++) {
 74 |     						phrase.push_back(query_ids[j]);
 75 |     					}
 76 |     					phrases.push_back(phrase);
 77 |     				    phrase_added = true;
 78 |     				    start = i;
 79 |     				    break;
 80 |     				}
 81 |     			} else {
 82 |     				// still a phrase. continue!
 83 |     				phrase_found = true;
 84 |     			}
 85 |     		}
 86 |     		if(!phrase_added) {
 87 |     			if(phrase_found) {
 88 |     				// we found a phrase that goes to the end of the id list
 89 |     				std::vector<uint64_t> phrase;
 90 |     				for(size_t i=start;i<stop;i++) {
 91 |     					phrase.push_back(query_ids[i]);
 92 |     				}
 93 |     				phrases.push_back(phrase);
 94 |     				start = stop;
 95 |     			} else {
 96 |     				// for this term we have not found any phrase 
 97 |     				// with it. add it as a single
 98 |     				std::vector<uint64_t> single;
 99 |     				single.push_back(query_ids[start]);
100 |     				phrases.push_back(single);
101 |     				start++;
102 |     			}
103 |     		}
104 |     	}
105 | 
106 |     	// check if all phrases are uniq
107 |     	query_t q;
108 |     	auto itr = phrases.begin();
109 |     	while(itr != phrases.end()) {
110 |     		auto cur_list = *itr;
111 |     		uint64_t num_equal = 0;
112 |     		auto next = itr+1;
113 |     		while(next != phrases.end()) {
114 |     			auto next_list = *next;
115 |     			if(std::equal(cur_list.begin(),cur_list.end(),next_list.begin())) {
116 |     				num_equal++;
117 |     				next = phrases.erase(next);
118 |     			} else {
119 |     				next++;
120 |     			}
121 |     		}
122 | 
123 |     		/* get the string representation */
124 |     		std::vector<std::string> qry_str;
125 |     		for(const auto& id : cur_list) {
126 |     			auto rmitr = reverse_mapping.find(id);
127 |                 if(rmitr != reverse_mapping.end()) {
128 |     			    qry_str.push_back(rmitr->second);
129 |                 }
130 |     		}
131 |     		std::get<1>(q).emplace_back(*itr,qry_str,num_equal);
132 |     		itr++;
133 |     	}
134 |         std::sort(std::get<1>(q).begin(),std::get<1>(q).end()); // sort
135 |     	return q;
136 |     }
137 | };
138 | }// end namespace
139 | 
140 | #endif
141 | 


--------------------------------------------------------------------------------
/include/surf/query.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef SURF_QUERY_HPP
 3 | #define SURF_QUERY_HPP
 4 | 
 5 | #include <vector>
 6 | 
 7 | namespace surf {
 8 | 
 9 | // One ranked hit: a document id together with the score assigned to it.
10 | struct doc_score {
11 |     uint64_t doc_id = 0;  // zero defaults: a default-constructed doc_score
12 |     double score = 0.0;   // must not carry indeterminate values
13 |     doc_score() = default;
14 |     doc_score(uint64_t did,double s) : doc_id(did), score(s) {}
15 |     // Strict "greater" ordering by score; equal scores are ordered by doc_id.
16 |     bool operator>(const doc_score& rhs) const {
17 |         return (score == rhs.score) ? doc_id > rhs.doc_id : score > rhs.score;
18 |     }
19 | };
20 | 
21 | struct result {  // ranked list plus statistics counters
22 |     std::vector<doc_score> list;     // the scored documents
23 |     uint64_t wt_search_space{0};     // all four counters start at zero
24 |     uint64_t wt_nodes{0};
25 |     uint64_t postings_evaluated{0};
26 |     uint64_t postings_total{0};
27 | };
28 | 
29 | // A query term or phrase: its token ids, the matching strings and f_qt,
30 | // a frequency value handed in by whoever builds the token.
31 | struct query_token{
32 |     std::vector<uint64_t> token_ids;
33 |     std::vector<std::string> token_strs;
34 |     uint64_t f_qt;
35 |     query_token(const std::vector<uint64_t>& ids,
36 |                 const std::vector<std::string>& strs,
37 |                 uint64_t f) : token_ids(ids), token_strs(strs), f_qt(f) {}
38 |     // Lexicographic order on token_ids (what std::vector's operator< does).
39 |     bool operator<(const query_token& qt) const {
40 |         return token_ids < qt.token_ids;
41 |     }
42 | };
43 | 
44 | using query_t = std::tuple<uint64_t,std::vector<query_token>>;
45 | 
46 | 
47 | }
48 | 
49 | #endif


--------------------------------------------------------------------------------
/include/surf/query_parser.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SURF_QUERY_PARSER_HPP
  2 | #define SURF_QUERY_PARSER_HPP
  3 | 
  4 | #include <iostream>
  5 | #include <fstream>
  6 | #include <sstream>
  7 | #include <unordered_map>
  8 | #include <algorithm>
  9 | 
 10 | #include "surf/config.hpp"
 11 | #include "surf/query.hpp"
 12 | 
 13 | namespace surf{
 14 | 
 15 | struct query_parser {
 16 |     query_parser() = delete;
 17 |     using mapping_t = std::pair<std::unordered_map<std::string,uint64_t>,
 18 |                      std::unordered_map<uint64_t,std::string>
 19 |                      >;
 20 | 
 21 |     // Reads <collection_dir>/<DICT_FILENAME> (one "<term> <id>" entry per
 22 |     // line) into a term->id map and its inverse. Lines without such a pair,
 23 |     // e.g. blank ones, are skipped because std::stoull would throw on them.
 24 |     // Exits the process when the file cannot be opened.
 25 |     static mapping_t
 26 |          load_dictionary(const std::string& collection_dir)
 27 |     {
 28 |         mapping_t mapping; // filled in place, so the maps are not copied on return
 29 |         auto dict_file = collection_dir + "/" + surf::DICT_FILENAME;
 30 |         std::ifstream dfs(dict_file);
 31 |         if(!dfs.is_open()) {
 32 |             std::cerr << "cannot load dictionary file.";
 33 |             exit(EXIT_FAILURE);
 34 |         }
 35 |         std::string term_mapping;
 36 |         while( std::getline(dfs,term_mapping) ) {
 37 |             auto sep_pos = term_mapping.find(' ');
 38 |             if(sep_pos == std::string::npos || sep_pos+1 >= term_mapping.size()) {
 39 |                 continue; // no "<term> <id>" pair on this line
 40 |             }
 41 |             auto term = term_mapping.substr(0,sep_pos);
 42 |             uint64_t id = std::stoull(term_mapping.substr(sep_pos+1));
 43 |             mapping.first[term] = id;
 44 |             mapping.second[id] = term;
 45 |         }
 46 |         return mapping;
 47 |     }
 48 | 
 49 |     // Splits "<query id>;<token> <token> ..." into the query id and the
 50 |     // token ids. With `integers` tokens are parsed as ids, otherwise they
 51 |     // are looked up in `id_mapping`; unknown terms are reported on stderr
 52 |     // and dropped, or abort the parse (first element false) if
 53 |     // `only_complete` is set. Empty tokens from repeated blanks are ignored.
 54 |     static std::tuple<bool,uint64_t,std::vector<uint64_t>> 
 55 |         map_to_ids(const std::unordered_map<std::string,uint64_t>& id_mapping,
 56 |                    std::string query_str,bool only_complete,bool integers)
 57 |     {
 58 |         auto id_sep_pos = query_str.find(';');
 59 |         auto qry_id = std::stoull(query_str.substr(0,id_sep_pos));
 60 |         std::istringstream qry_content_stream(query_str.substr(id_sep_pos+1));
 61 |         std::vector<uint64_t> ids;
 62 |         for(std::string qry_token; std::getline(qry_content_stream,qry_token,' ');) {
 63 |             if(qry_token.empty()) continue; // "a  b" must not yield a "" term
 64 |             if(integers) {
 65 |                 ids.push_back(std::stoull(qry_token));
 66 |                 continue;
 67 |             }
 68 |             auto id_itr = id_mapping.find(qry_token);
 69 |             if(id_itr != id_mapping.end()) {
 70 |                 ids.push_back(id_itr->second);
 71 |                 continue;
 72 |             }
 73 |             std::cerr << "ERROR: could not find '" << qry_token << "' in the dictionary." << std::endl;
 74 |             if(only_complete) {
 75 |                 return std::make_tuple(false,qry_id,ids);
 76 |             }
 77 |         }
 78 |         return std::make_tuple(true,qry_id,ids);
 79 |     }
 80 | 
 81 |     // Parses one query line: each distinct token id becomes a single-term
 82 |     // query_token whose f_qt is its number of occurrences in the query; the
 83 |     // tokens are sorted. Yields {false, empty query} if map_to_ids fails.
 84 |     static std::pair<bool,query_t> parse_query(const mapping_t& mapping,
 85 |                 const std::string& query_str,bool only_complete = false,bool integers = false)
 86 |     {
 87 |         auto mapped_qry = map_to_ids(mapping.first,query_str,only_complete,integers);
 88 |         if(!std::get<0>(mapped_qry)) {
 89 |             return {false,query_t()};
 90 |         }
 91 |         // count how often each id occurs in the query
 92 |         std::unordered_map<uint64_t,uint64_t> qry_set;
 93 |         for(const auto& qid : std::get<2>(mapped_qry)) {
 94 |             qry_set[qid] += 1;
 95 |         }
 96 |         const auto& reverse_mapping = mapping.second;
 97 |         std::vector<query_token> query_tokens;
 98 |         for(const auto& qry_tok : qry_set) {
 99 |             std::vector<uint64_t> term(1,qry_tok.first);
100 |             std::vector<std::string> term_str;
101 |             auto rmitr = reverse_mapping.find(qry_tok.first);
102 |             if(rmitr != reverse_mapping.end()) {
103 |                 term_str.push_back(rmitr->second);
104 |             }
105 |             query_tokens.emplace_back(term,term_str,qry_tok.second);
106 |         }
107 |         std::sort(query_tokens.begin(),query_tokens.end()); // sort
108 |         return {true,query_t(std::get<1>(mapped_qry),query_tokens)};
109 |     }
110 | 
111 |     // Loads the dictionary of `collection_dir` and parses every non-empty
112 |     // line of `query_file`; lines rejected by parse_query are left out.
113 |     // `integers` is forwarded to parse_query. Exits the process when the
114 |     // query file cannot be opened.
115 |     static std::vector<query_t> parse_queries(const std::string& collection_dir,
116 |                                             const std::string& query_file,bool only_complete = false,
117 |                                             bool integers = false) 
118 |     {
119 |         /* load the mapping */
120 |         auto mapping = load_dictionary(collection_dir);
121 | 
122 |         /* parse queries */
123 |         std::ifstream qfs(query_file); 
124 |         if(!qfs.is_open()) {
125 |             std::cerr << "cannot load query file.";
126 |             exit(EXIT_FAILURE);
127 |         }
128 | 
129 |         std::vector<query_t> queries;
130 |         std::string query_str;
131 |         while( std::getline(qfs,query_str) ) {
132 |             if(query_str.empty()) continue; // std::stoull would throw on ""
133 |             auto parsed_qry = parse_query(mapping,query_str,only_complete,integers);
134 |             if(parsed_qry.first) {
135 |                 queries.emplace_back(parsed_qry.second);
136 |             }
137 |         }
138 |         return queries;
139 |     }
140 | };
141 | 
142 | }// end namespace
143 | 
144 | #endif
145 | 


--------------------------------------------------------------------------------
/include/surf/rank_functions.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SURF_RANK_FUNCTIONS_HPP
  2 | #define SURF_RANK_FUNCTIONS_HPP
  3 | 
  4 | #include "construct_doc_lengths.hpp"
  5 | #include "surf/config.hpp"
  6 | #include <sdsl/suffix_trees.hpp>
  7 | #include "sdsl/int_vector.hpp"
  8 | #include "surf/util.hpp"
  9 | 
 10 | using namespace sdsl;
 11 | 
 12 | namespace surf {
 13 | 
 14 | // BM25 ranking function. k1 and b are passed as integer template
 15 | // arguments scaled by 100 (the defaults mean k1 = 1.2 and b = 0.75).
 16 | template<uint32_t t_k1=120,uint32_t t_b=75>
 17 | struct rank_bm25 {
 18 |     static const double k1;
 19 |     static const double b;
 20 |     static const double epsilon_score;
 21 |     size_t num_docs = 0;
 22 |     size_t num_terms = 0;
 23 |     double avg_doc_len = 0;
 24 |     double min_doc_len = 0; // never computed by this ranker
 25 |     sdsl::int_vector<> doc_lengths;
 26 | 
 27 |     static std::string name() { return "bm25"; }
 28 | 
 29 |     rank_bm25(){}
 30 |     rank_bm25& operator=(const rank_bm25&) = default;
 31 | 
 32 |     // Loads the value cached under KEY_COLLEN and the document lengths
 33 |     // (constructing them when missing), then derives avg_doc_len.
 34 |     rank_bm25(cache_config& cconfig) {
 35 |         uint64_t collection_len = 0;
 36 |         load_from_cache(collection_len, surf::KEY_COLLEN, cconfig);
 37 |         num_terms = collection_len; // store it: a local named num_terms would shadow the member
 38 |         if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)){
 39 |             surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
 40 |         }
 41 |         load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);
 42 |         num_docs = doc_lengths.size();
 43 |         avg_doc_len = (double)collection_len / (double)num_docs;
 44 |         std::cerr<<"num_docs = "<<num_docs<<std::endl;
 45 |         std::cerr<<"avg_doc_len = "<<avg_doc_len<<std::endl;
 46 |     }
 47 |     double doc_length(size_t doc_id) const { return (double) doc_lengths[doc_id]; }
 48 |     double calc_doc_weight(double ) const { return 0; }
 49 |     // Score contribution of one term: query weight w_qt (floored at
 50 |     // epsilon_score) times document weight w_dt. F_t and the flag are unused.
 51 |     double calculate_docscore(const double f_qt,const double f_dt,const double f_t,
 52 |                               const double F_t,const double W_d,bool) const
 53 |     {
 54 |         double w_qt = std::max(epsilon_score, log((num_docs - f_t + 0.5) / (f_t+0.5)) * f_qt);
 55 |         double K_d = k1*((1-b) + (b*(W_d/avg_doc_len)));
 56 |         double w_dt = ((k1+1)*f_dt) / (K_d + f_dt);
 57 |         return w_dt*w_qt;
 58 |     }
 59 | };
 60 | 
 61 | 
 62 | // BM25 variant that can score against the smallest document length
 63 | // (use_W_d == false) instead of the document's real length.
 64 | template<uint32_t t_k1=120,uint32_t t_b=75>
 65 | struct rank_bm25_simple_est {
 66 |     static const double k1;
 67 |     static const double b;
 68 |     static const double epsilon_score;
 69 |     size_t num_docs = 0;
 70 |     size_t num_terms = 0;
 71 |     double avg_doc_len = 0;
 72 |     double min_doc_len = 0;
 73 |     sdsl::int_vector<> doc_lengths;
 74 | 
 75 |     static std::string name() { return "bm25_simple_est"; }
 76 | 
 77 |     rank_bm25_simple_est(){}
 78 |     rank_bm25_simple_est& operator=(const rank_bm25_simple_est&) = default;
 79 | 
 80 |     // Loads the value cached under KEY_COLLEN and the document lengths
 81 |     // (constructing them when missing), then derives avg/min length.
 82 |     rank_bm25_simple_est(cache_config& cconfig) {
 83 |         uint64_t collection_len = 0;
 84 |         load_from_cache(collection_len, surf::KEY_COLLEN, cconfig);
 85 |         num_terms = collection_len; // store it: a local named num_terms would shadow the member
 86 |         if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)){
 87 |             surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
 88 |         }
 89 |         load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);
 90 |         num_docs = doc_lengths.size();
 91 |         avg_doc_len = (double)collection_len / (double)num_docs;
 92 |         if(num_docs > 0) min_doc_len = *std::min_element(doc_lengths.begin(),doc_lengths.end());
 93 |         std::cerr<<"num_docs = "<<num_docs<<std::endl;
 94 |         std::cerr<<"avg_doc_len = "<<avg_doc_len<<std::endl;
 95 |         std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
 96 |     }
 97 |     double doc_length(size_t doc_id) const { return (double) doc_lengths[doc_id]; }
 98 |     double calc_doc_weight(double) const { return 0; }
 99 |     // Query weight w_qt (floored at epsilon_score) times document weight
100 |     // w_dt. When use_W_d is false W_d is replaced by min_doc_len, which for
101 |     // positive k1 and b gives the largest w_dt for this f_dt. F_t is unused.
102 |     double calculate_docscore(const double f_qt,const double f_dt,const double f_t,
103 |                               const double F_t,double W_d,bool use_W_d = true) const
104 |     {
105 |         if(!use_W_d) {
106 |             W_d = min_doc_len;
107 |         }
108 |         double w_qt = std::max(epsilon_score, log((num_docs - f_t + 0.5) / (f_t+0.5)) * f_qt);
109 |         double K_d = k1*((1-b) + (b*(W_d/avg_doc_len)));
110 |         double w_dt = ((k1+1)*f_dt) / (K_d + f_dt);
111 |         return w_dt*w_qt;
112 |     }
113 | };
114 | 
115 | // Ranker named "lmds". The integer template argument becomes the
116 | // smoothing parameter used by both weighting functions.
117 | template<uint32_t t_smoothing_param = 2500>
118 | class rank_lmds
119 | {
120 |     static const double smoothing_param;
121 |     size_t num_docs;
122 |     uint64_t num_terms;
123 |     double avg_doc_len;
124 |     double min_doc_len;
125 |     sdsl::int_vector<> doc_lengths;
126 | 
127 | public:
128 |     static std::string name() { return "lmds"; }
129 | 
130 |     rank_lmds() = default;
131 | 
132 |     // Reads num_terms and the document lengths from the cache (building
133 |     // the lengths first if absent) and derives avg/min document length.
134 |     rank_lmds(cache_config& cconfig) {
135 |         load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);
136 |         if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)){
137 |             surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
138 |         }
139 |         load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);
140 |         num_docs = doc_lengths.size();
141 |         std::cerr<<"num_docs = "<<num_docs<<std::endl;
142 |         avg_doc_len = (double)num_terms / (double)num_docs;
143 |         min_doc_len = *std::min_element(doc_lengths.begin(),doc_lengths.end());
144 |         std::cerr<<"avg_doc_len = "<<avg_doc_len<<std::endl;
145 |         std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
146 |     }
147 | 
148 |     double doc_length(size_t doc_id) const { return (double) doc_lengths[doc_id]; }
149 | 
150 |     // Term score: log(1 + (f_dt / smoothing_param) * (num_terms / F_t)).
151 |     // f_qt, f_t, W_d and use_W_d do not enter the result.
152 |     double calculate_docscore(const double f_qt,const double f_dt,const double f_t,
153 |                               const double F_t,double W_d,bool use_W_d = true) const
154 |     {
155 |         return log(((f_dt/smoothing_param) * (num_terms/F_t)) + 1);
156 |     }
157 | 
158 |     // Per-document weight: log(smoothing_param / (smoothing_param + W_d)).
159 |     double calc_doc_weight(double W_d) const {
160 |         return log(smoothing_param / (smoothing_param + W_d) );
161 |     }
162 | };
163 | 
164 | // Ranker named "tfidf": a log-scaled term frequency times a log-scaled
165 | // inverse document frequency, divided by the document length W_d.
166 | class rank_tfidf
167 | {
168 |     static const double smoothing_param;
169 |     size_t num_docs;
170 |     uint64_t num_terms;
171 |     double min_doc_len;
172 |     sdsl::int_vector<> doc_lengths;
173 | 
174 | public:
175 |     static std::string name() { return "tfidf"; }
176 | 
177 |     rank_tfidf() = default;
178 | 
179 |     // Reads num_terms and the document lengths from the cache (building
180 |     // the lengths first if absent) and records the smallest length.
181 |     rank_tfidf(cache_config& cconfig) {
182 |         load_from_cache(num_terms, surf::KEY_COLLEN, cconfig);
183 |         if (!cache_file_exists(surf::KEY_DOC_LENGTHS, cconfig)){
184 |             surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cconfig);
185 |         }
186 |         load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cconfig);
187 |         num_docs = doc_lengths.size();
188 |         std::cerr<<"num_docs = "<<num_docs<<std::endl;
189 |         min_doc_len = *std::min_element(doc_lengths.begin(),doc_lengths.end());
190 |         std::cerr<<"min_doc_len = "<<min_doc_len<<std::endl;
191 |     }
192 | 
193 |     double doc_length(size_t doc_id) const { return (double) doc_lengths[doc_id]; }
194 | 
195 |     // Score of one term: (1/W_d) * (1 + log f_dt) * log(1 + num_docs/f_t).
196 |     // f_qt, F_t and use_W_d are ignored; W_d is used even when use_W_d
197 |     // is false.
198 |     double calculate_docscore(const double f_qt,const double f_dt,const double f_t,
199 |                               const double F_t,double W_d,bool use_W_d = true) const
200 |     {
201 |         const double inv_len = 1.0/W_d;
202 |         const double tf_part = 1.0 + log(f_dt);
203 |         const double idf_part = log(1.0 + ((double)num_docs/f_t));
204 |         return inv_len * tf_part * idf_part;
205 |     }
206 | 
207 |     double calc_doc_weight(double ) const { return 0; }
208 | };
209 | 
210 | 
211 | // Trivial ranker named "freq": a term's score is simply f_dt; document
212 | // lengths and document weights are always reported as 0.
213 | struct rank_freq
214 | {
215 |     static std::string name() { return "freq"; }
216 | 
217 |     rank_freq() = default;
218 | 
219 |     // Needs nothing from the cache; the argument is ignored.
220 |     rank_freq(cache_config& ) {}
221 | 
222 |     // doc_id is ignored.
223 |     double doc_length(size_t doc_id) const { return 0; }
224 | 
225 |     // Returns f_dt unchanged; every other argument is ignored. The
226 |     // signature mirrors the other rankers in this header.
227 |     double calculate_docscore(const double f_qt,const double f_dt,const double f_t,
228 |                               const double F_t,double W_d,bool use_W_d = true) const
229 |     {
230 |         return f_dt;
231 |     }
232 | 
233 |     double calc_doc_weight(double) const { return 0; }
234 | };
235 | 
236 | // Out-of-class definitions of the rankers' static constants.
237 | // BM25: k1 and b arrive as integer template arguments scaled by 100.
238 | template<uint32_t t_k1,uint32_t t_b>
239 | const double rank_bm25<t_k1,t_b>::k1 = static_cast<double>(t_k1)/100.0;
240 | template<uint32_t t_k1,uint32_t t_b>
241 | const double rank_bm25<t_k1,t_b>::b = static_cast<double>(t_b)/100.0;
242 | template<uint32_t t_k1,uint32_t t_b>
243 | const double rank_bm25<t_k1,t_b>::epsilon_score = 1e-6;
244 | 
245 | // The same three constants for the simple-estimate BM25 variant.
246 | template<uint32_t t_k1,uint32_t t_b>
247 | const double rank_bm25_simple_est<t_k1,t_b>::k1 = static_cast<double>(t_k1)/100.0;
248 | template<uint32_t t_k1,uint32_t t_b>
249 | const double rank_bm25_simple_est<t_k1,t_b>::b = static_cast<double>(t_b)/100.0;
250 | template<uint32_t t_k1,uint32_t t_b>
251 | const double rank_bm25_simple_est<t_k1,t_b>::epsilon_score = 1e-6;
252 | 
253 | // LMDS: the template argument is used as the smoothing parameter as is.
254 | template<uint32_t t_s>
255 | const double rank_lmds<t_s>::smoothing_param = static_cast<double>(t_s);
256 | 
257 | 
258 | } // end surf namespace
259 | 
260 | #endif
261 | 


--------------------------------------------------------------------------------
/include/surf/util.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SURF_UTIL_HPP
  2 | #define SURF_UTIL_HPP
  3 | 
  4 | #include "surf/config.hpp"
  5 | #include "sdsl/io.hpp"
  6 | 
  7 | #include <string>
  8 | #include <unistd.h>
  9 | #include <stdlib.h>
 10 | #include <iostream>
 11 | 
 12 | #include <sys/types.h>
 13 | #include <sys/stat.h>
 14 | #include <unistd.h>
 15 | 
 16 | namespace surf{
 17 | 
 18 | // Returns true iff `dir` names an existing directory (stat() follows
 19 | // symlinks). S_ISDIR compares the whole file-type field; a plain
 20 | // `S_IFDIR & st_mode` bit test would also accept block devices and sockets.
 21 | // `inline` keeps this header-defined function safe to include in several TUs.
 22 | inline bool
 23 | directory_exists(std::string dir)
 24 | {
 25 |     struct stat sb;
 26 |     return stat(dir.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode);
 27 | }
 28 | 
 29 | // True iff `file_name` can be opened through sdsl::isfstream. `inline`
 30 | // avoids duplicate-symbol link errors when several TUs include this header.
 31 | inline bool
 32 | file_exists(std::string file_name)
 33 | {
 34 |     sdsl::isfstream in(file_name);
 35 |     const bool opened = static_cast<bool>(in);
 36 |     if (opened) in.close();
 37 |     return opened;
 38 | }
 39 | 
 40 | // Returns true iff `file` itself is a symbolic link. lstat() is required
 41 | // because stat() follows links and therefore never reports S_IFLNK; the
 42 | // type is checked with S_ISLNK since S_IFLNK shares bits with S_IFREG.
 43 | // A dangling link still counts as existing. `inline`: header-defined.
 44 | inline bool
 45 | symlink_exists(std::string file)
 46 | {
 47 |     struct stat sb;
 48 |     return lstat(file.c_str(), &sb) == 0 && S_ISLNK(sb.st_mode);
 49 | }
 50 | 
 51 | // Creates `dir` with mode 0777 (subject to the umask) unless it already
 52 | // exists; when mkdir fails the error is printed and the process exits.
 53 | // `inline`: header-defined, so it must be safe to include in several TUs.
 54 | inline void
 55 | create_directory(std::string dir)
 56 | {
 57 |     if (directory_exists(dir) || mkdir(dir.c_str(),0777) == 0) return;
 58 |     perror("could not create directory");
 59 |     exit(EXIT_FAILURE);
 60 | }
 61 | 
 62 | // Checks that `collection_dir` is a directory holding TEXT_FILENAME,
 63 | // DICT_FILENAME and DOCNAMES_FILENAME; otherwise explains the problem on
 64 | // stderr. `inline` keeps this header usable from several translation units.
 65 | inline bool
 66 | valid_collection(std::string collection_dir)
 67 | {
 68 |     if (! surf::directory_exists(collection_dir)) {
 69 |         std::cerr << collection_dir << " is not a valid directory.\n";
 70 |         return false;
 71 |     }
 72 |     /* make sure the necessary files are present */
 73 |     const bool complete = surf::file_exists(collection_dir+"/"+surf::TEXT_FILENAME)
 74 |                        && surf::file_exists(collection_dir+"/"+surf::DICT_FILENAME)
 75 |                        && surf::file_exists(collection_dir+"/"+surf::DOCNAMES_FILENAME);
 76 |     if (complete) return true;
 77 |     std::cerr << collection_dir << " does not contain a valid surf collection.\n";
 78 |     std::cerr << "The files " << surf::TEXT_FILENAME << " , " << surf::DICT_FILENAME 
 79 |               << " , " << surf::DOCNAMES_FILENAME << " have to be present" << std::endl;
 80 |     return false;
 81 | }
 82 | 
 83 | 
 84 | 
 85 | // Validates `collection_dir`, creates its index/ and results/ directories
 86 | // if needed, links the collection text into index/ and returns a
 87 | // cache_config for index/ with all surf::storage_keys registered.
 88 | inline sdsl::cache_config
 89 | parse_collection(std::string collection_dir)
 90 | {
 91 |     /* check if all the directories exist */
 92 |     if( !surf::valid_collection(collection_dir) ) {
 93 |         exit(EXIT_FAILURE);
 94 |     }
 95 |     const std::string index_directory = collection_dir+"/index/";
 96 |     surf::create_directory(index_directory);
 97 |     surf::create_directory(collection_dir+"/results/");
 98 | 
 99 |     /* populate cache config */
100 |     sdsl::cache_config config(false,index_directory,"SURF");
101 | 
102 |     /* create symlink to text in index directory */
103 |     const std::string symlink_name = cache_file_name(sdsl::conf::KEY_TEXT_INT,config);
104 |     if( ! surf::symlink_exists(symlink_name) ) {
105 |         std::string collection_file = collection_dir+"/"+surf::TEXT_FILENAME;
106 |         char* col_file_absolute = realpath(collection_file.c_str(), NULL);
107 |         // realpath() returns NULL on failure; never pass that to symlink()
108 |         if( col_file_absolute == NULL ||
109 |             symlink(col_file_absolute,symlink_name.c_str()) != 0) {
110 |             perror("cannot create symlink to collection file in index directory");
111 |             exit(EXIT_FAILURE);
112 |         }
113 |         free(col_file_absolute);
114 |     }
115 | 
116 |     /* register files that are present */
117 |     for(const auto& key : surf::storage_keys) {
118 |         register_cache_file(key,config);
119 |     }
120 |     return config;
121 | }
121 | 
122 | 
123 | } // end of surf namespace
124 | #endif
125 | 


--------------------------------------------------------------------------------
/queries/trec0406-adhoc.qry:
--------------------------------------------------------------------------------
  1 | 701;us oil industry history
  2 | 702;pearl farming
  3 | 703;us against international criminal court
  4 | 704;green party politics view
  5 | 705;iraq foreign debt reduction
  6 | 706;control type ii diabetes
  7 | 707;aspirin cancer prevention
  8 | 708;decorative slate source
  9 | 709;horse racing jockey weight
 10 | 710;prostate cancer treatment
 11 | 711;train station security measure
 12 | 712;pyramid scheme
 13 | 713;chesapeake bay maryland clean
 14 | 714;license restriction old driver
 15 | 715;schizophrenia drug
 16 | 716;spam arrest sue
 17 | 717;gifted talented student program
 18 | 718;control acid rain
 19 | 719;cruise ship damage sea life
 20 | 720;federal welfare reform
 21 | 721;census data application
 22 | 722;iran terrorism
 23 | 723;executive privilege
 24 | 724;iran contra
 25 | 725;low white blood cell count
 26 | 726;hubble telescope repair
 27 | 727;church arson
 28 | 728;whale save endanger
 29 | 729;whistle blower department of defense
 30 | 730;gastric bypass complication
 31 | 731;kurd history
 32 | 732;us cheese production
 33 | 733;airline overbooke
 34 | 734;recycle success
 35 | 735;afghanistan women condition
 36 | 736;location bse infection
 37 | 737;enron california energy crisis
 38 | 738;anthrax hoax
 39 | 739;habitat for humanity
 40 | 740;regulate assist living maryland
 41 | 741;artificial intelligence
 42 | 742;hedge funds fraud protection
 43 | 743;freighter ship registration
 44 | 744;counterfeit id punishment
 45 | 745;doomsday cult
 46 | 746;outsource job india
 47 | 747;library computer oversight
 48 | 748;nuclear reactor type
 49 | 749;puerto rico state
 50 | 750;john edwards women issue
 51 | 751;scrabble player
 52 | 752;dam removal
 53 | 753;bully prevention program
 54 | 754;domestic adoption law
 55 | 755;scotland highland games
 56 | 756;volcanic activity
 57 | 757;mural
 58 | 758;embryonic stem cell
 59 | 759;civil war battle reenactment
 60 | 760;america muslim mosque school
 61 | 761;problem of hmong immigrant
 62 | 762;history of physician in america
 63 | 763;hunting death
 64 | 764;increase mass transit use
 65 | 765;ephedra ma huang death
 66 | 766;diamond smuggle
 67 | 767;pharmacist license requirement
 68 | 768;women in state legislature
 69 | 769;kroll associate employee
 70 | 770;kyrgyzstan united states relations
 71 | 771;deform leopard frog
 72 | 772;flag display rule
 73 | 773;pennsylvania slot machine gamble
 74 | 774;cause of homeless
 75 | 775;commercial candy maker
 76 | 776;magnet school success
 77 | 777;hybrid alternative fuel car
 78 | 778;golden ratio
 79 | 779;javelina range and description
 80 | 780;arable land
 81 | 781;squirrel control and protection
 82 | 782;orange variety season
 83 | 783;school mercury poison
 84 | 784;mersenne prime
 85 | 785;ivory billed woodpecker
 86 | 786;yew tree
 87 | 787;sunflower cultivation
 88 | 788;reverse mortgage
 89 | 789;abandoned mine reclamation
 90 | 790;women rights in saudi arabia
 91 | 791;gullah geechee language culture
 92 | 792;social security means test
 93 | 793;bagpipe band
 94 | 794;pet therapy
 95 | 795;notable cock spaniel
 96 | 796;blue grass music festival history
 97 | 797;reintroduction of gray wolf
 98 | 798;massachusetts textile mill
 99 | 799;animal in alzheimer research
100 | 800;ovarian cancer treatment
101 | 801;kudzu pueraria lobata
102 | 802;volcano eruption global temperature
103 | 803;may day
104 | 804;ban on human clone
105 | 805;identity theft passport
106 | 806;doctor without border
107 | 807;sugar tariff rate quota
108 | 808;north korea counterfeit
109 | 809;wetland wastewater treatment
110 | 810;timeshare resale
111 | 811;handwriting recognition
112 | 812;total knee replacement surgery
113 | 813;atlantic intracoastal waterway
114 | 814;johnstown flood
115 | 815;coast guard rescue
116 | 816;usaid assistance to galapago
117 | 817;sports stadium name rights
118 | 818;chaco culture national park
119 | 819;1890 census
120 | 820;import fire ant
121 | 821;internet work at home scam
122 | 822;custer last stand
123 | 823;continue care retirement community
124 | 824;civil air patrol
125 | 825;national guard involve in iraq
126 | 826;florida seminole indian
127 | 827;hidden markov model hmm
128 | 828;secret shop
129 | 829;spain civil war support
130 | 830;model railroad
131 | 831;dulles airport security
132 | 832;labor union activity
133 | 833;iceland government
134 | 834;global position system earthquake
135 | 835;big dig pork
136 | 836;illegal immigrant wages
137 | 837;eskimo history
138 | 838;urban suburban coyote
139 | 839;textile dye technique
140 | 840;geyser
141 | 841;camel north america
142 | 842;david mccullough
143 | 843;pol pot
144 | 844;segment duplicate
145 | 845;new jersey tomato
146 | 846;heredity and obese
147 | 847;portugal world war ii
148 | 848;radio station call letters
149 | 849;scale vector graphics
150 | 850;mississippi river flood
151 | 


--------------------------------------------------------------------------------
/queries/trec2005-efficiency-10.qry:
--------------------------------------------------------------------------------
 1 | 70;bentley automobile
 2 | 211;downtown orlando florida
 3 | 257;shannyn sossamon
 4 | 259;theoriginal rainbow cone
 5 | 450;heterogeneous uterus
 6 | 527;john steinbeck
 7 | 591;diamond blackfan anemia
 8 | 626;j boog
 9 | 717;pregnancy
10 | 736;america fidelity insurance company
11 | 


--------------------------------------------------------------------------------
/queries/trec2005-efficiency-100.qry:
--------------------------------------------------------------------------------
  1 | 70;bentley automobile
  2 | 211;downtown orlando florida
  3 | 257;shannyn sossamon
  4 | 259;theoriginal rainbow cone
  5 | 450;heterogeneous uterus
  6 | 527;john steinbeck
  7 | 591;diamond blackfan anemia
  8 | 626;j boog
  9 | 717;pregnancy
 10 | 736;america fidelity insurance company
 11 | 775;can i drive a motorcycle wit a driving permit
 12 | 892;reverse phone look up
 13 | 966;katie holmes
 14 | 1113;bella frisk
 15 | 1131;real estate value
 16 | 1180;pc to xbox interface for steer wheel
 17 | 1209;in living color jamie foxx wanda
 18 | 1235;potterybarn
 19 | 1243;canne watches
 20 | 1248;samsung lcd flat screen
 21 | 1253;rom
 22 | 1264;northwestern memorial hospital
 23 | 1286;boxer breeder in new york
 24 | 1412;cheap ticket
 25 | 1468;texas news paper
 26 | 1550;warre
 27 | 1551;stocks
 28 | 1613;driving directions
 29 | 1661;cheat code
 30 | 1796;john cena
 31 | 1847;gameboy advance fire emblem walkthrough
 32 | 1903;hotel in wright city missouri
 33 | 1969;bead jewlry
 34 | 2118;gienn county child sopport
 35 | 2146;mcfarlane art
 36 | 2217;ca lottery numbers
 37 | 2271;field s
 38 | 2321;weather by the hour
 39 | 2343;house for rent in charlotte nc
 40 | 2348;john dillon day
 41 | 2383;bad homburg germany
 42 | 2388;infiltration of liver
 43 | 2449;bank of america
 44 | 2550;jewish camp in queens
 45 | 2568;cheap rental apartment in miami
 46 | 2602;avon ohio school
 47 | 2669;las vega show
 48 | 2761;goldie hawn
 49 | 2825;mike jones
 50 | 2826;translator
 51 | 2828;leon county school
 52 | 3064;jagermeister
 53 | 3068;recipy
 54 | 3091;provo lds temple prayer roll
 55 | 3161;ebay
 56 | 3234;frontier hotel and casino
 57 | 3281;sympathy john r
 58 | 3398;people
 59 | 3421;boutique in key west florida
 60 | 3443;text letter of condolence of a mother
 61 | 3628;animal shelter clermont
 62 | 3764;abercrombie
 63 | 3828;hen mankell
 64 | 3845;science magazine
 65 | 3895;cinderella man movie
 66 | 3942;heart candle tin
 67 | 3950;phoenix arizona state death record
 68 | 3972;audi usa
 69 | 4001;baby falcon and falconry equipment
 70 | 4151;shane hmiel
 71 | 4153;luca county auditor
 72 | 4172;all marine
 73 | 4212;copycat recipe
 74 | 4234;utah chat rooms
 75 | 4290;hairstyle
 76 | 4379;father s day idea
 77 | 4521;birth control pill
 78 | 4526;nickelodean hotel
 79 | 4580;johnny depp
 80 | 4684;alaskaair
 81 | 4688;spiderman vs venom
 82 | 4720;deconsal
 83 | 4802;baby einsten
 84 | 4829;nietzche
 85 | 4846;lake geneva boat
 86 | 4855;halle berry
 87 | 4942;map of saudi arabia
 88 | 5001;tcc
 89 | 5005;north fork bank
 90 | 5016;train station
 91 | 5023;the honeymoon movie
 92 | 5036;disney free fr
 93 | 5183;key west
 94 | 5209;d d home medium
 95 | 5241;marijuana plant
 96 | 5274;very yonug children model art gallery
 97 | 5302;ny lottery
 98 | 5332;zona girdle zone
 99 | 5336;prom dress
100 | 5340;history of communication
101 | 


--------------------------------------------------------------------------------
/queries/trec2006-efficiency-10.qry:
--------------------------------------------------------------------------------
 1 | 68;slight anemia high tibc
 2 | 250;medical board of georgia administrative action
 3 | 503;leading export in kiribati
 4 | 561;why is is important to study primary source document
 5 | 683;sample cosmetic product recall plan
 6 | 918;phone number for middle town ri dmv
 7 | 1153;campo band of mission indian
 8 | 1178;spring mountain ranch las vega nv
 9 | 1206;ny city employment opportunity
10 | 1394;prevent gas in bottlefe baby
11 | 


--------------------------------------------------------------------------------
/queries/trec2006-efficiency-100.qry:
--------------------------------------------------------------------------------
  1 | 68;slight anemia high tibc
  2 | 250;medical board of georgia administrative action
  3 | 503;leading export in kiribati
  4 | 561;why is is important to study primary source document
  5 | 683;sample cosmetic product recall plan
  6 | 918;phone number for middle town ri dmv
  7 | 1153;campo band of mission indian
  8 | 1178;spring mountain ranch las vega nv
  9 | 1206;ny city employment opportunity
 10 | 1394;prevent gas in bottlefe baby
 11 | 1465;hematology competent test
 12 | 1734;membrane in the egg
 13 | 1847;va medical record form
 14 | 1984;samhsa lidocaine
 15 | 2007;textile fiber identification act
 16 | 2127;animal that have vertebrae in there back different kind
 17 | 2176;special needs health insurance
 18 | 2198;julliard string quartet schedule
 19 | 2203;criminal law
 20 | 2244;operate room mistake
 21 | 2388;passport form
 22 | 2440;pre divorce stress
 23 | 2653;georgia seasonal fruits and vegetable
 24 | 2875;snohomish county evidence department
 25 | 2896;ac 61 107
 26 | 3219;us government parkersburg west virginia
 27 | 3414;wac code of washington
 28 | 3605;advocate for resident rights
 29 | 3622;city of milwaukee assessment
 30 | 3768;irs mileage expense rate
 31 | 3910;mark d shrive is locate gene
 32 | 3931;be 11 government form
 33 | 4257;apd albuquerque
 34 | 4314;pell federal grant
 35 | 4411;plan first medicaid
 36 | 4415;nys math exam
 37 | 4448;how to make steel form machine
 38 | 4523;employment opportunity with the us district court of arizona
 39 | 4572;mental health institute cherokee iowa
 40 | 4708;washing powder ingest
 41 | 4712;grass tillage
 42 | 4720;treatment for narcissism
 43 | 4727;five level of muscle contraction
 44 | 4740;clare b connaughton
 45 | 4875;tim cord
 46 | 4956;ca gov claim
 47 | 4983;lewis and clark item sent to jefferson at monticello
 48 | 5013;history of subway
 49 | 5017;los alto hill town government
 50 | 5138;blue winged warbler
 51 | 5258;test for ny city firefighter application
 52 | 5884;check insurance with ohio fair plan
 53 | 5927;were there any lives lost in the 1811 and 1812 earth quake that form reelfoot lake
 54 | 5941;citgo petroleum corporation
 55 | 5971;what should a freshmen do to prepare to be a sophomore in high school
 56 | 6062;culpeper county dogs adoption
 57 | 6161;hov lane compliance
 58 | 6480;complicated migraine
 59 | 6785;somerset refinery co
 60 | 6811;operate without reasonable control citation
 61 | 7005;nutrition and food service
 62 | 7078;webster county mo sales tax
 63 | 7172;oregon per
 64 | 7188;nurse practitioner in legislature
 65 | 7211;stony creek virginia fishing
 66 | 7450;s c rule on transcription of court record
 67 | 7552;trask venture fund
 68 | 7635;cycloheximide
 69 | 7712;bus from jfk to lga
 70 | 7956;brandon mayfield
 71 | 8018;section 8 apartments in terre haute
 72 | 8847;pennsylvania coin
 73 | 8946;channel stuffing and consignment sales and fasb
 74 | 8961;polycystic kidney and pericardial effusion
 75 | 9061;workforce development ma
 76 | 9178;irs head of household
 77 | 9275;prisoner of war life
 78 | 9523;business by federal id number
 79 | 9527;home equity loan for poor credit
 80 | 9535;finger injury statistics
 81 | 9540;houston anderson thyme surgery
 82 | 9726;space stars
 83 | 9730;my history of real estate tax nyc
 84 | 9839;teacher edition books online
 85 | 9840;connecticut attorney general
 86 | 9866;what schedule is methadone
 87 | 9950;gemini program nasa
 88 | 9957;childhood leukemia
 89 | 9977;infertile after iud
 90 | 9991;fuse panel diagram for old old original tennessee homeowner obtain
 91 | 10021;state of michigan state bird
 92 | 10063;regulatory system quiz
 93 | 10139;epa freon test
 94 | 10231;the waterpocket fold in capitol reef national park
 95 | 10395;cobalt urine level
 96 | 10397;san francisco maritime park management plan
 97 | 10470;california worker compensation law
 98 | 11060;lancaster va
 99 | 11077;toddler self regulation
100 | 11155;home for sale okalona ms
101 | 


--------------------------------------------------------------------------------
/queries/trec2006-efficiency.qry:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/queries/trec2006-efficiency.qry


--------------------------------------------------------------------------------
/results/trec8_wtdup_stat.R:
--------------------------------------------------------------------------------
 1 | # Plot, against node size, the cumulative share of DUP elements covered by
 2 | # nodes up to that size, and write the plot to trec8_wtdup_stat.pdf.
 3 | # Input: trec8_wtdup_stat.txt, comma separated, no header row; column V1 is
 4 | # used as the node size and V2 as the weight of that size.
 5 | # NOTE(review): V2 is presumably the number of nodes of that size - confirm.
 6 | wtdup <- read.csv("trec8_wtdup_stat.txt", header = FALSE)
 7 | 
 8 | # Elements contributed per node size. as.numeric() keeps the product (and
 9 | # the sums below) in double precision so large counts cannot overflow to NA.
10 | covered <- as.numeric(wtdup$V2) * as.numeric(wtdup$V1)
11 | # A plain scalar total; dividing a vector by the 1x1 matrix returned by
12 | # crossprod() relies on deprecated recycling of length-one arrays.
13 | total <- sum(covered)
14 | 
15 | pdf("trec8_wtdup_stat.pdf")
16 | plot(wtdup$V1, cumsum(covered) / total, ylim = c(0, 1),
17 |      xlab = "Node size", ylab = "Ratio of covered elements in DUP")
18 | dev.off()
10 | 


--------------------------------------------------------------------------------
/results/trec8_wtdup_stat.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simongog/surf/c8caa199391793395df85dede6df88c17514097e/results/trec8_wtdup_stat.pdf


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | !.cpp
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/src/doc_lengths.cpp:
--------------------------------------------------------------------------------
 1 | #include <unistd.h>
 2 | #include <stdlib.h>
 3 | #include <iostream>
 4 | #include <iomanip>
 5 | #include <ctime>
 6 | 
 7 | #include <string>
 8 | #include <sys/types.h>
 9 | #include <sys/stat.h>
10 | #include <unistd.h>
11 | #include "surf/util.hpp"
12 | #include "sdsl/config.hpp"
13 | #include "surf/construct_doc_lengths.hpp"
14 | 
15 | /* Options read from the command line. */
16 | struct cmdargs {
17 |     std::string collection_dir; /* argument of -c */
18 | };
19 | using cmdargs_t = cmdargs;
18 | 
19 | /*
20 |  * Print the usage summary of this tool to stdout.
21 |  * program: text shown as the command name; parse_args passes argv[0].
22 |  */
23 | void
24 | print_usage(char* program)
25 | {
26 |     /* -c is the only option parse_args accepts, so it is the only one listed */
27 |     fprintf(stdout,"%s -c <collection directory>\n",program);
28 |     fprintf(stdout,"where\n");
29 |     fprintf(stdout,"  -c <collection directory>  : the directory the collection is stored.\n");
30 | }
26 | 
27 | /*
28 |  * Read the command line with getopt and return the collected options.
29 |  * Unknown options only print the usage text. If no collection directory
30 |  * was given, an error and the usage text are printed and the process
31 |  * terminates with EXIT_FAILURE.
32 |  */
33 | cmdargs_t
34 | parse_args(int argc,char* const argv[])
35 | {
36 |     cmdargs_t parsed;
37 |     parsed.collection_dir = "";
38 | 
39 |     for (int opt = getopt(argc,argv,"c:"); opt != -1; opt = getopt(argc,argv,"c:")) {
40 |         if (opt == 'c') {
41 |             parsed.collection_dir = optarg;
42 |         } else {
43 |             /* '?' or any other unexpected value */
44 |             print_usage(argv[0]);
45 |         }
46 |     }
47 | 
48 |     if (parsed.collection_dir.empty()) {
49 |         std::cerr << "Missing command line parameters.\n";
50 |         print_usage(argv[0]);
51 |         exit(EXIT_FAILURE);
52 |     }
53 |     return parsed;
54 | }
50 | 
51 | /*
52 |  * Print a histogram of the document lengths of a collection to stdout:
53 |  * a "count;len" header followed by one "<count>;<len>" line per distinct
54 |  * length, in ascending order of length.
55 |  * The lengths are loaded from the cache of the collection; if the cache
56 |  * file does not exist yet, surf::construct_doc_lengths is called first.
57 |  */
58 | int main(int argc,char* const argv[])
59 | {
60 |     /* parse command line */
61 |     cmdargs_t args = parse_args(argc,argv);
62 | 
63 |     /* parse repo */
64 |     auto cc = surf::parse_collection(args.collection_dir);
65 | 
66 |     sdsl::int_vector<> doc_lengths;
67 |     if (!sdsl::cache_file_exists(surf::KEY_DOC_LENGTHS, cc)){
68 |         surf::construct_doc_lengths<sdsl::int_alphabet_tag::WIDTH>(cc);
69 |     }
70 |     sdsl::load_from_cache(doc_lengths, surf::KEY_DOC_LENGTHS, cc);
71 | 
72 |     /* sorting puts equal lengths next to each other, so one pass can count them */
73 |     std::sort(doc_lengths.begin(),doc_lengths.end());
74 | 
75 |     std::cout << "count;len\n";
76 |     if (doc_lengths.size() == 0) {
77 |         /* no documents: print the header only, doc_lengths[0] does not exist */
78 |         std::cout << std::flush;
79 |         return EXIT_SUCCESS;
80 |     }
81 | 
82 |     /* Copy elements into plain integers. With `auto` the variable would get
83 |        whatever type operator[] returns. NOTE(review): for sdsl::int_vector<>
84 |        that is presumably a proxy referring to the element - confirm. */
85 |     uint64_t cur = doc_lengths[0];
86 |     size_t count = 1;
87 |     for(size_t i=1;i<doc_lengths.size();i++) {
88 |         const uint64_t len = doc_lengths[i];
89 |         if(len != cur) {
90 |             std::cout << count << ";" << cur << "\n";
91 |             cur = len;
92 |             count = 0;
93 |         }
94 |         count++;
95 |     }
96 |     std::cout << count << ";" << cur << std::endl;
97 | 
98 |     return EXIT_SUCCESS;
99 | }
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/src/surf_daemon.cpp:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdlib.h>
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <ctime>
  6 | 
  7 | #include <string>
  8 | #include <sys/types.h>
  9 | #include <sys/stat.h>
 10 | #include <unistd.h>
 11 | #include "surf/query.hpp"
 12 | #include "sdsl/config.hpp"
 13 | #include "surf/indexes.hpp"
 14 | #include "surf/query_parser.hpp"
 15 | #include "surf/comm.hpp"
 16 | #include "surf/phrase_parser.hpp"
 17 | #include "surf/rank_functions.hpp"
 18 | 
 19 | #include "zmq.hpp"
 20 | 
 21 | /* Options read from the command line. */
 22 | struct cmdargs {
 23 |     std::string collection_dir; /* argument of -c */
 24 |     std::string port;           /* argument of -p */
 25 |     bool load_dictionary;       /* switched off by -r */
 26 | };
 27 | using cmdargs_t = cmdargs;
 26 | 
 27 | /*
 28 |  * Write the option summary of the daemon to stdout.
 29 |  * program: text shown as the command name; parse_args passes argv[0].
 30 |  */
 31 | void
 32 | print_usage(char* program)
 33 | {
 34 |     static const char* const option_help =
 35 |         "where\n"
 36 |         "  -c <collection directory>  : the directory the collection is stored.\n"
 37 |         "  -p <port>  : the port the daemon is running on.\n"
 38 |         "  -r : do not load the dictionary.\n";
 39 | 
 40 |     fprintf(stdout,"%s -c <collection directory> -p <port> -r\n",program);
 41 |     fputs(option_help,stdout);
 42 | }
 36 | 
 37 | /*
 38 |  * Collect the daemon options from the command line using getopt.
 39 |  * Defaults: port "12345", dictionary loading enabled. Unknown options
 40 |  * only print the usage text. Without a collection directory an error and
 41 |  * the usage text are printed and the process terminates with EXIT_FAILURE.
 42 |  */
 43 | cmdargs_t
 44 | parse_args(int argc,char* const argv[])
 45 | {
 46 |     cmdargs_t parsed;
 47 |     parsed.collection_dir = "";
 48 |     parsed.port = "12345";
 49 |     parsed.load_dictionary = true;
 50 | 
 51 |     int opt;
 52 |     while ((opt=getopt(argc,argv,"c:p:r")) != -1) {
 53 |         if (opt == 'c') {
 54 |             parsed.collection_dir = optarg;
 55 |         } else if (opt == 'p') {
 56 |             parsed.port = optarg;
 57 |         } else if (opt == 'r') {
 58 |             parsed.load_dictionary = false;
 59 |         } else {
 60 |             /* '?' or any other unexpected value */
 61 |             print_usage(argv[0]);
 62 |         }
 63 |     }
 64 | 
 65 |     if (parsed.collection_dir.empty()) {
 66 |         std::cerr << "Missing command line parameters.\n";
 67 |         print_usage(argv[0]);
 68 |         exit(EXIT_FAILURE);
 69 |     }
 70 |     return parsed;
 71 | }
 68 | 
 69 | /*
 70 |  * Load the index of a collection and serve query requests over ZeroMQ.
 71 |  *
 72 |  * After the command line is parsed, the dictionary (unless -r was given)
 73 |  * and the index are loaded and a ZMQ_REP socket is bound to the given
 74 |  * port. Requests are handled one at a time until a request of type
 75 |  * REQ_TYPE_QUIT arrives. Every other request gets exactly one reply: a
 76 |  * surf_time_resp with timings and statistics, or, if the request has
 77 |  * output_results set, a surf_results message.
 78 |  */
 79 | int main(int argc,char* const argv[])
 80 | {
 81 |     using clock = std::chrono::high_resolution_clock;
 82 |     /* parse command line */
 83 |     cmdargs_t args = parse_args(argc,argv);
 84 | 
 85 |     /* parse repo */
 86 |     auto cc = surf::parse_collection(args.collection_dir);
 87 |     char tmp_str[256] = {0};
 88 |     /* copy at most 255 bytes so the zero-filled buffer stays NUL terminated */
 89 |     strncpy(tmp_str,args.collection_dir.c_str(),sizeof(tmp_str)-1);
 90 |     std::string base_name = basename(tmp_str);
 91 | 
 92 |     /* parse queries */
 93 |     surf::query_parser::mapping_t term_map;
 94 |     if(args.load_dictionary) {
 95 |         std::cout << "Loading dictionary and creating term map." << std::endl;
 96 |         term_map = surf::query_parser::load_dictionary(args.collection_dir);
 97 |     }
 98 | 
 99 |     /* define types */
100 |     using surf_index_t = INDEX_TYPE;
101 |     std::string index_name = IDXNAME;
102 | 
103 |     /* load the index */
104 |     std::cout << "Loading index." << std::endl;
105 |     surf_index_t index;
106 |     auto load_start = clock::now();
107 |     construct(index, "", cc, 0);
108 |     index.load(cc);
109 |     auto load_stop = clock::now();
110 |     auto load_time_sec = std::chrono::duration_cast<std::chrono::seconds>(load_stop-load_start);
111 |     std::cout << "Index loaded in " << load_time_sec.count() << " seconds." << std::endl;
112 | 
113 | 
114 |     /* daemon mode */
115 |     {
116 |         std::cout << "Starting daemon mode on port " << args.port << std::endl;
117 |         zmq::context_t context(1);
118 |         zmq::socket_t server(context, ZMQ_REP);
119 |         server.bind(std::string("tcp://*:"+args.port).c_str());
120 | 
121 |         while(true) {
122 |             zmq::message_t request;
123 |             /* wait for msg */
124 |             server.recv(&request);
125 | 
126 |             /* A message too short to hold a request must not be read as one.
127 |                The socket still has to answer before it can receive again,
128 |                so an error reply is sent. */
129 |             if(request.size() < sizeof(surf_qry_request)) {
130 |                 surf_time_resp surf_resp = surf_time_resp();
131 |                 surf_resp.status = REQ_PARSE_ERROR;
132 |                 zmq::message_t reply (sizeof(surf_time_resp));
133 |                 memcpy(reply.data(),&surf_resp,sizeof(surf_time_resp));
134 |                 server.send(reply);
135 |                 std::cout << "ERROR: MALFORMED REQUEST. SKIPPING" << std::endl;
136 |                 continue;
137 |             }
138 |             surf_qry_request* surf_req = (surf_qry_request*) request.data();
139 | 
140 |             if(surf_req->type == REQ_TYPE_QUIT) {
141 |                 std::cout << "Quitting..." << std::endl;
142 |                 break;
143 |             }
144 | 
145 |             /* perform query */
146 |             auto qry_start = clock::now();
147 | 
148 |             /* Never read past the query buffer, even if the sender left out
149 |                the terminating NUL. NOTE(review): assumes qry_str is a char
150 |                array member of surf_qry_request - confirm in surf/comm.hpp. */
151 |             const std::string qry_text(surf_req->qry_str,
152 |                                        strnlen(surf_req->qry_str,sizeof(surf_req->qry_str)));
153 | 
154 |             surf::query_t parsed_query;
155 |             bool parse_ok = false;
156 | 
157 |             if(surf_req->phrases) {
158 |                 /* without PHRASE_SUPPORT a phrase request is answered as a parse error */
159 | #ifdef PHRASE_SUPPORT
160 |                 const auto& id_mapping = term_map.first;
161 |                 const auto& reverse_mapping = term_map.second;
162 |                 auto qry_mapping = surf::query_parser::map_to_ids(id_mapping,
163 |                                             qry_text,true,surf_req->int_qry);
164 |                 if(std::get<0>(qry_mapping)) {
165 |                     auto qid = std::get<1>(qry_mapping);
166 |                     auto qry_ids = std::get<2>(qry_mapping);
167 |                     parsed_query = surf::phrase_parser::phrase_segmentation(index.m_csa,qry_ids,reverse_mapping,
168 |                                                                            surf_req->phrase_threshold);
169 |                     std::get<0>(parsed_query) = qid;
170 |                     parse_ok = true;
171 |                 }
172 | #endif
173 |             } else {
174 |                 auto qry = surf::query_parser::parse_query(term_map,
175 |                                             qry_text,
176 |                                             true,
177 |                                             surf_req->int_qry);
178 |                 if(qry.first) {
179 |                     parsed_query = qry.second;
180 |                     parse_ok = true;
181 |                 }
182 |             }
183 | 
184 |             if(!parse_ok) {
185 |                 // error parsing the qry. send back error
186 |                 if(surf_req->output_results) {
187 |                     /* In this mode the client decodes the reply as surf_results,
188 |                        so an empty result list is sent; it has the same layout
189 |                        the result branch below produces for zero results. */
190 |                     zmq::message_t reply (sizeof(uint64_t));
191 |                     surf_results* sr = (surf_results*)(reply.data());
192 |                     sr->size = 0;
193 |                     server.send(reply);
194 |                 } else {
195 |                     /* value-initialised so no uninitialised bytes are sent */
196 |                     surf_time_resp surf_resp = surf_time_resp();
197 |                     surf_resp.status = REQ_PARSE_ERROR;
198 |                     surf_resp.req_id = surf_req->id;
199 |                     zmq::message_t reply (sizeof(surf_time_resp));
200 |                     memcpy(reply.data(),&surf_resp,sizeof(surf_time_resp));
201 |                     server.send(reply);
202 |                 }
203 |                 std::cout << "ERROR IN QUERY PARSING PROCESS. SKIPPING QUERY" << std::endl;
204 |                 continue;
205 |             }
206 | 
207 |             /* (1) read the request flags */
208 |             const bool profile = (surf_req->mode == REQ_MODE_PROFILE);
209 |             const bool ranked_and = (surf_req->type == REQ_TYPE_QRY_AND);
210 | 
211 |             /* (2) query the index */
212 |             auto qry_id = std::get<0>(parsed_query);
213 |             auto qry_tokens = std::get<1>(parsed_query);
214 |             auto search_start = clock::now();
215 |             auto results = index.search(qry_tokens,surf_req->k,ranked_and,profile);
216 |             auto search_stop = clock::now();
217 |             auto search_time = std::chrono::duration_cast<std::chrono::microseconds>(search_stop-search_start);
218 | 
219 |             /* query_time covers parsing and searching, search_time the search only */
220 |             auto qry_stop = clock::now();
221 |             auto query_time = std::chrono::duration_cast<std::chrono::microseconds>(qry_stop-qry_start);
222 | 
223 |             /* (3a) output to qry to console */
224 |             std::cout << "REQ=" << std::left << std::setw(10) << surf_req->id << " "
225 |                       << " k="  << std::setw(5) << surf_req->k
226 |                       << " QID=" << std::setw(5) << qry_id
227 |                       << " TIME=" << std::setw(7) << query_time.count()/1000.0
228 |                       << " AND=" << ranked_and
229 |                       << " PHRASE=" << surf_req->phrases;
230 |             std::cout << " [";
231 |             if(args.load_dictionary) {
232 |                 /* a dictionary was loaded: print the term strings */
233 |                 for(const auto& token : qry_tokens) {
234 |                     if(token.token_ids.size() > 1) {
235 |                         // phrase
236 |                         std::cout << "(";
237 |                         for(const auto& tstr : token.token_strs) {
238 |                             std::cout << tstr << " ";
239 |                         }
240 |                         std::cout << ") ";
241 |                     } else {
242 |                         std::cout << token.token_strs[0] << " ";
243 |                     }
244 |                 }
245 |             } else {
246 |                 /* no dictionary: print the term ids */
247 |                 for(const auto& token : qry_tokens) {
248 |                     if(token.token_ids.size() > 1) {
249 |                         // phrase
250 |                         std::cout << "(";
251 |                         for(const auto tid : token.token_ids) {
252 |                             std::cout << tid << " ";
253 |                         }
254 |                         std::cout << ") ";
255 |                     } else {
256 |                         std::cout << token.token_ids[0] << " ";
257 |                     }
258 |                 }
259 |             }
260 |             std::cout << "]" << std::endl;
261 | 
262 |             /* (3) create answer and send */
263 |             if(!surf_req->output_results) {
264 |                 surf_time_resp surf_resp = surf_time_resp();
265 |                 surf_resp.status = REQ_RESPONE_OK;
266 |                 /* strncpy does not terminate a truncated copy: copy one byte
267 |                    less than the field holds and terminate explicitly */
268 |                 strncpy(surf_resp.index,index_name.c_str(),sizeof(surf_resp.index)-1);
269 |                 surf_resp.index[sizeof(surf_resp.index)-1] = '\0';
270 |                 strncpy(surf_resp.collection,base_name.c_str(),sizeof(surf_resp.collection)-1);
271 |                 surf_resp.collection[sizeof(surf_resp.collection)-1] = '\0';
272 |                 strncpy(surf_resp.ranker,surf_index_t::ranker_type::name().c_str(),sizeof(surf_resp.ranker)-1);
273 |                 surf_resp.ranker[sizeof(surf_resp.ranker)-1] = '\0';
274 |                 surf_resp.req_id = surf_req->id;
275 |                 surf_resp.k = surf_req->k;
276 |                 surf_resp.qry_id = qry_id;
277 |                 surf_resp.qry_len = qry_tokens.size();
278 |                 surf_resp.result_size = results.list.size();
279 |                 surf_resp.qry_time = query_time.count();
280 |                 surf_resp.search_time = search_time.count();
281 |                 surf_resp.wt_search_space = results.wt_search_space;
282 |                 surf_resp.wt_nodes = results.wt_nodes;
283 |                 surf_resp.postings_evaluated = results.postings_evaluated;
284 |                 surf_resp.postings_total = results.postings_total;
285 | 
286 |                 zmq::message_t reply (sizeof(surf_time_resp));
287 |                 memcpy(reply.data(),&surf_resp,sizeof(surf_time_resp));
288 |                 server.send (reply);
289 |             } else {
290 |                 /* one uint64_t for the count, then two doubles (doc id, score) per result */
291 |                 size_t res_size = results.list.size()*2*sizeof(double) + sizeof(uint64_t);
292 |                 zmq::message_t zmq_results (res_size);
293 |                 surf_results* sr = (surf_results*)(zmq_results.data());
294 |                 sr->size = results.list.size();
295 |                 for(size_t i=0;i<results.list.size();i++) {
296 |                     sr->data[i*2] = results.list[i].doc_id;
297 |                     sr->data[i*2+1] = results.list[i].score;
298 |                 }
299 |                 server.send (zmq_results);
300 |             }
301 |         }
302 |     }
303 | 
304 | 
305 |     return EXIT_SUCCESS;
306 | }
262 | 


--------------------------------------------------------------------------------
/src/surf_index.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "sdsl/config.hpp"
 3 | #include "surf/indexes.hpp"
 4 | #include "surf/util.hpp"
 5 | 
 6 | /* Options read from the command line. */
 7 | struct cmdargs {
 8 |     std::string collection_dir; /* argument of -c */
 9 |     bool print_memusage;        /* switched on by -m */
10 | };
11 | using cmdargs_t = cmdargs;
10 | 
11 | /*
12 |  * Write the option summary of the index builder to stdout.
13 |  * program: text shown as the command name; parse_args passes argv[0].
14 |  */
15 | void
16 | print_usage(char* program)
17 | {
18 |     static const char* const option_help =
19 |         "where\n"
20 |         "  -c <collection directory>  : the directory the collection is stored.\n"
21 |         "  -m : print memory usage.\n";
22 | 
23 |     fprintf(stdout,"%s -c <collection directory> -m\n",program);
24 |     fputs(option_help,stdout);
25 | }
19 | 
20 | /*
21 |  * Collect the index builder options from the command line using getopt.
22 |  * Memory usage printing is off unless -m is given. Unknown options only
23 |  * print the usage text. Without a collection directory an error and the
24 |  * usage text are printed and the process terminates with EXIT_FAILURE.
25 |  */
26 | cmdargs_t
27 | parse_args(int argc,char* const argv[])
28 | {
29 |     cmdargs_t parsed;
30 |     parsed.collection_dir = "";
31 |     parsed.print_memusage = false;
32 | 
33 |     int opt;
34 |     while ((opt=getopt(argc,argv,"c:m")) != -1) {
35 |         if (opt == 'c') {
36 |             parsed.collection_dir = optarg;
37 |         } else if (opt == 'm') {
38 |             parsed.print_memusage = true;
39 |         } else {
40 |             /* '?' or any other unexpected value */
41 |             print_usage(argv[0]);
42 |         }
43 |     }
44 | 
45 |     if (parsed.collection_dir.empty()) {
46 |         std::cerr << "Missing command line parameters.\n";
47 |         print_usage(argv[0]);
48 |         exit(EXIT_FAILURE);
49 |     }
50 |     return parsed;
51 | }
47 | 
48 | /*
49 |  * Build the index of type INDEX_TYPE for a collection, report the build
50 |  * time, write an HTML visualisation of the space usage into the index/
51 |  * directory of the collection and, with -m, call index.mem_info().
52 |  */
53 | int main(int argc,char* const argv[])
54 | {
55 |     using clock = std::chrono::high_resolution_clock;
56 |     /* parse command line */
57 |     cmdargs_t args = parse_args(argc,argv);
58 | 
59 |     /* parse repo */
60 |     sdsl::cache_config cc = surf::parse_collection(args.collection_dir);
61 |     std::cout<<"parse collections"<<std::endl;
62 |     /* iterate by reference: no copy of every map entry just to print it */
63 |     for(const auto& entry : cc.file_map){
64 |         std::cout<<entry.first<<" "<<entry.second<<std::endl;
65 |     }
66 | 
67 |     /* define types */
68 |     using surf_index_t = INDEX_TYPE;
69 |     const std::string index_name = IDXNAME;
70 | 
71 |     /* build the index */
72 |     surf_index_t index;
73 |     auto build_start = clock::now();
74 |     construct(index, "", cc, 0);
75 |     auto build_stop = clock::now();
76 |     auto build_time_sec = std::chrono::duration_cast<std::chrono::seconds>(build_stop-build_start);
77 |     std::cout << "Index built in " << build_time_sec.count() << " seconds." << std::endl;
78 | 
79 |     /* visualize space usage */
80 |     index.load(cc);
81 |     std::cout<<"Write structure"<<std::endl;
82 |     const std::string usage_file = args.collection_dir+"/index/"+surf::SPACEUSAGE_FILENAME+"_"+index_name+".html";
83 |     std::ofstream vofs(usage_file);
84 |     if(vofs) {
85 |         write_structure<HTML_FORMAT>(index,vofs);
86 |     } else {
87 |         /* report a failed open instead of silently producing no file */
88 |         std::cerr << "Could not open '" << usage_file << "' for writing." << std::endl;
89 |     }
90 | 
91 |     /* print mem usage */
92 |     if(args.print_memusage) {
93 |         index.mem_info();
94 |     }
95 | 
96 |     return EXIT_SUCCESS;
97 | }
86 | 


--------------------------------------------------------------------------------
/src/surf_query.cpp:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdlib.h>
  3 | #include <iostream>
  4 | #include <fstream>
  5 | #include <iomanip>
  6 | #include <ctime>
  7 | #include <chrono>
  8 | 
  9 | #include <sys/types.h>
 10 | #include <sys/stat.h>
 11 | #include <unistd.h>
 12 | #include "surf/comm.hpp"
 13 | #include "surf/util.hpp"
 14 | #include "surf/query_parser.hpp"
 15 | 
 16 | #include "zmq.hpp"
 17 | 
 18 | /* Options read from the command line. */
 19 | struct cmdargs {
 20 |     std::string host;           /* argument of -h */
 21 |     std::string query_file;     /* argument of -q */
 22 |     uint64_t k;                 /* argument of -k */
 23 |     uint64_t runs;              /* argument of -r */
 24 |     bool profile;               /* switched on by -p */
 25 |     bool quit;                  /* switched on by -s */
 26 |     bool ranked_and;            /* switched on by -a */
 27 |     bool phrases;               /* switched on by -P */
 28 |     double phrase_threshold;    /* argument of -P */
 29 |     bool output_results;        /* switched on by -R */
 30 |     bool integer_mode;          /* switched on by -i */
 31 |     std::string collection_dir; /* argument of -i */
 32 | };
 33 | using cmdargs_t = cmdargs;
 32 | 
 33 | /*
 34 |  * Write the option summary of the query client to stdout.
 35 |  * program: text shown as the command name; parse_args passes argv[0].
 36 |  */
 37 | void
 38 | print_usage(char* program)
 39 | {
 40 |     fprintf(stdout,"%s -h <host> -q <query file> -k <top-k> -r <runs> -p -P <thres> -s -a -R -i <collection>\n",program);
 41 |     fprintf(stdout,"where\n");
 42 |     fprintf(stdout,"  -h <host>  : host of the daemon.\n");
 43 |     fprintf(stdout,"  -q <query file>  : the queries to be performed.\n");
 44 |     fprintf(stdout,"  -k <top-k>  : the top-k documents to be retrieved for each query.\n");
 45 |     fprintf(stdout,"  -r <runs>  : the number of runs.\n");
 46 |     fprintf(stdout,"  -R : output results only\n");
 47 |     fprintf(stdout,"  -p : run queries in profile mode.\n");
 48 |     fprintf(stdout,"  -P <thres> : run queries with phrase parsing enabled and threshold <thres>.\n");
 49 |     fprintf(stdout,"  -s : stop the daemon after queries are processed.\n");
 50 |     fprintf(stdout,"  -a : perform ranked AND instead of ranked OR.\n");
 51 |     /* -i takes an argument (getopt string "i:"), so the help line names it */
 52 |     fprintf(stdout,"  -i <collection> : perform dict lookup at the client from <collection>.\n");
 53 | }
 49 | 
 50 | /*
 51 |  * Collect the client options from the command line using getopt.
 52 |  * Defaults: host "127.0.0.1:12345", k = 10, runs = 3, every switch off,
 53 |  * phrase threshold 0. Unknown options only print the usage text. Without
 54 |  * a query file an error and the usage text are printed and the process
 55 |  * terminates with EXIT_FAILURE.
 56 |  */
 57 | cmdargs_t
 58 | parse_args(int argc,char* const argv[])
 59 | {
 60 |     cmdargs_t parsed;
 61 |     parsed.host = "127.0.0.1:12345";
 62 |     parsed.query_file = "";
 63 |     parsed.k = 10;
 64 |     parsed.runs = 3;
 65 |     parsed.profile = false;
 66 |     parsed.quit = false;
 67 |     parsed.ranked_and = false;
 68 |     parsed.phrases = false;
 69 |     parsed.phrase_threshold = 0.0;
 70 |     parsed.output_results = false;
 71 |     parsed.integer_mode = false;
 72 | 
 73 |     int opt;
 74 |     while ((opt=getopt(argc,argv,"r:h:q:k:psaP:Ri:")) != -1) {
 75 |         switch (opt) {
 76 |             /* options with a value */
 77 |             case 'h': parsed.host = optarg; break;
 78 |             case 'q': parsed.query_file = optarg; break;
 79 |             case 'k': parsed.k = std::strtoul(optarg,nullptr,10); break;
 80 |             case 'r': parsed.runs = std::strtoul(optarg,nullptr,10); break;
 81 |             case 'P':
 82 |                 parsed.phrases = true;
 83 |                 parsed.phrase_threshold = std::strtod(optarg,nullptr);
 84 |                 break;
 85 |             case 'i':
 86 |                 parsed.integer_mode = true;
 87 |                 parsed.collection_dir = optarg;
 88 |                 break;
 89 |             /* plain switches */
 90 |             case 'p': parsed.profile = true; break;
 91 |             case 's': parsed.quit = true; break;
 92 |             case 'a': parsed.ranked_and = true; break;
 93 |             case 'R': parsed.output_results = true; break;
 94 |             /* '?' or any other unexpected value */
 95 |             default:
 96 |                 print_usage(argv[0]);
 97 |         }
 98 |     }
 99 | 
100 |     if (parsed.query_file.empty()) {
101 |         std::cerr << "Missing command line parameters.\n";
102 |         print_usage(argv[0]);
103 |         exit(EXIT_FAILURE);
104 |     }
105 |     return parsed;
106 | }
112 | 
113 | /*
114 |  * Client of the surf daemon.
115 |  *
116 |  * Reads the queries of the query file (one per line), optionally maps
117 |  * their terms to ids with the dictionary of a collection (-i), and sends
118 |  * every query <runs> times to the daemon. Without -R one
119 |  * semicolon-separated line per answered query is written to stdout; with
120 |  * -R the returned pairs of values are listed instead. With -s a quit
121 |  * request is sent to the daemon at the end.
122 |  */
123 | int main(int argc,char* const argv[])
124 | {
125 |     using clock = std::chrono::high_resolution_clock;
126 | 
127 |     /* parse command line */
128 |     cmdargs_t args = parse_args(argc,argv);
129 | 
130 |     /* load queries from disk */
131 |     std::cerr << "Loading queries from disk." << std::endl;
132 |     std::ifstream qfs(args.query_file);
133 |     if(!qfs) {
134 |         /* only a warning: -s can still stop the daemon without any queries */
135 |         std::cerr << "WARNING: cannot open query file '" << args.query_file << "'." << std::endl;
136 |     }
137 |     std::string qry_str;
138 |     std::vector<std::string> queries;
139 |     while(std::getline(qfs,qry_str)) {
140 |         /* lines of MAX_QRY_LEN or more characters are dropped */
141 |         if(qry_str.size() < MAX_QRY_LEN) {
142 |             queries.push_back(qry_str);
143 |         }
144 |     }
145 | 
146 |     if(args.integer_mode) {
147 |         surf::parse_collection(args.collection_dir); // makes sure dir is valid
148 |         std::cout << "Loading dictionary and creating term map." << std::endl;
149 |         auto term_map = surf::query_parser::load_dictionary(args.collection_dir);
150 |         const auto& id_mapping = term_map.first;
151 |         std::vector<std::string> mapped_queries;
152 |         for(const auto& query: queries) {
153 |             auto qry_mapping = surf::query_parser::map_to_ids(id_mapping,query,true,false);
154 |             if(!std::get<0>(qry_mapping)) {
155 |                 continue;
156 |             }
157 |             auto qid = std::get<1>(qry_mapping);
158 |             const auto& qry_ids = std::get<2>(qry_mapping);
159 |             if(qry_ids.size() == 0) {
160 |                 /* nothing to send; size()-1 and back() are invalid on an empty list */
161 |                 continue;
162 |             }
163 |             /* "<qid>;<id> <id> ... <id>" without a trailing blank */
164 |             std::string new_qry_str = std::to_string(qid) + ";";
165 |             for(size_t i=0;i<qry_ids.size();i++) {
166 |                 if(i != 0) {
167 |                     new_qry_str += " ";
168 |                 }
169 |                 new_qry_str += std::to_string(qry_ids[i]);
170 |             }
171 |             /* the id form can be longer than the text form: apply the length limit again */
172 |             if(new_qry_str.size() < MAX_QRY_LEN) {
173 |                 mapped_queries.push_back(new_qry_str);
174 |             } else {
175 |                 std::cerr << "Skipping query " << qid << ": too long after id mapping." << std::endl;
176 |             }
177 |         }
178 |         queries.swap(mapped_queries); /* no copy needed */
179 |     }
180 | 
181 |     /* zmq magic! */
182 |     std::cerr << "Connecting to surf daemon." << std::endl;
183 |     zmq::context_t context (1);
184 |     zmq::socket_t socket (context, ZMQ_REQ);
185 |     socket.connect (std::string("tcp://"+args.host).c_str());
186 |     if(!socket.connected()) {
187 |         std::cerr << "Error connecting to daemon." << std::endl;
188 |         return EXIT_FAILURE;
189 |     }
190 | 
191 |     /* printed in the output line: +1 for ranked AND, +2 for phrases */
192 |     uint8_t qry_mode = 0;
193 |     if(args.ranked_and) {
194 |         qry_mode += 1;
195 |     }
196 |     if(args.phrases) {
197 |         qry_mode += 2;
198 |     }
199 | 
200 |     /* process the queries */
201 |     std::cerr << "Processing queries..." << std::endl;
202 |     size_t num_runs = args.runs;
203 |     for(size_t i=0;i<num_runs;i++) {
204 |         for(const auto& query: queries) {
205 | 
206 |             auto req_start = clock::now();
207 | 
208 |             /* value-initialised so no uninitialised bytes go on the wire */
209 |             surf_qry_request surf_req = surf_qry_request();
210 |             surf_req.type = REQ_TYPE_QRY_OR;
211 |             if(args.ranked_and) {
212 |                 surf_req.type = REQ_TYPE_QRY_AND;
213 |             }
214 |             surf_req.int_qry = args.integer_mode ? 1 : 0;
215 |             surf_req.phrases = args.phrases ? 1 : 0;
216 |             surf_req.phrase_threshold = args.phrases ? args.phrase_threshold : 0.0;
217 |             if(args.profile) {
218 |                 surf_req.mode = REQ_MODE_PROFILE;
219 |             } else {
220 |                 surf_req.mode = REQ_MODE_TIME;
221 |             }
222 |             surf_req.output_results = args.output_results ? 1 : 0;
223 |             surf_req.id = rand();
224 |             surf_req.k = args.k;
225 | 
226 |             /* Bounded copy plus explicit terminator: the daemon reads the
227 |                query as a C string. NOTE(review): assumes qry_str is a char
228 |                array member of surf_qry_request - confirm in surf/comm.hpp. */
229 |             const size_t max_len = sizeof(surf_req.qry_str)-1;
230 |             const size_t qry_len = query.size() < max_len ? query.size() : max_len;
231 |             memcpy(surf_req.qry_str,query.data(),qry_len);
232 |             surf_req.qry_str[qry_len] = '\0';
233 | 
234 |             zmq::message_t request(sizeof(surf_qry_request));
235 |             memcpy ((void *) request.data (), &surf_req, sizeof(surf_qry_request));
236 |             socket.send (request);
237 | 
238 |             /* wait for reply */
239 |             zmq::message_t reply;
240 |             socket.recv (&reply);
241 | 
242 |             if(!args.output_results) {
243 |                 /* do not decode a reply that is too short to be a surf_time_resp */
244 |                 if(reply.size() < sizeof(surf_time_resp)) {
245 |                     std::cerr << "ERROR: reply too short for query '" << query << "'" << std::endl;
246 |                     continue;
247 |                 }
248 |                 surf_time_resp* surf_resp = static_cast<surf_time_resp*>(reply.data());
249 | 
250 |                 auto req_stop = clock::now();
251 |                 auto req_time = std::chrono::duration_cast<std::chrono::microseconds>(req_stop-req_start);
252 | 
253 |                 if(surf_resp->req_id != surf_req.id) {
254 |                     std::cerr << "ERROR: got response for wrong request id!" << std::endl;
255 |                 }
256 | 
257 |                 if(surf_resp->status != REQ_PARSE_ERROR) {
258 |                     /* output */
259 |                     std::cout << surf_resp->qry_id << ";"
260 |                               << surf_resp->collection << ";"
261 |                               << surf_resp->ranker << ";"
262 |                               << surf_resp->index << ";"
263 |                               << (int)qry_mode << ";"
264 |                               << surf_resp->k << ";"
265 |                               << surf_resp->qry_len << ";"
266 |                               << surf_resp->result_size << ";"
267 |                               << surf_resp->qry_time << ";"
268 |                               << surf_resp->search_time << ";"
269 |                               << surf_resp->wt_search_space << ";"
270 |                               << surf_resp->wt_nodes << ";"
271 |                               << surf_resp->postings_evaluated << ";"
272 |                               << surf_resp->postings_total << ";"
273 |                               << req_time.count() << std::endl;
274 |                 } else {
275 |                     std::cerr << "Error processing query '" << query << "'" << std::endl;
276 |                 }
277 |             } else {
278 |                 /* Expected layout: one uint64_t count, then two doubles per
279 |                    result. NOTE(review): layout of surf_results is assumed
280 |                    from how it is indexed here - confirm in surf/comm.hpp. */
281 |                 if(reply.size() < sizeof(uint64_t)) {
282 |                     std::cerr << "ERROR: reply too short for query '" << query << "'" << std::endl;
283 |                     continue;
284 |                 }
285 |                 surf_results* sr = (surf_results*)(reply.data());
286 |                 /* never read more pairs than the message actually contains */
287 |                 const size_t available = (reply.size()-sizeof(uint64_t))/(2*sizeof(double));
288 |                 const size_t num_results = sr->size < available ? sr->size : available;
289 |                 for(size_t j=0;j<num_results;j++) {
290 |                     std::cout << "(" << j+1 << ") : "
291 |                               << (uint64_t)sr->data[j*2]
292 |                               << " - "
293 |                               << sr->data[j*2+1] << std::endl;
294 |                 }
295 |             }
296 |         }
297 |     }
298 | 
299 |     // stop the daemon
300 |     if(args.quit) {
301 |         /* value-initialised: only the type matters, the rest is sent as zeros */
302 |         surf_qry_request surf_req = surf_qry_request();
303 |         surf_req.type = REQ_TYPE_QUIT;
304 |         zmq::message_t request(sizeof(surf_qry_request));
305 |         memcpy ((void *) request.data (), &surf_req, sizeof(surf_qry_request));
306 |         socket.send (request);
307 |     }
308 | 
309 | 
310 |     return EXIT_SUCCESS;
311 | }
273 | 


--------------------------------------------------------------------------------
/src/surf_search.cpp:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdlib.h>
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <ctime>
  6 | 
  7 | #include <sys/types.h>
  8 | #include <sys/stat.h>
  9 | #include <unistd.h>
 10 | #include "surf/query.hpp"
 11 | #include "sdsl/config.hpp"
 12 | #include "surf/indexes.hpp"
 13 | #include "surf/query_parser.hpp"
 14 | 
 15 | typedef struct cmdargs {
 16 |     std::string collection_dir;
 17 |     std::string query_file;
 18 |     uint64_t k;
 19 | } cmdargs_t;
 20 | 
 21 | void
 22 | print_usage(char* program)
 23 | {
 24 |     fprintf(stdout,"%s -c <collection directory> -q <query file> -k <top-k> -o <output.csv>\n",program);
 25 |     fprintf(stdout,"where\n");
 26 |     fprintf(stdout,"  -c <collection directory>  : the directory the collection is stored.\n");
 27 |     fprintf(stdout,"  -q <query file>  : the queries to be performed.\n");
 28 |     fprintf(stdout,"  -k <top-k>  : the top-k documents to be retrieved for each query.\n");
 29 | };
 30 | 
 31 | cmdargs_t
 32 | parse_args(int argc,char* const argv[])
 33 | {
 34 |     cmdargs_t args;
 35 |     int op;
 36 |     args.collection_dir = "";
 37 |     args.query_file = "";
 38 |     args.k = 10;
 39 |     while ((op=getopt(argc,argv,"c:q:k:")) != -1) {
 40 |         switch (op) {
 41 |             case 'c':
 42 |                 args.collection_dir = optarg;
 43 |                 break;
 44 |             case 'q':
 45 |                 args.query_file = optarg;
 46 |                 break;
 47 |             case 'k':
 48 |                 args.k = std::strtoul(optarg,NULL,10);
 49 |                 break;
 50 |             case '?':
 51 |             default:
 52 |                 print_usage(argv[0]);
 53 |         }
 54 |     }
 55 |     if (args.collection_dir==""||args.query_file=="") {
 56 |         std::cerr << "Missing command line parameters.\n";
 57 |         print_usage(argv[0]);
 58 |         exit(EXIT_FAILURE);
 59 |     }
 60 |     return args;
 61 | }
 62 | 
 63 | int main(int argc,char* const argv[])
 64 | {
 65 |     using clock = std::chrono::high_resolution_clock;
 66 |     /* parse command line */
 67 |     cmdargs_t args = parse_args(argc,argv);
 68 | 
 69 |     /* parse repo */
 70 |     auto cc = surf::parse_collection(args.collection_dir);
 71 | 
 72 |     /* parse queries */
 73 |     std::cout << "Parsing query file '" << args.query_file << "'" << std::endl;
 74 |     auto queries = surf::query_parser::parse_queries(args.collection_dir,args.query_file);
 75 |     std::cout << "Found " << queries.size() << " queries." << std::endl;
 76 | 
 77 |     /* define types */
 78 |     using surf_index_t = INDEX_TYPE;
 79 |     std::string index_name = IDXNAME;
 80 | 
 81 |     /* load the index */
 82 |     surf_index_t index;
 83 |     auto load_start = clock::now();
 84 |     construct(index, "", cc, 0);
 85 |     index.load(cc);
 86 |     auto load_stop = clock::now();
 87 |     auto load_time_sec = std::chrono::duration_cast<std::chrono::seconds>(load_stop-load_start);
 88 |     std::cout << "Index loaded in " << load_time_sec.count() << " seconds." << std::endl;
 89 | 
 90 |     /* process the queries */
 91 |     std::map<uint64_t,std::chrono::microseconds> query_times;
 92 |     std::map<uint64_t,surf::result> query_results;
 93 |     std::map<uint64_t,uint64_t> query_lengths;
 94 | 
 95 |     size_t num_runs = 1;
 96 |     for(size_t i=0;i<num_runs;i++) {
 97 |         for(const auto& query: queries) {
 98 |             auto id = std::get<0>(query);
 99 |             auto qry_tokens = std::get<1>(query);
100 |             std::cout << "[" << id << "] |Q|=" << qry_tokens.size(); std::cout.flush();
101 | 
102 |             // run the query
103 |             auto qry_start = clock::now();
104 |             auto results = index.search(qry_tokens,args.k);
105 |             auto qry_stop = clock::now();
106 | 
107 |             auto query_time = std::chrono::duration_cast<std::chrono::microseconds>(qry_stop-qry_start);
108 |             std::cout << " TIME = " << std::setprecision(5)
109 |                       << query_time.count() / 1000.0 
110 |                       << " ms" << std::endl;
111 | 
112 |             auto itr = query_times.find(id);
113 |             if(itr != query_times.end()) {
114 |                 itr->second += query_time;
115 |             } else {
116 |                 query_times[id] = query_time;
117 |             }
118 | 
119 |             if(i==0) {
120 |                 query_results[id] = results;
121 |                 query_lengths[id] = qry_tokens.size();
122 |             }
123 |         }
124 |     }
125 | 
126 |     /* output results to csv */
127 |     char time_buffer [80] = {0};
128 |     std::time_t t = std::time(NULL);
129 |     auto timeinfo = localtime (&t);
130 |     strftime (time_buffer,80,"%F-%H:%M:%S",timeinfo);
131 |     std::string time_output_file = args.collection_dir + "/results/" 
132 |                    + "surf-timings-" + index_name + "-k" + std::to_string(args.k) 
133 |                    + "-" + std::string(time_buffer) + ".csv";
134 |     std::string res_output_file = args.collection_dir + "/results/" 
135 |                    + "surf-results-" + index_name + "-k" + std::to_string(args.k) 
136 |                    + "-" + std::string(time_buffer) + ".csv";
137 | 
138 |     /* calc average */
139 |     for(auto& timing : query_times) {
140 |         timing.second = timing.second / num_runs;
141 |     }
142 | 
143 |     /* output */
144 |     {
145 |         std::cout << "Writing timing results to '" << time_output_file << "'" << std::endl;     
146 |         std::ofstream resfs(time_output_file);
147 |         if(resfs.is_open()) {
148 |             resfs << "id;index;k;num_terms;time_ms" << std::endl;
149 |             for(const auto& timing: query_times) {
150 |                 auto qry_id = timing.first;
151 |                 auto qry_time = timing.second;
152 |                 resfs << qry_id << ";" << index_name << ";" << args.k << ";"
153 |                           << query_lengths[qry_id] << ";"
154 |                           << qry_time.count() / 1000.0  << "\n"; 
155 |             }
156 |         } else {
157 |             perror("could not output results to file.");
158 |         }
159 |         std::cout << "Writing result listing to '" << res_output_file << "'" << std::endl;
160 |         std::ofstream res_outfs(res_output_file);
161 |         if(res_outfs.is_open()) {
162 |             res_outfs << "id;rank;docid;score" << std::endl;
163 |             for(const auto& result: query_results) {
164 |                 auto qry_id = result.first;
165 |                 auto qry_res = result.second.list;
166 |                 for(size_t i=1;i<=qry_res.size();i++) {
167 |                     res_outfs << qry_id << ";" 
168 |                               << i  << ";" 
169 |                               << qry_res[i-1].doc_id << ";" 
170 |                               << qry_res[i-1].score << "\n"; 
171 |                 }
172 |             }
173 |         } else {
174 |             perror("could not output results to file.");
175 |         }
176 |     }
177 | 
178 | 
179 |     return EXIT_SUCCESS;
180 | }
181 | 


--------------------------------------------------------------------------------
/src/surf_trec.cpp:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <stdlib.h>
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <ctime>
  6 | 
  7 | #include <sys/types.h>
  8 | #include <sys/stat.h>
  9 | #include <unistd.h>
 10 | #include "surf/query.hpp"
 11 | #include "sdsl/config.hpp"
 12 | #include "surf/indexes.hpp"
 13 | #include "surf/query_parser.hpp"
 14 | 
 15 | typedef struct cmdargs {
 16 |     std::string collection_dir;
 17 |     std::string query_file;
 18 |     std::string output_file;
 19 |     uint64_t k;
 20 | } cmdargs_t;
 21 | 
 22 | void
 23 | print_usage(char* program)
 24 | {
 25 |     fprintf(stdout,"%s -c <collection directory> -q <query file> -k <top-k> -o <output.csv>\n",program);
 26 |     fprintf(stdout,"where\n");
 27 |     fprintf(stdout,"  -c <collection directory>  : the directory the collection is stored.\n");
 28 |     fprintf(stdout,"  -q <query file>  : the queries to be performed.\n");
 29 |     fprintf(stdout,"  -k <top-k>  : the top-k documents to be retrieved for each query.\n");
 30 |     fprintf(stdout,"  -o <output.csv>  : output results to file in csv format.\n");
 31 | };
 32 | 
 33 | cmdargs_t
 34 | parse_args(int argc,char* const argv[])
 35 | {
 36 |     cmdargs_t args;
 37 |     int op;
 38 |     args.collection_dir = "";
 39 |     args.query_file = "";
 40 |     args.output_file = "";
 41 |     args.k = 10;
 42 |     while ((op=getopt(argc,argv,"c:q:k:o:")) != -1) {
 43 |         switch (op) {
 44 |             case 'c':
 45 |                 args.collection_dir = optarg;
 46 |                 break;
 47 |             case 'q':
 48 |                 args.query_file = optarg;
 49 |                 break;
 50 |             case 'o':
 51 |                 args.output_file = optarg;
 52 |                 break;
 53 |             case 'k':
 54 |                 args.k = std::strtoul(optarg,NULL,10);
 55 |                 break;
 56 |             case '?':
 57 |             default:
 58 |                 print_usage(argv[0]);
 59 |         }
 60 |     }
 61 |     if (args.collection_dir==""||args.query_file=="") {
 62 |         std::cerr << "Missing command line parameters.\n";
 63 |         print_usage(argv[0]);
 64 |         exit(EXIT_FAILURE);
 65 |     }
 66 |     return args;
 67 | }
 68 | 
 69 | int main(int argc,char* const argv[])
 70 | {
 71 |     using clock = std::chrono::high_resolution_clock;
 72 |     /* parse command line */
 73 |     cmdargs_t args = parse_args(argc,argv);
 74 | 
 75 |     /* parse repo */
 76 |     auto cc = surf::parse_collection(args.collection_dir);
 77 | 
 78 |     /* parse queries */
 79 |     std::cout << "Parsing query file '" << args.query_file << "'" << std::endl;
 80 |     auto queries = surf::query_parser::parse_queries(args.collection_dir,args.query_file);
 81 |     std::cout << "Found " << queries.size() << " queries." << std::endl;
 82 | 
 83 |     /* define types */
 84 |     using surf_index_t = INDEX_TYPE;
 85 |     std::string index_name = IDXNAME;
 86 | 
 87 |     /* load the index */
 88 |     surf_index_t index;
 89 |     auto load_start = clock::now();
 90 |     construct(index, "", cc, 0);
 91 |     index.load(cc);
 92 |     auto load_stop = clock::now();
 93 |     auto load_time_sec = std::chrono::duration_cast<std::chrono::seconds>(load_stop-load_start);
 94 |     std::cout << "Index loaded in " << load_time_sec.count() << " seconds." << std::endl;
 95 | 
 96 |     /* process the queries */
 97 |     std::map<uint64_t,surf::result_t> query_results;
 98 | 
 99 |     for(const auto& query: queries) {
100 |         auto id = std::get<0>(query);
101 |         auto qry_tokens = std::get<1>(query);
102 |         std::cout << "[" << id << "] |Q|=" << qry_tokens.size(); std::cout.flush();
103 | 
104 |         // run the query
105 |         auto qry_start = clock::now();
106 |         auto results = index.search(qry_tokens,args.k);
107 |         auto qry_stop = clock::now();
108 | 
109 |         auto query_time = std::chrono::duration_cast<std::chrono::microseconds>(qry_stop-qry_start);
110 |         std::cout << " TIME = " << std::setprecision(5)
111 |                   << query_time.count() / 1000.0 
112 |                   << " ms" << std::endl;
113 | 
114 |         query_results[id] = results;
115 |     }
116 |     /* output results to csv */
117 |     std::string output_file = args.output_file;
118 |     if(output_file.empty()) {
119 |         char time_buffer [80] = {0};
120 |         std::time_t t = std::time(NULL);
121 |         auto timeinfo = localtime (&t);
122 |         strftime (time_buffer,80,"%F-%H:%M:%S",timeinfo);
123 |         output_file = "surf-timings-" + index_name + "-k" + std::to_string(args.k) 
124 |                        + "-" + std::string(time_buffer) + ".trec";
125 |     }
126 |     std::cout << "Writing timing results to '" << output_file << "'" << std::endl;
127 | 
128 |     /* output */
129 |     {
130 |     	/* load the url mapping */
131 | 
132 |         std::ofstream resfs(output_file);
133 |         if(resfs.is_open()) {
134 |             for(const auto& res: query_results) {
135 |                 for (size_t j=0; j<output_k; j++) {
136 |                     of << res.first << "\t"
137 |                        << "Q0" << "\t"
138 |                        << index.docmap.name(std::get<0>(res.list[j])) << "\t"
139 |                        << j                                           << "\t"
140 |                        << std::get<1>(res.list[j])                    << "\t"
141 |                        << index_name                          << std::endl;
142 |                 }
143 |             }
144 |         } else {
145 |             perror("could not output results to file.");
146 |         }
147 |     }
148 | 
149 | 
150 |     return EXIT_SUCCESS;
151 | }
152 | 


--------------------------------------------------------------------------------
/src/test.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include "sdsl/int_vector_buffer.hpp"
 3 | 
 4 | using namespace sdsl;
 5 | using namespace std;
 6 | 
 7 | int main(int argc, char* argv[]){
 8 |     if ( argc < 2 ){
 9 |         cout << "./" << argv[0] << " file" << endl;
10 |         cout << "file has to contain a serialized sdsl::int_vector<>" << endl;
11 |         cout << "Program outputs the size of elements and the width per element" << endl;
12 |         return 1;
13 |     }
14 |     int_vector_buffer<> ivb(argv[1]);
15 |     cout << ivb.size() << " " << (int) ivb.width() << endl;
16 | }
17 | 


--------------------------------------------------------------------------------
/src/test_postings_list.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <vector>
 3 | #include <iostream>
 4 | 
 5 | #include "surf/block_postings_list.hpp"
 6 | 
 7 | int main( int argc, char** argv ) {
 8 |     using plist_type = surf::block_postings_list<128>;
 9 | 
10 |     // test small uncompressed lists 
11 |     for(size_t i=0;i<500;i++) {
12 |         size_t n = 1 + rand()%20;
13 |         std::vector< std::pair<uint64_t,uint64_t> > A;
14 |         uint64_t cur_id = rand()%5000;
15 |         for(size_t j=0;j<n;j++) {
16 |             cur_id += rand()%5000;
17 |             uint64_t cur_freq = 1 + rand() % 50;
18 |             A.emplace_back(cur_id,cur_freq);
19 |         }
20 |         plist_type pl(A);
21 | 
22 |         auto itr = pl.begin();
23 |         auto end = pl.end();
24 |         size_t j=0;
25 |         while( itr != end) {
26 |             auto id = itr.docid();
27 |             auto freq = itr.freq();
28 |             if(id != A[j].first && freq != A[j].second) {
29 |                 std::cerr << "ERROR: uncompressed list";
30 |             }
31 |             j++;
32 |             ++itr;
33 |         }
34 |     }
35 | 
36 |     // test larger compressed lists 
37 |     for(size_t i=0;i<500;i++) {
38 |         size_t n = 1 + rand()%20000;
39 |         std::vector< std::pair<uint64_t,uint64_t> > A;
40 |         uint64_t cur_id = rand()%500;
41 |         for(size_t j=0;j<n;j++) {
42 |             cur_id += rand()%500;
43 |             uint64_t cur_freq = 1 + rand() % 50;
44 |             A.emplace_back(cur_id,cur_freq);
45 |         }
46 |         plist_type pl(A);
47 | 
48 |         auto itr = pl.begin();
49 |         auto end = pl.end();
50 |         size_t j=0;
51 |         while( itr != end) {
52 |             auto id = itr.docid();
53 |             auto freq = itr.freq();
54 |             if(id != A[j].first && freq != A[j].second) {
55 |                 std::cerr << "ERROR: uncompressed list";
56 |             }
57 |             j++;
58 |             ++itr;
59 |         }
60 |     }
61 | 
62 | }
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/tools/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | # CONFIG REQUIRED: SET THIS VARIABLE TO THE PATH OF THE INDRI SOURCE CODE
 3 | INDRISRC=/home/mpetri/dev/collection-parser/indri-5.6
 4 | # CONFIG REQUIRED: SET THIS TO THE SDSL INSTALL PATH
 5 | SDSLPREFIX=../build/external/sdsl-lite/
 6 | 
 7 | include $(INDRISRC)/MakeDefns
 8 | SHARED=
 9 | INCPATH=-I$(INDRISRC)/include $(patsubst %, -I$(INDRISRC)/contrib/%/include, $(DEPENDENCIES)) -I $(SDSLPREFIX)/include/
10 | LIBPATH=-L$(INDRISRC)/obj  $(patsubst %, -L$(INDRISRC)/contrib/%/obj, $(DEPENDENCIES)) -L $(SDSLPREFIX)/lib/
11 | LIBS=-lindri $(patsubst %, -l%, $(DEPENDENCIES)) -lsdsl
12 | 
13 | all: indri
14 | 	$(CXX) $(CXXFLAGS) -std=c++11 indri_to_surf.cpp -o indri_to_surf $(INCPATH) $(LIBPATH) $(LIBS) $(CPPLDFLAGS)
15 | 	$(CXX) $(CXXFLAGS) -std=c++11 indri_stem_krovetz.cpp -o indri_stem_krovetz $(INCPATH) $(LIBPATH) $(LIBS) $(CPPLDFLAGS)
16 | 
17 | indri:
18 | 	cd  $(INDRISRC); ./configure
19 | 	make -C $(INDRISRC) contrib
20 | 
21 | clean:
22 | 	rm -f $(APP)
23 | 


--------------------------------------------------------------------------------
/tools/convert_results_to_trec.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <unistd.h>
  3 | #include <stdlib.h>
  4 | #include <algorithm>
  5 | #include <vector>
  6 | #include <iostream>
  7 | #include <sstream>
  8 | 
  9 | #include "surf/config.hpp"
 10 | #include "surf/util.hpp"
 11 | 
 12 | typedef struct cmdargs {
 13 |     std::string collection_dir;
 14 |     std::string surf_file;
 15 |     std::string trec_file;
 16 | } cmdargs_t;
 17 | 
 18 | void
 19 | print_usage(char* program)
 20 | {
 21 |     fprintf(stdout,"%s -c <collection directory> -q <query file> -r <surf_res.csv> -o <res.trec>\n",program);
 22 |     fprintf(stdout,"where\n");
 23 |     fprintf(stdout,"  -c <collection directory>  : the directory the collection is stored.\n");
 24 |     fprintf(stdout,"  -r <surf results csv>  : the results file produced by surf.\n");
 25 |     fprintf(stdout,"  -o <trec file>  : results converted to trec format.\n");
 26 | };
 27 | 
 28 | cmdargs_t
 29 | parse_args(int argc,char* const argv[])
 30 | {
 31 |     cmdargs_t args;
 32 |     int op;
 33 |     args.collection_dir = "";
 34 |     args.surf_file = "";
 35 |     args.trec_file = "";
 36 |     while ((op=getopt(argc,argv,"c:r:o:")) != -1) {
 37 |         switch (op) {
 38 |             case 'c':
 39 |                 args.collection_dir = optarg;
 40 |                 break;
 41 |             case 'r':
 42 |                 args.surf_file = optarg;
 43 |                 break;
 44 |             case 'o':
 45 |                 args.trec_file = optarg;
 46 |                 break;
 47 |             case '?':
 48 |             default:
 49 |                 print_usage(argv[0]);
 50 |         }
 51 |     }
 52 |     if (args.collection_dir==""||args.surf_file==""||args.trec_file=="") {
 53 |         std::cerr << "Missing command line parameters.\n";
 54 |         print_usage(argv[0]);
 55 |         exit(EXIT_FAILURE);
 56 |     }
 57 |     return args;
 58 | }
 59 | 
 60 | std::vector<std::string>
 61 | tokenize(std::string line) {
 62 |     std::vector<std::string> tokens;
 63 |     size_t pos = 0;
 64 |     std::string token;
 65 |     while ((pos = line.find(";")) != std::string::npos) {
 66 |         token = line.substr(0, pos);
 67 |         tokens.push_back(token);
 68 |         line.erase(0, pos + 1);
 69 |     }
 70 |     tokens.push_back(line);
 71 |     return tokens;
 72 | }
 73 | 
 74 | int main( int argc, char** argv ) {
 75 | 
 76 |     /* parse command line */
 77 |     cmdargs_t args = parse_args(argc,argv);
 78 | 
 79 |     /* parse repo */
 80 |     surf::parse_collection(args.collection_dir);
 81 | 
 82 |     /* load the docnames map */
 83 |     std::unordered_map<uint64_t,std::string> id_mapping;
 84 |     auto docnames_file = args.collection_dir + surf::DOCNAMES_FILENAME;
 85 |     std::ifstream dfs(docnames_file);
 86 |     size_t j=0;
 87 |     std::string name_mapping;
 88 |     while( std::getline(dfs,name_mapping) ) {
 89 |         id_mapping[j] = name_mapping;
 90 |         j++;
 91 |     }
 92 | 
 93 |     std::ofstream trec_out(args.trec_file);
 94 |     std::ifstream surfres_fs(args.surf_file);
 95 |     bool first = true;
 96 |     for(std::string line; std::getline(surfres_fs,line);) {
 97 |         if(first) {
 98 |             first = false;
 99 |             continue;
100 |         }
101 |         auto tokens = tokenize(line);
102 |         auto qry_id = std::strtoul(tokens[0].c_str(),NULL,10);
103 |         auto rank = std::strtoul(tokens[1].c_str(),NULL,10);
104 |         auto doc_id = std::strtoul(tokens[2].c_str(),NULL,10);
105 |         auto doc_score = std::strtod(tokens[3].c_str(),NULL);
106 | 
107 |         trec_out 
108 |             << qry_id << "\t"
109 |             << "Q0" << "\t"
110 |             << id_mapping[doc_id] << "\t"
111 |             << rank               << "\t"
112 |             << doc_score << "\t"
113 |             << "SURF" << std::endl;
114 |     }
115 | }
116 | 
117 | 
118 | 


--------------------------------------------------------------------------------
/tools/create_surf_collection.cpp:
--------------------------------------------------------------------------------
  1 | // creates a small surf collection (integer text in sdsl format, dictionary
  2 | // and document names) from a string of '#'-separated documents.
  3 | #include <iostream>
  4 | 
  5 | #include "surf/config.hpp"
  6 | #include "surf/util.hpp"
  7 | #include "sdsl/int_vector_buffer.hpp"
  8 | 
  9 | 
 10 | int main( int argc, char** argv ) {
 11 |     if(argc != 3) {
 12 |         std::cout << "USAGE: " << argv[0] 
 13 |                   << " <string with # separated docs> <surf collection folder>" << std::endl;
 14 |         return EXIT_FAILURE;
 15 |     }
 16 |     std::string test_str = argv[1];
 17 |     std::string dir = argv[2];
 18 | 
 19 |     // setup collection directory
 20 |     if(surf::directory_exists(dir)) {
 21 |         std::cerr << "ERROR: collection directory already exists." << std::endl;
 22 |         return EXIT_FAILURE;
 23 |     }
 24 |     surf::create_directory(dir);
 25 | 
 26 |     if( test_str.back() != '#' ) {
 27 |         std::cerr << "ERROR: test string must end with doc seperator '#'" << std::endl;
 28 |         return EXIT_FAILURE;
 29 |     }
 30 |     
 31 |     // write collection string
 32 |     std::map<std::string::value_type,sdsl::int_vector<>::value_type> existing_syms;
 33 |     std::map<sdsl::int_vector<>::value_type,std::string::value_type> sym_mapping;
 34 |     sdsl::int_vector<> text_col(test_str.size()+1);
 35 |     size_t j=0;
 36 |     size_t num_docs = 0;
 37 |     for(const auto& sym : test_str) {
 38 |         if(sym == '#') {
 39 |             text_col[j++] = 1;
 40 |             num_docs++;
 41 |         } else {
 42 |             auto itr = existing_syms.find(sym);
 43 |             if(itr != existing_syms.end()) {
 44 |                 text_col[j++] = itr->second;
 45 |             } else {
 46 |                 sdsl::int_vector<>::value_type new_sym = existing_syms.size()+2;
 47 |                 existing_syms[sym] = new_sym;
 48 |                 sym_mapping[new_sym] = sym;
 49 |                 text_col[j++] = new_sym;
 50 |             }
 51 |         }
 52 |     }
 53 |     text_col[j] = 0;
 54 |     std::ofstream ofs(dir+"/"+surf::TEXT_FILENAME);
 55 |     if(ofs.is_open()) {
 56 |         text_col.serialize(ofs);
 57 |     } else {
 58 |         std::cerr << "ERROR: could not write collection file." << std::endl;
 59 |         return EXIT_FAILURE;
 60 |     }
 61 | 
 62 |     // write the dict
 63 |     std::ofstream dict_ofs(dir+"/"+surf::DICT_FILENAME);
 64 |     if(dict_ofs.is_open()) {
 65 |         for(const auto& mapping : sym_mapping) {
 66 |             dict_ofs << mapping.second << " " << mapping.first << std::endl;
 67 |         }
 68 |     } else {
 69 |         std::cerr << "ERROR: could not write dictionary file." << std::endl;
 70 |         return EXIT_FAILURE;
 71 |     }
 72 | 
 73 |     // write docnames file
 74 |     std::ofstream docnames_ofs(dir+"/"+surf::DOCNAMES_FILENAME);
 75 |     if(docnames_ofs.is_open()) {
 76 |         for(size_t i=1;i<=num_docs;i++) {
 77 |             docnames_ofs << "DOCUMENT " << i << std::endl;
 78 |         }
 79 |     } else {
 80 |         std::cerr << "ERROR: could not write docnames file." << std::endl;
 81 |         return EXIT_FAILURE;
 82 |     }
 83 | 
 84 |     std::cout << "Created surf collection for string '" << test_str << "'" << std::endl;
 85 |     std::cout << "Found " << num_docs << " documents." << std::endl;
 86 |     std::cout << "Document delimiter = " << 1 << std::endl;
 87 |     std::cout << surf::TEXT_FILENAME << ": ";
 88 |     for(const auto& sym : text_col) {
 89 |         std::cout << sym << " ";
 90 |     }
 91 |     std::cout << std::endl;
 92 |     std::cout << "Mapping: ";
 93 |     for(const auto& mapping : sym_mapping) {
 94 |         std::cout << mapping.second << " -> " << mapping.first << "; ";
 95 |     }
 96 |     std::cout << std::endl;
 97 |     std::cout << "Document Names: ";
 98 |     for(size_t i=1;i<=num_docs;i++) {
 99 |         std::cout << "'DOCUMENT " << i << "'; ";
100 |     }
101 |     std::cout << std::endl;
102 | }
103 | 
104 | 
105 | 


--------------------------------------------------------------------------------
/tools/extract_document.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #include <unistd.h>
  4 | #include <stdlib.h>
  5 | #include <algorithm>
  6 | #include <vector>
  7 | #include <iostream>
  8 | #include <sstream>
  9 | 
 10 | #include "surf/config.hpp"
 11 | #include "surf/util.hpp"
 12 | #include "sdsl/int_vector.hpp"
 13 | #include "sdsl/select_support_mcl.hpp"
 14 | 
 15 | typedef struct cmdargs {
 16 |     std::string collection_dir;
 17 |     uint64_t doc_id;
 18 | } cmdargs_t;
 19 | 
 20 | void
 21 | print_usage(char* program)
 22 | {
 23 |     fprintf(stdout,"%s -c <collection directory> -d <docid>",program);
 24 |     fprintf(stdout,"where\n");
 25 |     fprintf(stdout,"  -c <collection directory>  : the directory the collection is stored.\n");
 26 |     fprintf(stdout,"  -d <docid>  : the document to output\n");
 27 | };
 28 | 
 29 | cmdargs_t
 30 | parse_args(int argc,char* const argv[])
 31 | {
 32 |     cmdargs_t args;
 33 |     int op;
 34 |     args.collection_dir = "";
 35 |     int64_t doc_id = -1;
 36 |     while ((op=getopt(argc,argv,"c:d:")) != -1) {
 37 |         switch (op) {
 38 |             case 'c':
 39 |                 args.collection_dir = optarg;
 40 |                 break;
 41 |             case 'd':
 42 |                 doc_id = std::strtoll(optarg,NULL,10);
 43 |                 break;
 44 |             case '?':
 45 |             default:
 46 |                 print_usage(argv[0]);
 47 |         }
 48 |     }
 49 |     if (args.collection_dir==""||doc_id<0) {
 50 |         std::cerr << "Missing command line parameters.\n";
 51 |         print_usage(argv[0]);
 52 |         exit(EXIT_FAILURE);
 53 |     }
 54 | 
 55 |     args.doc_id = (uint64_t) doc_id;
 56 | 
 57 |     return args;
 58 | }
 59 | 
 60 | int main( int argc, char** argv ) {
 61 | 
 62 |     /* parse command line */
 63 |     cmdargs_t args = parse_args(argc,argv);
 64 | 
 65 |     /* parse repo */
 66 |     auto cc = surf::parse_collection(args.collection_dir);
 67 | 
 68 |     /* load doc border bv and build select structure */
 69 |     sdsl::bit_vector doc_border;
 70 |     sdsl::load_from_cache(doc_border, surf::KEY_DOCBORDER, cc);
 71 |     sdsl::bit_vector::select_1_type doc_border_select(&doc_border);
 72 | 
 73 |     /* load dictionary and create mapping */
 74 |     std::unordered_map<uint64_t,std::string> id_mapping;
 75 |     {
 76 |         auto dict_file = args.collection_dir + "/" + surf::DICT_FILENAME;
 77 |         std::ifstream dfs(dict_file);
 78 |         if(!dfs.is_open()) {
 79 |             std::cerr << "cannot load dictionary file.";
 80 |             exit(EXIT_FAILURE);
 81 |         }
 82 |         std::string term_mapping;
 83 |         while( std::getline(dfs,term_mapping) ) {
 84 |             auto sep_pos = term_mapping.find(' ');
 85 |             auto term = term_mapping.substr(0,sep_pos);
 86 |             auto idstr = term_mapping.substr(sep_pos+1);
 87 |             uint64_t id = std::stoull(idstr);
 88 |             id_mapping[id] = term;
 89 |         }
 90 |     }
 91 | 
 92 |     auto text_file = args.collection_dir + "/" + surf::TEXT_FILENAME;
 93 |     sdsl::int_vector_buffer<> T(text_file);
 94 |     uint64_t doc_id = args.doc_id;
 95 |     size_t doc_start = 0;
 96 |     if(doc_id != 0) {
 97 |       	doc_start = doc_border_select(doc_id) + 1;
 98 |     }
 99 |     auto doc_stop = doc_border_select(doc_id+1) - 1;
100 | 
101 |     std::cout << "document length = " << doc_stop - doc_start + 1 << std::endl;
102 |     std::cout << "document content  = '";
103 |     for(size_t i=doc_start;i<=doc_stop;i++) {
104 |       	std::cout << id_mapping[T[i]] << " ";
105 |     }
106 |     std::cout << "'" << std::endl;
107 | }
108 | 
109 | 
110 | 


--------------------------------------------------------------------------------
/tools/extract_documents.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #include <unistd.h>
  4 | #include <stdlib.h>
  5 | #include <algorithm>
  6 | #include <vector>
  7 | #include <iostream>
  8 | #include <sstream>
  9 | 
 10 | #include "surf/config.hpp"
 11 | #include "surf/util.hpp"
 12 | #include "sdsl/int_vector.hpp"
 13 | #include "sdsl/select_support_mcl.hpp"
 14 | 
 15 | typedef struct cmdargs {
 16 |     std::string collection_dir;
 17 |     std::string surf_file;
 18 | } cmdargs_t;
 19 | 
 20 | void
 21 | print_usage(char* program)
 22 | {
 23 |     fprintf(stdout,"%s -c <collection directory> -q <query file> -r <surf_res.csv>",program);
 24 |     fprintf(stdout,"where\n");
 25 |     fprintf(stdout,"  -c <collection directory>  : the directory the collection is stored.\n");
 26 |     fprintf(stdout,"  -r <surf results csv>  : the results file produced by surf.\n");
 27 | };
 28 | 
 29 | /*
 30 |  * Collects the options -c and -r from the command line.
 31 |  *
 32 |  * For every other value handed back by getopt() the usage text is printed
 33 |  * and parsing simply continues. If one of the two options is still unset
 34 |  * once all arguments were seen, the program ends with EXIT_FAILURE.
 35 |  *
 36 |  * returns the filled in cmdargs_t.
 37 |  */
 38 | cmdargs_t
 39 | parse_args(int argc,char* const argv[])
 40 | {
 41 |     cmdargs_t parsed;
 42 |     parsed.collection_dir = "";
 43 |     parsed.surf_file = "";
 44 | 
 45 |     for (int opt = getopt(argc,argv,"c:r:"); opt != -1; opt = getopt(argc,argv,"c:r:")) {
 46 |         if (opt == 'c') {
 47 |             parsed.collection_dir = optarg;
 48 |         } else if (opt == 'r') {
 49 |             parsed.surf_file = optarg;
 50 |         } else {
 51 |             // '?' or anything else getopt() hands back
 52 |             print_usage(argv[0]);
 53 |         }
 54 |     }
 55 | 
 56 |     const bool have_all = !(parsed.collection_dir=="") && !(parsed.surf_file=="");
 57 |     if (!have_all) {
 58 |         std::cerr << "Missing command line parameters.\n";
 59 |         print_usage(argv[0]);
 60 |         exit(EXIT_FAILURE);
 61 |     }
 62 |     return parsed;
 63 | }
 56 | 
 57 | /*
 58 |  * Splits `line` at every ';'.
 59 |  *
 60 |  * The text after the last ';' (or the complete line if it contains none) is
 61 |  * always appended as final field, so the result is never empty and empty
 62 |  * fields are kept.
 63 |  */
 64 | std::vector<std::string>
 65 | tokenize(std::string line) {
 66 |     std::vector<std::string> fields;
 67 |     std::string::size_type from = 0;
 68 |     std::string::size_type sep = line.find(";", from);
 69 |     while (sep != std::string::npos) {
 70 |         fields.push_back(std::string(line, from, sep - from));
 71 |         from = sep + 1;
 72 |         sep = line.find(";", from);
 73 |     }
 74 |     // remainder behind the last separator
 75 |     fields.push_back(std::string(line, from));
 76 |     return fields;
 77 | }
 70 | 
 71 | /*
 72 |  * Prints the content of every document listed in a surf results file.
 73 |  *
 74 |  * The first line of the results file is skipped as a header. Each remaining
 75 |  * line is split at ';' into query id, rank, document id and score (in that
 76 |  * order); lines with fewer than four fields are reported and skipped. The
 77 |  * span of a document inside the collection text is derived from the document
 78 |  * border bitvector and every term id in that span is mapped back to its
 79 |  * string with the dictionary file.
 80 |  */
 81 | int main( int argc, char** argv ) {
 82 | 
 83 |     /* parse command line */
 84 |     cmdargs_t args = parse_args(argc,argv);
 85 | 
 86 |     /* parse repo */
 87 |     auto cc = surf::parse_collection(args.collection_dir);
 88 | 
 89 |     /* load doc border bv and build select structure */
 90 |     sdsl::bit_vector doc_border;
 91 |     sdsl::load_from_cache(doc_border, surf::KEY_DOCBORDER, cc);
 92 |     sdsl::bit_vector::select_1_type doc_border_select(&doc_border);
 93 | 
 94 |     /* load dictionary and create mapping; every line reads "<term> <id>" */
 95 |     std::unordered_map<uint64_t,std::string> id_mapping;
 96 |     {
 97 |         auto dict_file = args.collection_dir + "/" + surf::DICT_FILENAME;
 98 |         std::ifstream dfs(dict_file);
 99 |         if(!dfs.is_open()) {
100 |             std::cerr << "cannot load dictionary file." << std::endl;
101 |             exit(EXIT_FAILURE);
102 |         }
103 |         std::string term_mapping;
104 |         while( std::getline(dfs,term_mapping) ) {
105 |             auto sep_pos = term_mapping.find(' ');
106 |             if(sep_pos == std::string::npos) {
107 |                 // blank or malformed line: there is no id std::stoull could convert
108 |                 continue;
109 |             }
110 |             auto term = term_mapping.substr(0,sep_pos);
111 |             auto idstr = term_mapping.substr(sep_pos+1);
112 |             uint64_t id = std::stoull(idstr);
113 |             id_mapping[id] = term;
114 |         }
115 |     }
116 | 
117 |     auto text_file = args.collection_dir + "/" + surf::TEXT_FILENAME;
118 |     sdsl::int_vector_buffer<> T(text_file);
119 | 
120 |     /* walk over the rows of the results file */
121 |     std::ifstream surfres_fs(args.surf_file);
122 |     if(!surfres_fs.is_open()) {
123 |         std::cerr << "cannot open surf results file '" << args.surf_file << "'." << std::endl;
124 |         exit(EXIT_FAILURE);
125 |     }
126 |     bool first = true;
127 |     for(std::string line; std::getline(surfres_fs,line);) {
128 |         if(first) {
129 |             // header row
130 |             first = false;
131 |             continue;
132 |         }
133 |         auto tokens = tokenize(line);
134 |         if(tokens.size() < 4) {
135 |             // tokens[0..3] are read below; a shorter row must not be indexed
136 |             std::cerr << "skipping malformed result line '" << line << "'" << std::endl;
137 |             continue;
138 |         }
139 |         auto qry_id = std::strtoul(tokens[0].c_str(),NULL,10);
140 |         auto rank = std::strtoul(tokens[1].c_str(),NULL,10);
141 |         auto doc_id = std::strtoul(tokens[2].c_str(),NULL,10);
142 |         auto doc_score = std::strtod(tokens[3].c_str(),NULL);
143 | 
144 |         std::cout << "=====================================================================================\n";
145 |         std::cout << "[Q]=" << qry_id << " rank=" << rank << " docid="
146 |                   << doc_id << " score=" << doc_score <<  std::endl;
147 | 
148 |         // document 0 starts at the beginning of the text, every other document
149 |         // right behind the doc_id-th set bit; it ends just before the next set bit
150 |         size_t doc_start = 0;
151 |         if(doc_id != 0) {
152 |             doc_start = doc_border_select(doc_id) + 1;
153 |         }
154 |         auto doc_stop = doc_border_select(doc_id+1) - 1;
155 | 
156 |         std::cout << "document length = " << doc_stop - doc_start + 1 << std::endl;
157 |         std::cout << "document content  = '";
158 |         for(size_t i=doc_start;i<=doc_stop;i++) {
159 |             std::cout << id_mapping[T[i]] << " ";
160 |         }
161 |         std::cout << "'" << std::endl;
162 |     }
163 |     return EXIT_SUCCESS;
164 | }
136 | 
137 | 
138 | 


--------------------------------------------------------------------------------
/tools/indri_stem_krovetz.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <algorithm>
 3 | #include <vector>
 4 | #include <iostream>
 5 | #include <sstream>
 6 | 
 7 | #include "indri/KrovetzStemmer.hpp"
 8 | 
 9 | /*
10 |  * Stems queries with the Krovetz stemmer of indri.
11 |  *
12 |  * Reads lines of the form "<query id>;<term> <term> ..." from stdin and
13 |  * writes them in the same format to stdout. Before stemming, '-' is replaced
14 |  * by a blank, the characters ' and . are removed and the term is lowercased.
15 |  * A term is replaced by the stemmer output if kstem_stem_tobuffer() returns
16 |  * a value > 0, otherwise the normalised term itself is emitted. Lines without
17 |  * a ';' are reported on stderr and skipped.
18 |  */
19 | int main( int argc, char** argv ) {
20 |     if(argc != 1) {
21 |         std::cout << "USAGE: " << argv[0] << " < <input> > <output> " << std::endl;
22 |         return EXIT_FAILURE;
23 |     }
24 | 
25 |     using stemmer_t = indri::parse::KrovetzStemmer;
26 |     stemmer_t ks;
27 |     for(std::string line; std::getline(std::cin,line);) {
28 |         auto id_sep_pos = line.find(';');
29 |         if(id_sep_pos == std::string::npos) {
30 |             // blank line or line without query id: nothing sensible to parse
31 |             std::cerr << "skipping malformed query line '" << line << "'" << std::endl;
32 |             continue;
33 |         }
34 |         auto qryid_str = line.substr(0,id_sep_pos);
35 |         auto qry_id = std::stoull(qryid_str);
36 |         std::istringstream qry_content_stream(line.substr(id_sep_pos+1));
37 |         std::vector<std::string> stemmed_qry;
38 |         for(std::string qry_token; std::getline(qry_content_stream,qry_token,' ');) {
39 |             std::replace(qry_token.begin(),qry_token.end(),'-',' ');
40 |             qry_token.erase(std::remove(qry_token.begin(),qry_token.end(),'\''),qry_token.end());
41 |             qry_token.erase(std::remove(qry_token.begin(),qry_token.end(),'.'),qry_token.end());
42 |             // tolower() is only defined for values representable as unsigned char
43 |             std::transform(qry_token.begin(), qry_token.end(), qry_token.begin(),
44 |                            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
45 | 
46 |             if(qry_token.size() > static_cast<size_t>(stemmer_t::MAX_WORD_LENGTH)) {
47 |                 // would not fit into the fixed size buffers below: keep it unstemmed
48 |                 stemmed_qry.push_back(qry_token);
49 |                 continue;
50 |             }
51 | 
52 |             // zero initialised, so the copied term is always terminated
53 |             char stem_buf[stemmer_t::MAX_WORD_LENGTH+1] = {0};
54 |             char original_word[stemmer_t::MAX_WORD_LENGTH+1] = {0};
55 |             std::copy(qry_token.begin(),qry_token.end(),std::begin(original_word));
56 |             auto ret = ks.kstem_stem_tobuffer(original_word,stem_buf);
57 |             if (ret > 0) {
58 |                 std::string tmp(stem_buf);
59 |                 stemmed_qry.push_back(tmp);
60 |             } else {
61 |                 stemmed_qry.push_back(qry_token);
62 |             }
63 |         }
64 |         std::cout << qry_id << ";";
65 |         // separator before every term but the first; also fine for an empty query
66 |         for(size_t i=0;i<stemmed_qry.size();i++) {
67 |             if(i != 0) {
68 |                 std::cout << " ";
69 |             }
70 |             std::cout << stemmed_qry[i];
71 |         }
72 |         std::cout << std::endl;
73 |     }
74 |     return EXIT_SUCCESS;
75 | }
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/tools/indri_to_surf.cpp:
--------------------------------------------------------------------------------
  1 | // extracts from an indri index a monotone sequence of integers in sdsl format
  2 | // which represents the parsed text collection.
  3 | #include <iostream>
  4 | 
  5 | #include "indri/Repository.hpp"
  6 | #include "indri/CompressedCollection.hpp"
  7 | #include "sdsl/int_vector_buffer.hpp"
  8 | 
  9 | /* Tells whether `dir` can be stat'ed and is a directory. */
 10 | bool
 11 | directory_exists(std::string dir)
 12 | {
 13 |     struct stat info;
 14 |     if (stat(dir.c_str(), &info) != 0) {
 15 |         // path does not exist or cannot be inspected
 16 |         return false;
 17 |     }
 18 |     return S_ISDIR(info.st_mode) ? true : false;
 19 | }
 19 | 
 20 | /*
 21 |  * Creates the directory `dir` with mode 0777 unless it exists already.
 22 |  * If mkdir() fails, the reason is printed and the program ends with
 23 |  * EXIT_FAILURE.
 24 |  */
 25 | void
 26 | create_directory(std::string dir)
 27 | {
 28 |     if (directory_exists(dir)) {
 29 |         return;
 30 |     }
 31 |     const int rc = mkdir(dir.c_str(),0777);
 32 |     if (rc == -1) {
 33 |         perror("could not create directory");
 34 |         exit(EXIT_FAILURE);
 35 |     }
 36 | }
 30 | 
 31 | 
 32 | /*
 33 |  * Converts an indri repository into the files of a surf collection.
 34 |  *
 35 |  * usage: <program> <indri repository> <surf collection folder>
 36 |  *
 37 |  * Three files are written into the collection folder:
 38 |  *   text_int_SURF.sdsl : the term ids of all documents as sdsl integer vector;
 39 |  *                        every document is followed by a 1 and the whole
 40 |  *                        sequence is terminated by a 0.
 41 |  *   doc_names.txt      : the "docno" metadatum of every document, one per line.
 42 |  *   dict.txt           : one "<term> <id>" line per term.
 43 |  *
 44 |  * Indri term ids are increased by one before they are written, so that the
 45 |  * values 0 and 1 stay free for the two separators. Term id 0 is dropped.
 46 |  * Only the first index of the repository is read.
 47 |  */
 48 | int main( int argc, char** argv ) {
 49 |     if(argc != 3) {
 50 |         std::cout << "USAGE: " << argv[0] << " <indri repository> <surf collection folder>" << std::endl;
 51 |         return EXIT_FAILURE;
 52 |     }
 53 | 
 54 |     // parse cmd line
 55 |     std::string repository_name = argv[1];
 56 |     std::string surf_collection_folder = argv[2];
 57 |     create_directory(surf_collection_folder);
 58 |     std::string dict_file = surf_collection_folder + "/dict.txt";
 59 |     std::string doc_names_file = surf_collection_folder + "/doc_names.txt";
 60 |     std::string text_int_file = surf_collection_folder + "/text_int_SURF.sdsl";
 61 | 
 62 |     // load stuff
 63 |     indri::collection::Repository repo;
 64 |     repo.openRead( repository_name );
 65 | 
 66 |     // extract
 67 |     std::cout << "extracting sdsl integer file from indri index into file " << text_int_file << std::endl;
 68 |     std::vector<std::string> document_names;
 69 |     indri::collection::Repository::index_state state = repo.indexes();
 70 |     const auto& index = (*state)[0];
 71 |     uint64_t uniq_terms = index->uniqueTermCount();
 72 |     // ids are shifted by 1, so the values 0 .. uniqueTermCount()+1 can occur
 73 |     uniq_terms += 2;
 74 |     uint8_t out_int_width = sdsl::bits::hi(uniq_terms)+1;
 75 |     sdsl::int_vector_buffer<> sdsl_col_file(text_int_file,std::ios::out,1024*1024,out_int_width,false);
 76 |     size_t written_term_ids = 0;
 77 |     indri::collection::CompressedCollection* collection = repo.collection();
 78 |     int64_t document_id = index->documentBase();
 79 |     indri::index::TermListFileIterator* iter = index->termListFileIterator();
 80 |     iter->startIteration();
 81 |     while( !iter->finished() ) {
 82 |         indri::index::TermList* list = iter->currentEntry();
 83 | 
 84 |         // find document name
 85 |         std::string doc_name = collection->retrieveMetadatum( document_id , "docno" );
 86 |         document_names.push_back(doc_name);
 87 | 
 88 |         // progress indicator
 89 |         if(document_id % 10000 == 0) {
 90 |             std::cout << ".";
 91 |             std::cout.flush();
 92 |         }
 93 | 
 94 |         // iterate over termlist
 95 |         for(const auto& term_id : list->terms()) {
 96 |             // shift all ids from indri by 1 so 0 and 1 are free; id 0 is skipped
 97 |             if(term_id != 0) {
 98 |                 sdsl_col_file[written_term_ids++] = term_id+1;
 99 |             }
100 |         }
101 |         sdsl_col_file[written_term_ids++] = 1; // end of doc sep
102 | 
103 |         document_id++;
104 |         iter->nextEntry();
105 |     }
106 |     // the iterator is handed out as raw pointer and is released by the caller
107 |     delete iter;
108 |     std::cout << std::endl;
109 |     sdsl_col_file[written_term_ids++] = 0; // end of collection sep
110 | 
111 |     // write document names
112 |     {
113 |         std::cout << "writing document names to " << doc_names_file << std::endl;
114 |         std::ofstream of_doc_names(doc_names_file);
115 |         for(const auto& doc_name : document_names) {
116 |             of_doc_names << doc_name << std::endl;
117 |         }
118 |     }
119 |     // write dictionary
120 |     {
121 |         std::cout << "writing dictionary to " << dict_file << std::endl;
122 |         std::ofstream of_dict(dict_file);
123 |         // indri term ids run from 1 up to and including uniqueTermCount();
124 |         // the written id carries the same +1 shift as the text above
125 |         const size_t last_term_id = index->uniqueTermCount();
126 |         for(size_t i=1;i<=last_term_id;i++) {
127 |             auto term_str = index->term(i);
128 |             of_dict << term_str << " " << i+1 << std::endl;
129 |         }
130 |     }
131 |     return EXIT_SUCCESS;
132 | }
110 | 
111 | 
112 | 


--------------------------------------------------------------------------------
/tools/select_random_queries.cpp:
--------------------------------------------------------------------------------
  1 | // selects a pseudo random subset of the queries of a query file and writes
  2 | // the selected queries, sorted by query id, to an output file.
  3 | #include <iostream>
  4 | #include <unistd.h>
  5 | #include <stdlib.h>
  6 | 
  7 | #include "surf/query.hpp"
  8 | #include "sdsl/config.hpp"
  9 | #include "surf/query_parser.hpp"
 10 | #include "surf/util.hpp"
 11 | 
 12 | /* options of this tool as read from the command line */
 13 | struct cmdargs {
 14 |     std::string collection_dir; // argument of -c
 15 |     std::string query_file;     // argument of -q
 16 |     std::string output_file;    // argument of -o
 17 |     uint64_t num_qrys;          // argument of -n
 18 | };
 19 | typedef struct cmdargs cmdargs_t;
 18 | 
 19 | /*
 20 |  * Prints the command line synopsis of this tool to stdout.
 21 |  *
 22 |  * program : name the binary was started with (argv[0]).
 23 |  */
 24 | void
 25 | print_usage(char* program)
 26 | {
 27 |     // -n takes the number of queries to select, named <num-qrys> like in the option list
 28 |     fprintf(stdout,"%s -c <collection directory> -q <query file> -n <num-qrys> -o <output.qry>\n",program);
 29 |     fprintf(stdout,"where\n");
 30 |     fprintf(stdout,"  -c <collection directory>  : the directory the collection is stored.\n");
 31 |     fprintf(stdout,"  -q <query file>  : the queries to be processed.\n");
 32 |     fprintf(stdout,"  -n <num-qrys>  : the number of queries to be selected.\n");
 33 |     fprintf(stdout,"  -o <output.qry>  : selected queries.\n");
 34 | }
 29 | 
 30 | /*
 31 |  * Collects the options -c, -q, -o and -n from the command line.
 32 |  *
 33 |  * -n defaults to 1000 and is converted with strtoul (base 10). For every
 34 |  * other value handed back by getopt() the usage text is printed and parsing
 35 |  * continues. -c, -q and -o are mandatory: if one of them is unset after all
 36 |  * arguments were seen, the program ends with EXIT_FAILURE. -o is checked
 37 |  * here as well because main() writes its only result to that file.
 38 |  *
 39 |  * returns the filled in cmdargs_t.
 40 |  */
 41 | cmdargs_t
 42 | parse_args(int argc,char* const argv[])
 43 | {
 44 |     cmdargs_t args;
 45 |     int op;
 46 |     args.collection_dir = "";
 47 |     args.query_file = "";
 48 |     args.output_file = "";
 49 |     args.num_qrys = 1000;
 50 |     while ((op=getopt(argc,argv,"c:q:n:o:")) != -1) {
 51 |         switch (op) {
 52 |             case 'c':
 53 |                 args.collection_dir = optarg;
 54 |                 break;
 55 |             case 'q':
 56 |                 args.query_file = optarg;
 57 |                 break;
 58 |             case 'o':
 59 |                 args.output_file = optarg;
 60 |                 break;
 61 |             case 'n':
 62 |                 args.num_qrys = std::strtoul(optarg,NULL,10);
 63 |                 break;
 64 |             case '?':
 65 |             default:
 66 |                 print_usage(argv[0]);
 67 |         }
 68 |     }
 69 |     if (args.collection_dir.empty() || args.query_file.empty() || args.output_file.empty()) {
 70 |         std::cerr << "Missing command line parameters.\n";
 71 |         print_usage(argv[0]);
 72 |         exit(EXIT_FAILURE);
 73 |     }
 74 |     return args;
 75 | }
 65 | /*
 66 |  * Selects a pseudo random subset of the queries of a query file.
 67 |  *
 68 |  * The parsed queries are shuffled with a generator seeded with a constant,
 69 |  * the first num_qrys of them are sorted by query id and written to the
 70 |  * output file as "<id>;<token> <token> ...". If fewer queries were parsed
 71 |  * than requested, all of them are written.
 72 |  */
 73 | int main( int argc, char** argv ) {
 74 |     if(argc < 3) {
 75 |         print_usage(argv[0]);
 76 |         return EXIT_FAILURE;
 77 |     }
 78 | 
 79 |     /* parse command line */
 80 |     cmdargs_t args = parse_args(argc,argv);
 81 | 
 82 |     /* parse repo */
 83 |     auto cc = surf::parse_collection(args.collection_dir);
 84 | 
 85 |     /* parse queries */
 86 |     std::cout << "Parsing query file '" << args.query_file << "'" << std::endl;
 87 |     auto queries = surf::query_parser::parse_queries(args.collection_dir,args.query_file,true);
 88 |     std::cout << "Found " << queries.size() << " queries." << std::endl;
 89 | 
 90 |     /* never select more queries than were parsed */
 91 |     size_t num_selected = args.num_qrys;
 92 |     if(num_selected > queries.size()) {
 93 |         std::cerr << "only " << queries.size() << " queries available, selecting all of them." << std::endl;
 94 |         num_selected = queries.size();
 95 |     }
 96 | 
 97 |     /* select num_selected random ones; the seed is a constant */
 98 |     std::mt19937 gen(4711);
 99 |     std::shuffle(queries.begin(), queries.end(), gen);
100 |     auto id_sort = [](const surf::query_t& a,const surf::query_t& b) {
101 |         return std::get<0>(a) < std::get<0>(b);
102 |     };
103 |     std::sort(queries.begin(),queries.begin()+num_selected,id_sort);
104 | 
105 |     /* output */
106 |     std::ofstream selected_fs(args.output_file);
107 |     if(!selected_fs.is_open()) {
108 |         perror("could not open output file.");
109 |         return EXIT_FAILURE;
110 |     }
111 |     for(size_t i=0;i<num_selected;i++) {
112 |         selected_fs << std::get<0>(queries[i]) << ";";
113 |         const auto& tokens = std::get<1>(queries[i]);
114 |         // separator before every token but the first; also fine without tokens
115 |         for(size_t j=0;j<tokens.size();j++) {
116 |             if(j != 0) {
117 |                 selected_fs << " ";
118 |             }
119 |             selected_fs << tokens[j].token_strs[0];
120 |         }
121 |         selected_fs << std::endl;
122 |     }
123 |     return EXIT_SUCCESS;
124 | }
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/tools/surf_collection_info.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <unistd.h>
 3 | #include <stdlib.h>
 4 | #include <algorithm>
 5 | #include <vector>
 6 | #include <iostream>
 7 | #include <sstream>
 8 | 
 9 | #include "surf/config.hpp"
10 | #include "surf/util.hpp"
11 | #include "surf/construct_doc_cnt.hpp"
12 | 
13 | /* options of this tool; parse_args() only ever fills collection_dir */
14 | struct cmdargs {
15 |     std::string collection_dir; // argument of -c
16 |     std::string surf_file;
17 |     std::string trec_file;
18 | };
19 | typedef struct cmdargs cmdargs_t;
18 | 
19 | /*
20 |  * Prints the command line synopsis of this tool to stdout.
21 |  *
22 |  * program : name the binary was started with (argv[0]).
23 |  */
24 | void
25 | print_usage(char* program)
26 | {
27 |     fprintf(stdout,"%s -c <collection directory>\n",program);
28 |     // the remaining lines contain no placeholders
29 |     fputs("where\n",stdout);
30 |     fputs("  -c <collection directory>  : the directory the collection is stored.\n",stdout);
31 | }
26 | 
27 | /*
28 |  * Reads the option -c from the command line.
29 |  *
30 |  * For every other value handed back by getopt() the usage text is printed
31 |  * and parsing continues. Without -c the program ends with EXIT_FAILURE.
32 |  *
33 |  * returns the filled in cmdargs_t.
34 |  */
35 | cmdargs_t
36 | parse_args(int argc,char* const argv[])
37 | {
38 |     cmdargs_t parsed;
39 |     parsed.collection_dir = "";
40 | 
41 |     int opt = getopt(argc,argv,"c:");
42 |     while (opt != -1) {
43 |         if (opt == 'c') {
44 |             parsed.collection_dir = optarg;
45 |         } else {
46 |             print_usage(argv[0]);
47 |         }
48 |         opt = getopt(argc,argv,"c:");
49 |     }
50 | 
51 |     if (!(parsed.collection_dir=="")) {
52 |         return parsed;
53 |     }
54 |     std::cerr << "Missing command line parameters.\n";
55 |     print_usage(argv[0]);
56 |     exit(EXIT_FAILURE);
57 | }
50 | 
51 | /*
52 |  * Prints basic statistics of a surf collection to stdout: the length of the
53 |  * collection text, the number of documents, the number of lines of the
54 |  * dictionary file and the (integer) quotient of text length and number of
55 |  * documents.
56 |  */
57 | int main( int argc, char** argv ) {
58 |     /* parse command line */
59 |     cmdargs_t args = parse_args(argc,argv);
60 | 
61 |     /* parse repo */
62 |     auto cc = surf::parse_collection(args.collection_dir);
63 |     sdsl::int_vector_buffer<> T(args.collection_dir+"/"+surf::TEXT_FILENAME);
64 |     std::cout << "n = |T|= " << T.size() << std::endl;
65 | 
66 |     /* number of documents */
67 |     surf::construct_doc_cnt<sdsl::int_alphabet_tag::WIDTH>(cc);
68 |     uint64_t doc_cnt = 0;
69 |     load_from_cache(doc_cnt, surf::KEY_DOCCNT, cc);
70 |     std::cout << "number of documents = N = " << doc_cnt << std::endl;
71 | 
72 |     /* number of terms = number of lines in the dictionary */
73 |     std::ifstream dic_fs(args.collection_dir+"/"+surf::DICT_FILENAME);
74 |     if(!dic_fs.is_open()) {
75 |         std::cerr << "cannot load dictionary file." << std::endl;
76 |         return EXIT_FAILURE;
77 |     }
78 |     std::string line;
79 |     size_t num_terms = 0;
80 |     while( std::getline(dic_fs,line) ) {
81 |         num_terms++;
82 |     }
83 |     std::cout << "number of terms = sigma = " << num_terms << std::endl;
84 | 
85 |     /* an empty collection must not divide by zero */
86 |     uint64_t avg_doc_len = 0;
87 |     if(doc_cnt != 0) {
88 |         avg_doc_len = T.size() / doc_cnt;
89 |     }
90 |     std::cout << "avg document length = " << avg_doc_len << std::endl;
91 |     return EXIT_SUCCESS;
92 | }
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/update-sdsl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Forwards the checkout in external/sdsl-lite to its master branch and commits
3 | # the result in this repository. Paths are relative to the current directory.
4 | set -e  # stop at the first failing step
5 | cd external/sdsl-lite
6 | git checkout master
7 | git pull
8 | cd ../..
9 | git add external/sdsl-lite && git commit -m "forwarded sdsl-lite to current master"
8 | 


--------------------------------------------------------------------------------